1 /*
2 Copyright (c) 2012-2016 Genome Research Ltd.
3 Author: James Bonfield <jkb@sanger.ac.uk>
4
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
7
8 1. Redistributions of source code must retain the above copyright notice,
9 this list of conditions and the following disclaimer.
10
11 2. Redistributions in binary form must reproduce the above copyright notice,
12 this list of conditions and the following disclaimer in the documentation
13 and/or other materials provided with the distribution.
14
15 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16 Institute nor the names of its contributors may be used to endorse or promote
17 products derived from this software without specific prior written permission.
18
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31 /*
32 * CRAM I/O primitives.
33 *
34 * - ITF8 encoding and decoding.
35 * - Block based I/O
36 * - Zlib inflating and deflating (memory)
37 * - CRAM basic data structure reading and writing
38 * - File opening / closing
39 * - Reference sequence handling
40 */
41
42 /*
43 * TODO: BLOCK_GROW, BLOCK_RESIZE, BLOCK_APPEND and itf8_put_blk all need
44 * a way to return errors for when malloc fails.
45 */
46
47 #include <config.h>
48
49 #include <stdio.h>
50 #include <errno.h>
51 #include <assert.h>
52 #include <stdlib.h>
53 #include <string.h>
54 #include <unistd.h>
55 #include <zlib.h>
56 #ifdef HAVE_LIBBZ2
57 #include <bzlib.h>
58 #endif
59 #ifdef HAVE_LIBLZMA
60 #ifdef HAVE_LZMA_H
61 #include <lzma.h>
62 #else
63 #include "os/lzma_stub.h"
64 #endif
65 #endif
66 #include <sys/types.h>
67 #include <sys/stat.h>
68 #include <math.h>
69 #include <time.h>
70 #include <stdint.h>
71
72 #ifdef HAVE_LIBDEFLATE
73 #include <libdeflate.h>
74 #define crc32(a,b,c) libdeflate_crc32((a),(b),(c))
75 #endif
76
77 #include "cram/cram.h"
78 #include "cram/os.h"
79 #include "htslib/hts.h"
80 #include "cram/open_trace_file.h"
81 #include "cram/rANS_static.h"
82
83 //#define REF_DEBUG
84
85 #ifdef REF_DEBUG
86 #include <sys/syscall.h>
87 #define gettid() (int)syscall(SYS_gettid)
88
89 #define RP(...) fprintf (stderr, __VA_ARGS__)
90 #else
91 #define RP(...)
92 #endif
93
94 #include "htslib/hfile.h"
95 #include "htslib/bgzf.h"
96 #include "htslib/faidx.h"
97 #include "hts_internal.h"
98
99 #ifndef PATH_MAX
100 #define PATH_MAX FILENAME_MAX
101 #endif
102
103 #define TRIAL_SPAN 50
104 #define NTRIALS 3
105
106
107 /* ----------------------------------------------------------------------
108 * ITF8 encoding and decoding.
109 *
110 * Also see the itf8_get and itf8_put macros in cram_io.h
111 */
112
113 /*
114 * LEGACY: consider using itf8_decode_crc.
115 *
116 * Reads an integer in ITF-8 encoding from 'cp' and stores it in
117 * *val.
118 *
119 * Returns the number of bytes read on success
120 * -1 on failure
121 */
itf8_decode(cram_fd * fd,int32_t * val_p)122 int itf8_decode(cram_fd *fd, int32_t *val_p) {
123 static int nbytes[16] = {
124 0,0,0,0, 0,0,0,0, // 0000xxxx - 0111xxxx
125 1,1,1,1, // 1000xxxx - 1011xxxx
126 2,2, // 1100xxxx - 1101xxxx
127 3, // 1110xxxx
128 4, // 1111xxxx
129 };
130
131 static int nbits[16] = {
132 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, // 0000xxxx - 0111xxxx
133 0x3f, 0x3f, 0x3f, 0x3f, // 1000xxxx - 1011xxxx
134 0x1f, 0x1f, // 1100xxxx - 1101xxxx
135 0x0f, // 1110xxxx
136 0x0f, // 1111xxxx
137 };
138
139 int32_t val = hgetc(fd->fp);
140 if (val == -1)
141 return -1;
142
143 int i = nbytes[val>>4];
144 val &= nbits[val>>4];
145
146 switch(i) {
147 case 0:
148 *val_p = val;
149 return 1;
150
151 case 1:
152 val = (val<<8) | (unsigned char)hgetc(fd->fp);
153 *val_p = val;
154 return 2;
155
156 case 2:
157 val = (val<<8) | (unsigned char)hgetc(fd->fp);
158 val = (val<<8) | (unsigned char)hgetc(fd->fp);
159 *val_p = val;
160 return 3;
161
162 case 3:
163 val = (val<<8) | (unsigned char)hgetc(fd->fp);
164 val = (val<<8) | (unsigned char)hgetc(fd->fp);
165 val = (val<<8) | (unsigned char)hgetc(fd->fp);
166 *val_p = val;
167 return 4;
168
169 case 4: // really 3.5 more, why make it different?
170 val = (val<<8) | (unsigned char)hgetc(fd->fp);
171 val = (val<<8) | (unsigned char)hgetc(fd->fp);
172 val = (val<<8) | (unsigned char)hgetc(fd->fp);
173 val = (val<<4) | (((unsigned char)hgetc(fd->fp)) & 0x0f);
174 *val_p = val;
175 }
176
177 return 5;
178 }
179
itf8_decode_crc(cram_fd * fd,int32_t * val_p,uint32_t * crc)180 int itf8_decode_crc(cram_fd *fd, int32_t *val_p, uint32_t *crc) {
181 static int nbytes[16] = {
182 0,0,0,0, 0,0,0,0, // 0000xxxx - 0111xxxx
183 1,1,1,1, // 1000xxxx - 1011xxxx
184 2,2, // 1100xxxx - 1101xxxx
185 3, // 1110xxxx
186 4, // 1111xxxx
187 };
188
189 static int nbits[16] = {
190 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, // 0000xxxx - 0111xxxx
191 0x3f, 0x3f, 0x3f, 0x3f, // 1000xxxx - 1011xxxx
192 0x1f, 0x1f, // 1100xxxx - 1101xxxx
193 0x0f, // 1110xxxx
194 0x0f, // 1111xxxx
195 };
196 unsigned char c[5];
197
198 int32_t val = hgetc(fd->fp);
199 if (val == -1)
200 return -1;
201
202 c[0]=val;
203
204 int i = nbytes[val>>4];
205 val &= nbits[val>>4];
206
207 switch(i) {
208 case 0:
209 *val_p = val;
210 *crc = crc32(*crc, c, 1);
211 return 1;
212
213 case 1:
214 val = (val<<8) | (c[1]=hgetc(fd->fp));
215 *val_p = val;
216 *crc = crc32(*crc, c, 2);
217 return 2;
218
219 case 2:
220 val = (val<<8) | (c[1]=hgetc(fd->fp));
221 val = (val<<8) | (c[2]=hgetc(fd->fp));
222 *val_p = val;
223 *crc = crc32(*crc, c, 3);
224 return 3;
225
226 case 3:
227 val = (val<<8) | (c[1]=hgetc(fd->fp));
228 val = (val<<8) | (c[2]=hgetc(fd->fp));
229 val = (val<<8) | (c[3]=hgetc(fd->fp));
230 *val_p = val;
231 *crc = crc32(*crc, c, 4);
232 return 4;
233
234 case 4: // really 3.5 more, why make it different?
235 {
236 uint32_t uv = val;
237 uv = (uv<<8) | (c[1]=hgetc(fd->fp));
238 uv = (uv<<8) | (c[2]=hgetc(fd->fp));
239 uv = (uv<<8) | (c[3]=hgetc(fd->fp));
240 uv = (uv<<4) | (((c[4]=hgetc(fd->fp))) & 0x0f);
241 *val_p = uv < 0x80000000UL ? uv : -((int32_t) (0xffffffffUL - uv)) - 1;
242 *crc = crc32(*crc, c, 5);
243 }
244 }
245
246 return 5;
247 }
248
249 /*
250 * Encodes and writes a single integer in ITF-8 format.
251 * Returns 0 on success
252 * -1 on failure
253 */
itf8_encode(cram_fd * fd,int32_t val)254 int itf8_encode(cram_fd *fd, int32_t val) {
255 char buf[5];
256 int len = itf8_put(buf, val);
257 return hwrite(fd->fp, buf, len) == len ? 0 : -1;
258 }
259
260 const int itf8_bytes[16] = {
261 1, 1, 1, 1, 1, 1, 1, 1,
262 2, 2, 2, 2, 3, 3, 4, 5
263 };
264
265 const int ltf8_bytes[256] = {
266 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
267 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
268 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
269 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
270
271 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
272 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
273 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
274 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
275
276 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
277 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
278 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
279 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
280
281 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
282 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
283
284 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
285
286 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 8, 9
287 };
288
289 /*
290 * LEGACY: consider using ltf8_decode_crc.
291 */
ltf8_decode(cram_fd * fd,int64_t * val_p)292 int ltf8_decode(cram_fd *fd, int64_t *val_p) {
293 int c = hgetc(fd->fp);
294 int64_t val = (unsigned char)c;
295 if (c == -1)
296 return -1;
297
298 if (val < 0x80) {
299 *val_p = val;
300 return 1;
301
302 } else if (val < 0xc0) {
303 val = (val<<8) | (unsigned char)hgetc(fd->fp);
304 *val_p = val & (((1LL<<(6+8)))-1);
305 return 2;
306
307 } else if (val < 0xe0) {
308 val = (val<<8) | (unsigned char)hgetc(fd->fp);
309 val = (val<<8) | (unsigned char)hgetc(fd->fp);
310 *val_p = val & ((1LL<<(5+2*8))-1);
311 return 3;
312
313 } else if (val < 0xf0) {
314 val = (val<<8) | (unsigned char)hgetc(fd->fp);
315 val = (val<<8) | (unsigned char)hgetc(fd->fp);
316 val = (val<<8) | (unsigned char)hgetc(fd->fp);
317 *val_p = val & ((1LL<<(4+3*8))-1);
318 return 4;
319
320 } else if (val < 0xf8) {
321 val = (val<<8) | (unsigned char)hgetc(fd->fp);
322 val = (val<<8) | (unsigned char)hgetc(fd->fp);
323 val = (val<<8) | (unsigned char)hgetc(fd->fp);
324 val = (val<<8) | (unsigned char)hgetc(fd->fp);
325 *val_p = val & ((1LL<<(3+4*8))-1);
326 return 5;
327
328 } else if (val < 0xfc) {
329 val = (val<<8) | (unsigned char)hgetc(fd->fp);
330 val = (val<<8) | (unsigned char)hgetc(fd->fp);
331 val = (val<<8) | (unsigned char)hgetc(fd->fp);
332 val = (val<<8) | (unsigned char)hgetc(fd->fp);
333 val = (val<<8) | (unsigned char)hgetc(fd->fp);
334 *val_p = val & ((1LL<<(2+5*8))-1);
335 return 6;
336
337 } else if (val < 0xfe) {
338 val = (val<<8) | (unsigned char)hgetc(fd->fp);
339 val = (val<<8) | (unsigned char)hgetc(fd->fp);
340 val = (val<<8) | (unsigned char)hgetc(fd->fp);
341 val = (val<<8) | (unsigned char)hgetc(fd->fp);
342 val = (val<<8) | (unsigned char)hgetc(fd->fp);
343 val = (val<<8) | (unsigned char)hgetc(fd->fp);
344 *val_p = val & ((1LL<<(1+6*8))-1);
345 return 7;
346
347 } else if (val < 0xff) {
348 val = (val<<8) | (unsigned char)hgetc(fd->fp);
349 val = (val<<8) | (unsigned char)hgetc(fd->fp);
350 val = (val<<8) | (unsigned char)hgetc(fd->fp);
351 val = (val<<8) | (unsigned char)hgetc(fd->fp);
352 val = (val<<8) | (unsigned char)hgetc(fd->fp);
353 val = (val<<8) | (unsigned char)hgetc(fd->fp);
354 val = (val<<8) | (unsigned char)hgetc(fd->fp);
355 *val_p = val & ((1LL<<(7*8))-1);
356 return 8;
357
358 } else {
359 val = (val<<8) | (unsigned char)hgetc(fd->fp);
360 val = (val<<8) | (unsigned char)hgetc(fd->fp);
361 val = (val<<8) | (unsigned char)hgetc(fd->fp);
362 val = (val<<8) | (unsigned char)hgetc(fd->fp);
363 val = (val<<8) | (unsigned char)hgetc(fd->fp);
364 val = (val<<8) | (unsigned char)hgetc(fd->fp);
365 val = (val<<8) | (unsigned char)hgetc(fd->fp);
366 val = (val<<8) | (unsigned char)hgetc(fd->fp);
367 *val_p = val;
368 }
369
370 return 9;
371 }
372
ltf8_decode_crc(cram_fd * fd,int64_t * val_p,uint32_t * crc)373 int ltf8_decode_crc(cram_fd *fd, int64_t *val_p, uint32_t *crc) {
374 unsigned char c[9];
375 int64_t val = (unsigned char)hgetc(fd->fp);
376 if (val == -1)
377 return -1;
378
379 c[0] = val;
380
381 if (val < 0x80) {
382 *val_p = val;
383 *crc = crc32(*crc, c, 1);
384 return 1;
385
386 } else if (val < 0xc0) {
387 val = (val<<8) | (c[1]=hgetc(fd->fp));;
388 *val_p = val & (((1LL<<(6+8)))-1);
389 *crc = crc32(*crc, c, 2);
390 return 2;
391
392 } else if (val < 0xe0) {
393 val = (val<<8) | (c[1]=hgetc(fd->fp));;
394 val = (val<<8) | (c[2]=hgetc(fd->fp));;
395 *val_p = val & ((1LL<<(5+2*8))-1);
396 *crc = crc32(*crc, c, 3);
397 return 3;
398
399 } else if (val < 0xf0) {
400 val = (val<<8) | (c[1]=hgetc(fd->fp));;
401 val = (val<<8) | (c[2]=hgetc(fd->fp));;
402 val = (val<<8) | (c[3]=hgetc(fd->fp));;
403 *val_p = val & ((1LL<<(4+3*8))-1);
404 *crc = crc32(*crc, c, 4);
405 return 4;
406
407 } else if (val < 0xf8) {
408 val = (val<<8) | (c[1]=hgetc(fd->fp));;
409 val = (val<<8) | (c[2]=hgetc(fd->fp));;
410 val = (val<<8) | (c[3]=hgetc(fd->fp));;
411 val = (val<<8) | (c[4]=hgetc(fd->fp));;
412 *val_p = val & ((1LL<<(3+4*8))-1);
413 *crc = crc32(*crc, c, 5);
414 return 5;
415
416 } else if (val < 0xfc) {
417 val = (val<<8) | (c[1]=hgetc(fd->fp));;
418 val = (val<<8) | (c[2]=hgetc(fd->fp));;
419 val = (val<<8) | (c[3]=hgetc(fd->fp));;
420 val = (val<<8) | (c[4]=hgetc(fd->fp));;
421 val = (val<<8) | (c[5]=hgetc(fd->fp));;
422 *val_p = val & ((1LL<<(2+5*8))-1);
423 *crc = crc32(*crc, c, 6);
424 return 6;
425
426 } else if (val < 0xfe) {
427 val = (val<<8) | (c[1]=hgetc(fd->fp));;
428 val = (val<<8) | (c[2]=hgetc(fd->fp));;
429 val = (val<<8) | (c[3]=hgetc(fd->fp));;
430 val = (val<<8) | (c[4]=hgetc(fd->fp));;
431 val = (val<<8) | (c[5]=hgetc(fd->fp));;
432 val = (val<<8) | (c[6]=hgetc(fd->fp));;
433 *val_p = val & ((1LL<<(1+6*8))-1);
434 *crc = crc32(*crc, c, 7);
435 return 7;
436
437 } else if (val < 0xff) {
438 val = (val<<8) | (c[1]=hgetc(fd->fp));;
439 val = (val<<8) | (c[2]=hgetc(fd->fp));;
440 val = (val<<8) | (c[3]=hgetc(fd->fp));;
441 val = (val<<8) | (c[4]=hgetc(fd->fp));;
442 val = (val<<8) | (c[5]=hgetc(fd->fp));;
443 val = (val<<8) | (c[6]=hgetc(fd->fp));;
444 val = (val<<8) | (c[7]=hgetc(fd->fp));;
445 *val_p = val & ((1LL<<(7*8))-1);
446 *crc = crc32(*crc, c, 8);
447 return 8;
448
449 } else {
450 val = (val<<8) | (c[1]=hgetc(fd->fp));;
451 val = (val<<8) | (c[2]=hgetc(fd->fp));;
452 val = (val<<8) | (c[3]=hgetc(fd->fp));;
453 val = (val<<8) | (c[4]=hgetc(fd->fp));;
454 val = (val<<8) | (c[5]=hgetc(fd->fp));;
455 val = (val<<8) | (c[6]=hgetc(fd->fp));;
456 val = (val<<8) | (c[7]=hgetc(fd->fp));;
457 val = (val<<8) | (c[8]=hgetc(fd->fp));;
458 *crc = crc32(*crc, c, 9);
459 *val_p = val;
460 }
461
462 return 9;
463 }
464
465 /*
466 * Pushes a value in ITF8 format onto the end of a block.
467 * This shouldn't be used for high-volume data as it is not the fastest
468 * method.
469 *
470 * Returns the number of bytes written
471 */
itf8_put_blk(cram_block * blk,int val)472 int itf8_put_blk(cram_block *blk, int val) {
473 char buf[5];
474 int sz;
475
476 sz = itf8_put(buf, val);
477 BLOCK_APPEND(blk, buf, sz);
478 return sz;
479 }
480
481 /*
482 * Decodes a 32-bit little endian value from fd and stores in val.
483 *
484 * Returns the number of bytes read on success
485 * -1 on failure
486 */
int32_decode(cram_fd * fd,int32_t * val)487 int int32_decode(cram_fd *fd, int32_t *val) {
488 int32_t i;
489 if (4 != hread(fd->fp, &i, 4))
490 return -1;
491
492 *val = le_int4(i);
493 return 4;
494 }
495
496 /*
497 * Encodes a 32-bit little endian value 'val' and writes to fd.
498 *
499 * Returns the number of bytes written on success
500 * -1 on failure
501 */
int32_encode(cram_fd * fd,int32_t val)502 int int32_encode(cram_fd *fd, int32_t val) {
503 val = le_int4(val);
504 if (4 != hwrite(fd->fp, &val, 4))
505 return -1;
506
507 return 4;
508 }
509
510 /* As int32_decoded/encode, but from/to blocks instead of cram_fd */
int32_get_blk(cram_block * b,int32_t * val)511 int int32_get_blk(cram_block *b, int32_t *val) {
512 if (b->uncomp_size - BLOCK_SIZE(b) < 4)
513 return -1;
514
515 *val =
516 b->data[b->byte ] |
517 (b->data[b->byte+1] << 8) |
518 (b->data[b->byte+2] << 16) |
519 (b->data[b->byte+3] << 24);
520 BLOCK_SIZE(b) += 4;
521 return 4;
522 }
523
524 /* As int32_decoded/encode, but from/to blocks instead of cram_fd */
int32_put_blk(cram_block * b,int32_t val)525 int int32_put_blk(cram_block *b, int32_t val) {
526 unsigned char cp[4];
527 cp[0] = ( val & 0xff);
528 cp[1] = ((val>>8) & 0xff);
529 cp[2] = ((val>>16) & 0xff);
530 cp[3] = ((val>>24) & 0xff);
531
532 BLOCK_APPEND(b, cp, 4);
533 return b->data ? 0 : -1;
534 }
535
536 /* ----------------------------------------------------------------------
537 * zlib compression code - from Gap5's tg_iface_g.c
538 * They're static here as they're only used within the cram_compress_block
539 * and cram_uncompress_block functions, which are the external interface.
540 */
zlib_mem_inflate(char * cdata,size_t csize,size_t * size)541 char *zlib_mem_inflate(char *cdata, size_t csize, size_t *size) {
542 z_stream s;
543 unsigned char *data = NULL; /* Uncompressed output */
544 int data_alloc = 0;
545 int err;
546
547 /* Starting point at uncompressed size, and scale after that */
548 data = malloc(data_alloc = csize*1.2+100);
549 if (!data)
550 return NULL;
551
552 /* Initialise zlib stream */
553 s.zalloc = Z_NULL; /* use default allocation functions */
554 s.zfree = Z_NULL;
555 s.opaque = Z_NULL;
556 s.next_in = (unsigned char *)cdata;
557 s.avail_in = csize;
558 s.total_in = 0;
559 s.next_out = data;
560 s.avail_out = data_alloc;
561 s.total_out = 0;
562
563 //err = inflateInit(&s);
564 err = inflateInit2(&s, 15 + 32);
565 if (err != Z_OK) {
566 hts_log_error("Call to zlib inflateInit failed: %s", s.msg);
567 free(data);
568 return NULL;
569 }
570
571 /* Decode to 'data' array */
572 for (;s.avail_in;) {
573 unsigned char *data_tmp;
574 int alloc_inc;
575
576 s.next_out = &data[s.total_out];
577 err = inflate(&s, Z_NO_FLUSH);
578 if (err == Z_STREAM_END)
579 break;
580
581 if (err != Z_OK) {
582 hts_log_error("Call to zlib inflate failed: %s", s.msg);
583 if (data)
584 free(data);
585 return NULL;
586 }
587
588 /* More to come, so realloc based on growth so far */
589 alloc_inc = (double)s.avail_in/s.total_in * s.total_out + 100;
590 data = realloc((data_tmp = data), data_alloc += alloc_inc);
591 if (!data) {
592 free(data_tmp);
593 return NULL;
594 }
595 s.avail_out += alloc_inc;
596 }
597 inflateEnd(&s);
598
599 *size = s.total_out;
600 return (char *)data;
601 }
602
zlib_mem_deflate(char * data,size_t size,size_t * cdata_size,int level,int strat)603 static char *zlib_mem_deflate(char *data, size_t size, size_t *cdata_size,
604 int level, int strat) {
605 z_stream s;
606 unsigned char *cdata = NULL; /* Compressed output */
607 int cdata_alloc = 0;
608 int cdata_pos = 0;
609 int err;
610
611 cdata = malloc(cdata_alloc = size*1.05+100);
612 if (!cdata)
613 return NULL;
614 cdata_pos = 0;
615
616 /* Initialise zlib stream */
617 s.zalloc = Z_NULL; /* use default allocation functions */
618 s.zfree = Z_NULL;
619 s.opaque = Z_NULL;
620 s.next_in = (unsigned char *)data;
621 s.avail_in = size;
622 s.total_in = 0;
623 s.next_out = cdata;
624 s.avail_out = cdata_alloc;
625 s.total_out = 0;
626 s.data_type = Z_BINARY;
627
628 err = deflateInit2(&s, level, Z_DEFLATED, 15|16, 9, strat);
629 if (err != Z_OK) {
630 hts_log_error("Call to zlib deflateInit2 failed: %s", s.msg);
631 return NULL;
632 }
633
634 /* Encode to 'cdata' array */
635 for (;s.avail_in;) {
636 s.next_out = &cdata[cdata_pos];
637 s.avail_out = cdata_alloc - cdata_pos;
638 if (cdata_alloc - cdata_pos <= 0) {
639 hts_log_error("Deflate produced larger output than expected");
640 return NULL;
641 }
642 err = deflate(&s, Z_NO_FLUSH);
643 cdata_pos = cdata_alloc - s.avail_out;
644 if (err != Z_OK) {
645 hts_log_error("Call to zlib deflate failed: %s", s.msg);
646 break;
647 }
648 }
649 if (deflate(&s, Z_FINISH) != Z_STREAM_END) {
650 hts_log_error("Call to zlib deflate failed: %s", s.msg);
651 }
652 *cdata_size = s.total_out;
653
654 if (deflateEnd(&s) != Z_OK) {
655 hts_log_error("Call to zlib deflate failed: %s", s.msg);
656 }
657 return (char *)cdata;
658 }
659
660 #ifdef HAVE_LIBLZMA
661 /* ------------------------------------------------------------------------ */
662 /*
663 * Data compression routines using liblzma (xz)
664 *
665 * On a test set this shrunk the main db from 136157104 bytes to 114796168, but
666 * caused tg_index to grow from 2m43.707s to 15m3.961s. Exporting as bfastq
667 * went from 18.3s to 36.3s. So decompression suffers too, but not as bad
668 * as compression times.
669 *
670 * For now we disable this functionality. If it's to be reenabled make sure you
671 * improve the mem_inflate implementation as it's just a test hack at the
672 * moment.
673 */
674
lzma_mem_deflate(char * data,size_t size,size_t * cdata_size,int level)675 static char *lzma_mem_deflate(char *data, size_t size, size_t *cdata_size,
676 int level) {
677 char *out;
678 size_t out_size = lzma_stream_buffer_bound(size);
679 *cdata_size = 0;
680
681 out = malloc(out_size);
682
683 /* Single call compression */
684 if (LZMA_OK != lzma_easy_buffer_encode(level, LZMA_CHECK_CRC32, NULL,
685 (uint8_t *)data, size,
686 (uint8_t *)out, cdata_size,
687 out_size))
688 return NULL;
689
690 return out;
691 }
692
lzma_mem_inflate(char * cdata,size_t csize,size_t * size)693 static char *lzma_mem_inflate(char *cdata, size_t csize, size_t *size) {
694 lzma_stream strm = LZMA_STREAM_INIT;
695 size_t out_size = 0, out_pos = 0;
696 char *out = NULL;
697 int r;
698
699 /* Initiate the decoder */
700 if (LZMA_OK != lzma_stream_decoder(&strm, lzma_easy_decoder_memusage(9), 0))
701 return NULL;
702
703 /* Decode loop */
704 strm.avail_in = csize;
705 strm.next_in = (uint8_t *)cdata;
706
707 for (;strm.avail_in;) {
708 if (strm.avail_in > out_size - out_pos) {
709 out_size += strm.avail_in * 4 + 32768;
710 out = realloc(out, out_size);
711 }
712 strm.avail_out = out_size - out_pos;
713 strm.next_out = (uint8_t *)&out[out_pos];
714
715 r = lzma_code(&strm, LZMA_RUN);
716 if (LZMA_OK != r && LZMA_STREAM_END != r) {
717 hts_log_error("LZMA decode failure (error %d)", r);
718 return NULL;
719 }
720
721 out_pos = strm.total_out;
722
723 if (r == LZMA_STREAM_END)
724 break;
725 }
726
727 /* finish up any unflushed data; necessary? */
728 r = lzma_code(&strm, LZMA_FINISH);
729 if (r != LZMA_OK && r != LZMA_STREAM_END) {
730 hts_log_error("Call to lzma_code failed with error %d", r);
731 return NULL;
732 }
733
734 out = realloc(out, strm.total_out);
735 *size = strm.total_out;
736
737 lzma_end(&strm);
738
739 return out;
740 }
741 #endif
742
743 /* ----------------------------------------------------------------------
744 * CRAM blocks - the dynamically growable data block. We have code to
745 * create, update, (un)compress and read/write.
746 *
747 * These are derived from the deflate_interlaced.c blocks, but with the
748 * CRAM extension of content types and IDs.
749 */
750
751 /*
752 * Allocates a new cram_block structure with a specified content_type and
753 * id.
754 *
755 * Returns block pointer on success
756 * NULL on failure
757 */
cram_new_block(enum cram_content_type content_type,int content_id)758 cram_block *cram_new_block(enum cram_content_type content_type,
759 int content_id) {
760 cram_block *b = malloc(sizeof(*b));
761 if (!b)
762 return NULL;
763 b->method = b->orig_method = RAW;
764 b->content_type = content_type;
765 b->content_id = content_id;
766 b->comp_size = 0;
767 b->uncomp_size = 0;
768 b->data = NULL;
769 b->alloc = 0;
770 b->byte = 0;
771 b->bit = 7; // MSB
772
773 return b;
774 }
775
776 /*
777 * Reads a block from a cram file.
778 * Returns cram_block pointer on success.
779 * NULL on failure
780 */
cram_read_block(cram_fd * fd)781 cram_block *cram_read_block(cram_fd *fd) {
782 cram_block *b = malloc(sizeof(*b));
783 unsigned char c;
784 uint32_t crc = 0;
785 if (!b)
786 return NULL;
787
788 //fprintf(stderr, "Block at %d\n", (int)ftell(fd->fp));
789
790 if (-1 == (b->method = hgetc(fd->fp))) { free(b); return NULL; }
791 c = b->method; crc = crc32(crc, &c, 1);
792 if (-1 == (b->content_type= hgetc(fd->fp))) { free(b); return NULL; }
793 c = b->content_type; crc = crc32(crc, &c, 1);
794 if (-1 == itf8_decode_crc(fd, &b->content_id, &crc)) { free(b); return NULL; }
795 if (-1 == itf8_decode_crc(fd, &b->comp_size, &crc)) { free(b); return NULL; }
796 if (-1 == itf8_decode_crc(fd, &b->uncomp_size, &crc)) { free(b); return NULL; }
797
798 //fprintf(stderr, " method %d, ctype %d, cid %d, csize %d, ucsize %d\n",
799 // b->method, b->content_type, b->content_id, b->comp_size, b->uncomp_size);
800
801 if (b->method == RAW) {
802 if (b->uncomp_size < 0 || b->comp_size != b->uncomp_size) {
803 free(b);
804 return NULL;
805 }
806 b->alloc = b->uncomp_size;
807 if (!(b->data = malloc(b->uncomp_size))){ free(b); return NULL; }
808 if (b->uncomp_size != hread(fd->fp, b->data, b->uncomp_size)) {
809 free(b->data);
810 free(b);
811 return NULL;
812 }
813 } else {
814 if (b->comp_size < 0) { free(b); return NULL; }
815 b->alloc = b->comp_size;
816 if (!(b->data = malloc(b->comp_size))) { free(b); return NULL; }
817 if (b->comp_size != hread(fd->fp, b->data, b->comp_size)) {
818 free(b->data);
819 free(b);
820 return NULL;
821 }
822 }
823
824 if (CRAM_MAJOR_VERS(fd->version) >= 3) {
825 if (-1 == int32_decode(fd, (int32_t *)&b->crc32)) {
826 free(b);
827 return NULL;
828 }
829
830 crc = crc32(crc, b->data ? b->data : (uc *)"", b->alloc);
831 if (crc != b->crc32) {
832 hts_log_error("Block CRC32 failure");
833 free(b->data);
834 free(b);
835 return NULL;
836 }
837 }
838
839 b->orig_method = b->method;
840 b->idx = 0;
841 b->byte = 0;
842 b->bit = 7; // MSB
843
844 return b;
845 }
846
847
848 /*
849 * Computes the size of a cram block, including the block
850 * header itself.
851 */
cram_block_size(cram_block * b)852 uint32_t cram_block_size(cram_block *b) {
853 unsigned char dat[100], *cp = dat;;
854 uint32_t sz;
855
856 *cp++ = b->method;
857 *cp++ = b->content_type;
858 cp += itf8_put((char*)cp, b->content_id);
859 cp += itf8_put((char*)cp, b->comp_size);
860 cp += itf8_put((char*)cp, b->uncomp_size);
861
862 sz = cp-dat + 4;
863 sz += b->method == RAW ? b->uncomp_size : b->comp_size;
864
865 return sz;
866 }
867
868 /*
869 * Writes a CRAM block.
870 * Returns 0 on success
871 * -1 on failure
872 */
cram_write_block(cram_fd * fd,cram_block * b)873 int cram_write_block(cram_fd *fd, cram_block *b) {
874 assert(b->method != RAW || (b->comp_size == b->uncomp_size));
875
876 if (hputc(b->method, fd->fp) == EOF) return -1;
877 if (hputc(b->content_type, fd->fp) == EOF) return -1;
878 if (itf8_encode(fd, b->content_id) == -1) return -1;
879 if (itf8_encode(fd, b->comp_size) == -1) return -1;
880 if (itf8_encode(fd, b->uncomp_size) == -1) return -1;
881
882 if (b->data) {
883 if (b->method == RAW) {
884 if (b->uncomp_size != hwrite(fd->fp, b->data, b->uncomp_size))
885 return -1;
886 } else {
887 if (b->comp_size != hwrite(fd->fp, b->data, b->comp_size))
888 return -1;
889 }
890 } else {
891 // Absent blocks should be size 0
892 assert(b->method == RAW && b->uncomp_size == 0);
893 }
894
895 if (CRAM_MAJOR_VERS(fd->version) >= 3) {
896 unsigned char dat[100], *cp = dat;;
897 uint32_t crc;
898
899 *cp++ = b->method;
900 *cp++ = b->content_type;
901 cp += itf8_put((char*)cp, b->content_id);
902 cp += itf8_put((char*)cp, b->comp_size);
903 cp += itf8_put((char*)cp, b->uncomp_size);
904 crc = crc32(0L, dat, cp-dat);
905
906 if (b->method == RAW) {
907 b->crc32 = crc32(crc, b->data ? b->data : (uc*)"", b->uncomp_size);
908 } else {
909 b->crc32 = crc32(crc, b->data ? b->data : (uc*)"", b->comp_size);
910 }
911
912 if (-1 == int32_encode(fd, b->crc32))
913 return -1;
914 }
915
916 return 0;
917 }
918
919 /*
920 * Frees a CRAM block, deallocating internal data too.
921 */
cram_free_block(cram_block * b)922 void cram_free_block(cram_block *b) {
923 if (!b)
924 return;
925 if (b->data)
926 free(b->data);
927 free(b);
928 }
929
930 /*
931 * Uncompresses a CRAM block, if compressed.
932 */
cram_uncompress_block(cram_block * b)933 int cram_uncompress_block(cram_block *b) {
934 char *uncomp;
935 size_t uncomp_size = 0;
936
937 if (b->uncomp_size == 0) {
938 // blank block
939 b->method = RAW;
940 return 0;
941 }
942
943 switch (b->method) {
944 case RAW:
945 return 0;
946
947 case GZIP:
948 uncomp = zlib_mem_inflate((char *)b->data, b->comp_size, &uncomp_size);
949 if (!uncomp)
950 return -1;
951 if ((int)uncomp_size != b->uncomp_size) {
952 free(uncomp);
953 return -1;
954 }
955 free(b->data);
956 b->data = (unsigned char *)uncomp;
957 b->alloc = uncomp_size;
958 b->method = RAW;
959 break;
960
961 #ifdef HAVE_LIBBZ2
962 case BZIP2: {
963 unsigned int usize = b->uncomp_size;
964 if (!(uncomp = malloc(usize)))
965 return -1;
966 if (BZ_OK != BZ2_bzBuffToBuffDecompress(uncomp, &usize,
967 (char *)b->data, b->comp_size,
968 0, 0)) {
969 free(uncomp);
970 return -1;
971 }
972 free(b->data);
973 b->data = (unsigned char *)uncomp;
974 b->alloc = usize;
975 b->method = RAW;
976 b->uncomp_size = usize; // Just incase it differs
977 break;
978 }
979 #else
980 case BZIP2:
981 hts_log_error("Bzip2 compression is not compiled into this version. Please rebuild and try again");
982 return -1;
983 #endif
984
985 #ifdef HAVE_LIBLZMA
986 case LZMA:
987 uncomp = lzma_mem_inflate((char *)b->data, b->comp_size, &uncomp_size);
988 if (!uncomp)
989 return -1;
990 if ((int)uncomp_size != b->uncomp_size)
991 return -1;
992 free(b->data);
993 b->data = (unsigned char *)uncomp;
994 b->alloc = uncomp_size;
995 b->method = RAW;
996 break;
997 #else
998 case LZMA:
999 hts_log_error("Lzma compression is not compiled into this version. Please rebuild and try again");
1000 return -1;
1001 break;
1002 #endif
1003
1004 case RANS: {
1005 unsigned int usize = b->uncomp_size, usize2;
1006 uncomp = (char *)rans_uncompress(b->data, b->comp_size, &usize2);
1007 if (!uncomp || usize != usize2)
1008 return -1;
1009 free(b->data);
1010 b->data = (unsigned char *)uncomp;
1011 b->alloc = usize2;
1012 b->method = RAW;
1013 b->uncomp_size = usize2; // Just incase it differs
1014 //fprintf(stderr, "Expanded %d to %d\n", b->comp_size, b->uncomp_size);
1015 break;
1016 }
1017
1018 default:
1019 return -1;
1020 }
1021
1022 return 0;
1023 }
1024
cram_compress_by_method(char * in,size_t in_size,size_t * out_size,enum cram_block_method method,int level,int strat)1025 static char *cram_compress_by_method(char *in, size_t in_size,
1026 size_t *out_size,
1027 enum cram_block_method method,
1028 int level, int strat) {
1029 switch (method) {
1030 case GZIP:
1031 return zlib_mem_deflate(in, in_size, out_size, level, strat);
1032
1033 case BZIP2: {
1034 #ifdef HAVE_LIBBZ2
1035 unsigned int comp_size = in_size*1.01 + 600;
1036 char *comp = malloc(comp_size);
1037 if (!comp)
1038 return NULL;
1039
1040 if (BZ_OK != BZ2_bzBuffToBuffCompress(comp, &comp_size,
1041 in, in_size,
1042 level, 0, 30)) {
1043 free(comp);
1044 return NULL;
1045 }
1046 *out_size = comp_size;
1047 return comp;
1048 #else
1049 return NULL;
1050 #endif
1051 }
1052
1053 case LZMA:
1054 #ifdef HAVE_LIBLZMA
1055 return lzma_mem_deflate(in, in_size, out_size, level);
1056 #else
1057 return NULL;
1058 #endif
1059
1060 case RANS0: {
1061 unsigned int out_size_i;
1062 unsigned char *cp;
1063 cp = rans_compress((unsigned char *)in, in_size, &out_size_i, 0);
1064 *out_size = out_size_i;
1065 return (char *)cp;
1066 }
1067
1068 case RANS1: {
1069 unsigned int out_size_i;
1070 unsigned char *cp;
1071
1072 cp = rans_compress((unsigned char *)in, in_size, &out_size_i, 1);
1073 *out_size = out_size_i;
1074 return (char *)cp;
1075 }
1076
1077 case RAW:
1078 break;
1079
1080 default:
1081 return NULL;
1082 }
1083
1084 return NULL;
1085 }
1086
1087
1088 /*
1089 * Compresses a block using one of two different zlib strategies. If we only
1090 * want one choice set strat2 to be -1.
1091 *
1092 * The logic here is that sometimes Z_RLE does a better job than Z_FILTERED
1093 * or Z_DEFAULT_STRATEGY on quality data. If so, we'd rather use it as it is
1094 * significantly faster.
1095 *
1096 * Method and level -1 implies defaults, as specified in cram_fd.
1097 */
cram_compress_block(cram_fd * fd,cram_block * b,cram_metrics * metrics,int method,int level)1098 int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics,
1099 int method, int level) {
1100
1101 char *comp = NULL;
1102 size_t comp_size = 0;
1103 int strat;
1104
1105 if (b->method != RAW) {
1106 // Maybe already compressed if s->block[0] was compressed and
1107 // we have e.g. s->block[DS_BA] set to s->block[0] due to only
1108 // one base type present and hence using E_HUFFMAN on block 0.
1109 // A second explicit attempt to compress the same block then
1110 // occurs.
1111 return 0;
1112 }
1113
1114 if (method == -1) {
1115 method = 1<<GZIP;
1116 if (fd->use_bz2)
1117 method |= 1<<BZIP2;
1118 if (fd->use_lzma)
1119 method |= 1<<LZMA;
1120 }
1121
1122 if (level == -1)
1123 level = fd->level;
1124
1125 //fprintf(stderr, "IN: block %d, sz %d\n", b->content_id, b->uncomp_size);
1126
1127 if (method == RAW || level == 0 || b->uncomp_size == 0) {
1128 b->method = RAW;
1129 b->comp_size = b->uncomp_size;
1130 //fprintf(stderr, "Skip block id %d\n", b->content_id);
1131 return 0;
1132 }
1133
1134 if (metrics) {
1135 pthread_mutex_lock(&fd->metrics_lock);
1136 if (metrics->trial > 0 || --metrics->next_trial <= 0) {
1137 size_t sz_best = INT_MAX;
1138 size_t sz_gz_rle = 0;
1139 size_t sz_gz_def = 0;
1140 size_t sz_rans0 = 0;
1141 size_t sz_rans1 = 0;
1142 size_t sz_bzip2 = 0;
1143 size_t sz_lzma = 0;
1144 int method_best = 0;
1145 char *c_best = NULL, *c = NULL;
1146
1147 if (metrics->revised_method)
1148 method = metrics->revised_method;
1149 else
1150 metrics->revised_method = method;
1151
1152 if (metrics->next_trial <= 0) {
1153 metrics->next_trial = TRIAL_SPAN;
1154 metrics->trial = NTRIALS;
1155 metrics->sz_gz_rle /= 2;
1156 metrics->sz_gz_def /= 2;
1157 metrics->sz_rans0 /= 2;
1158 metrics->sz_rans1 /= 2;
1159 metrics->sz_bzip2 /= 2;
1160 metrics->sz_lzma /= 2;
1161 }
1162
1163 pthread_mutex_unlock(&fd->metrics_lock);
1164
1165 if (method & (1<<GZIP_RLE)) {
1166 c = cram_compress_by_method((char *)b->data, b->uncomp_size,
1167 &sz_gz_rle, GZIP, 1, Z_RLE);
1168 if (c && sz_best > sz_gz_rle) {
1169 sz_best = sz_gz_rle;
1170 method_best = GZIP_RLE;
1171 if (c_best)
1172 free(c_best);
1173 c_best = c;
1174 } else if (c) {
1175 free(c);
1176 } else {
1177 sz_gz_rle = b->uncomp_size*2+1000;
1178 }
1179
1180 //fprintf(stderr, "Block %d; %d->%d\n", b->content_id, b->uncomp_size, sz_gz_rle);
1181 }
1182
1183 if (method & (1<<GZIP)) {
1184 c = cram_compress_by_method((char *)b->data, b->uncomp_size,
1185 &sz_gz_def, GZIP, level,
1186 Z_FILTERED);
1187 if (c && sz_best > sz_gz_def) {
1188 sz_best = sz_gz_def;
1189 method_best = GZIP;
1190 if (c_best)
1191 free(c_best);
1192 c_best = c;
1193 } else if (c) {
1194 free(c);
1195 } else {
1196 sz_gz_def = b->uncomp_size*2+1000;
1197 }
1198
1199 //fprintf(stderr, "Block %d; %d->%d\n", b->content_id, b->uncomp_size, sz_gz_def);
1200 }
1201
1202 if (method & (1<<RANS0)) {
1203 c = cram_compress_by_method((char *)b->data, b->uncomp_size,
1204 &sz_rans0, RANS0, 0, 0);
1205 if (c && sz_best > sz_rans0) {
1206 sz_best = sz_rans0;
1207 method_best = RANS0;
1208 if (c_best)
1209 free(c_best);
1210 c_best = c;
1211 } else if (c) {
1212 free(c);
1213 } else {
1214 sz_rans0 = b->uncomp_size*2+1000;
1215 }
1216 }
1217
1218 if (method & (1<<RANS1)) {
1219 c = cram_compress_by_method((char *)b->data, b->uncomp_size,
1220 &sz_rans1, RANS1, 0, 0);
1221 if (c && sz_best > sz_rans1) {
1222 sz_best = sz_rans1;
1223 method_best = RANS1;
1224 if (c_best)
1225 free(c_best);
1226 c_best = c;
1227 } else if (c) {
1228 free(c);
1229 } else {
1230 sz_rans1 = b->uncomp_size*2+1000;
1231 }
1232 }
1233
1234 if (method & (1<<BZIP2)) {
1235 c = cram_compress_by_method((char *)b->data, b->uncomp_size,
1236 &sz_bzip2, BZIP2, level, 0);
1237 if (c && sz_best > sz_bzip2) {
1238 sz_best = sz_bzip2;
1239 method_best = BZIP2;
1240 if (c_best)
1241 free(c_best);
1242 c_best = c;
1243 } else if (c) {
1244 free(c);
1245 } else {
1246 sz_bzip2 = b->uncomp_size*2+1000;
1247 }
1248 }
1249
1250 if (method & (1<<LZMA)) {
1251 c = cram_compress_by_method((char *)b->data, b->uncomp_size,
1252 &sz_lzma, LZMA, level, 0);
1253 if (c && sz_best > sz_lzma) {
1254 sz_best = sz_lzma;
1255 method_best = LZMA;
1256 if (c_best)
1257 free(c_best);
1258 c_best = c;
1259 } else if (c) {
1260 free(c);
1261 } else {
1262 sz_lzma = b->uncomp_size*2+1000;
1263 }
1264 }
1265
1266 //fprintf(stderr, "sz_best = %d\n", sz_best);
1267
1268 free(b->data);
1269 b->data = (unsigned char *)c_best;
1270 //printf("method_best = %s\n", cram_block_method2str(method_best));
1271 b->method = method_best == GZIP_RLE ? GZIP : method_best;
1272 b->comp_size = sz_best;
1273
1274 pthread_mutex_lock(&fd->metrics_lock);
1275 metrics->sz_gz_rle += sz_gz_rle;
1276 metrics->sz_gz_def += sz_gz_def;
1277 metrics->sz_rans0 += sz_rans0;
1278 metrics->sz_rans1 += sz_rans1;
1279 metrics->sz_bzip2 += sz_bzip2;
1280 metrics->sz_lzma += sz_lzma;
1281 if (--metrics->trial == 0) {
1282 int best_method = RAW;
1283 int best_sz = INT_MAX;
1284
1285 // Scale methods by cost
1286 if (fd->level <= 3) {
1287 metrics->sz_rans1 *= 1.02;
1288 metrics->sz_gz_def *= 1.04;
1289 metrics->sz_bzip2 *= 1.08;
1290 metrics->sz_lzma *= 1.10;
1291 } else if (fd->level <= 6) {
1292 metrics->sz_rans1 *= 1.01;
1293 metrics->sz_gz_def *= 1.02;
1294 metrics->sz_bzip2 *= 1.03;
1295 metrics->sz_lzma *= 1.05;
1296 }
1297
1298 if (method & (1<<GZIP_RLE) && best_sz > metrics->sz_gz_rle)
1299 best_sz = metrics->sz_gz_rle, best_method = GZIP_RLE;
1300
1301 if (method & (1<<GZIP) && best_sz > metrics->sz_gz_def)
1302 best_sz = metrics->sz_gz_def, best_method = GZIP;
1303
1304 if (method & (1<<RANS0) && best_sz > metrics->sz_rans0)
1305 best_sz = metrics->sz_rans0, best_method = RANS0;
1306
1307 if (method & (1<<RANS1) && best_sz > metrics->sz_rans1)
1308 best_sz = metrics->sz_rans1, best_method = RANS1;
1309
1310 if (method & (1<<BZIP2) && best_sz > metrics->sz_bzip2)
1311 best_sz = metrics->sz_bzip2, best_method = BZIP2;
1312
1313 if (method & (1<<LZMA) && best_sz > metrics->sz_lzma)
1314 best_sz = metrics->sz_lzma, best_method = LZMA;
1315
1316 if (best_method == GZIP_RLE) {
1317 metrics->method = GZIP;
1318 metrics->strat = Z_RLE;
1319 } else {
1320 metrics->method = best_method;
1321 metrics->strat = Z_FILTERED;
1322 }
1323
1324 // If we see at least MAXFAIL trials in a row for a specific
1325 // compression method with more than MAXDELTA aggregate
1326 // size then we drop this from the list of methods used
1327 // for this block type.
1328 #define MAXDELTA 0.20
1329 #define MAXFAILS 4
1330 if (best_method == GZIP_RLE) {
1331 metrics->gz_rle_cnt = 0;
1332 metrics->gz_rle_extra = 0;
1333 } else if (best_sz < metrics->sz_gz_rle) {
1334 double r = (double)metrics->sz_gz_rle / best_sz - 1;
1335 if (++metrics->gz_rle_cnt >= MAXFAILS &&
1336 (metrics->gz_rle_extra += r) >= MAXDELTA)
1337 method &= ~(1<<GZIP_RLE);
1338 }
1339
1340 if (best_method == GZIP) {
1341 metrics->gz_def_cnt = 0;
1342 metrics->gz_def_extra = 0;
1343 } else if (best_sz < metrics->sz_gz_def) {
1344 double r = (double)metrics->sz_gz_def / best_sz - 1;
1345 if (++metrics->gz_def_cnt >= MAXFAILS &&
1346 (metrics->gz_def_extra += r) >= MAXDELTA)
1347 method &= ~(1<<GZIP);
1348 }
1349
1350 if (best_method == RANS0) {
1351 metrics->rans0_cnt = 0;
1352 metrics->rans0_extra = 0;
1353 } else if (best_sz < metrics->sz_rans0) {
1354 double r = (double)metrics->sz_rans0 / best_sz - 1;
1355 if (++metrics->rans0_cnt >= MAXFAILS &&
1356 (metrics->rans0_extra += r) >= MAXDELTA)
1357 method &= ~(1<<RANS0);
1358 }
1359
1360 if (best_method == RANS1) {
1361 metrics->rans1_cnt = 0;
1362 metrics->rans1_extra = 0;
1363 } else if (best_sz < metrics->sz_rans1) {
1364 double r = (double)metrics->sz_rans1 / best_sz - 1;
1365 if (++metrics->rans1_cnt >= MAXFAILS &&
1366 (metrics->rans1_extra += r) >= MAXDELTA)
1367 method &= ~(1<<RANS1);
1368 }
1369
1370 if (best_method == BZIP2) {
1371 metrics->bzip2_cnt = 0;
1372 metrics->bzip2_extra = 0;
1373 } else if (best_sz < metrics->sz_bzip2) {
1374 double r = (double)metrics->sz_bzip2 / best_sz - 1;
1375 if (++metrics->bzip2_cnt >= MAXFAILS &&
1376 (metrics->bzip2_extra += r) >= MAXDELTA)
1377 method &= ~(1<<BZIP2);
1378 }
1379
1380 if (best_method == LZMA) {
1381 metrics->lzma_cnt = 0;
1382 metrics->lzma_extra = 0;
1383 } else if (best_sz < metrics->sz_lzma) {
1384 double r = (double)metrics->sz_lzma / best_sz - 1;
1385 if (++metrics->lzma_cnt >= MAXFAILS &&
1386 (metrics->lzma_extra += r) >= MAXDELTA)
1387 method &= ~(1<<LZMA);
1388 }
1389
1390 //if (method != metrics->revised_method)
1391 // fprintf(stderr, "%d: method from %x to %x\n",
1392 // b->content_id, metrics->revised_method, method);
1393 metrics->revised_method = method;
1394 }
1395 pthread_mutex_unlock(&fd->metrics_lock);
1396 } else {
1397 strat = metrics->strat;
1398 method = metrics->method;
1399
1400 pthread_mutex_unlock(&fd->metrics_lock);
1401 comp = cram_compress_by_method((char *)b->data, b->uncomp_size,
1402 &comp_size, method,
1403 level, strat);
1404 if (!comp)
1405 return -1;
1406 free(b->data);
1407 b->data = (unsigned char *)comp;
1408 b->comp_size = comp_size;
1409 b->method = method;
1410 }
1411
1412 } else {
1413 // no cached metrics, so just do zlib?
1414 comp = cram_compress_by_method((char *)b->data, b->uncomp_size,
1415 &comp_size, GZIP, level, Z_FILTERED);
1416 if (!comp) {
1417 hts_log_error("Compression failed");
1418 return -1;
1419 }
1420 free(b->data);
1421 b->data = (unsigned char *)comp;
1422 b->comp_size = comp_size;
1423 b->method = GZIP;
1424 }
1425
1426 hts_log_info("Compressed block ID %d from %d to %d by method %s",
1427 b->content_id, b->uncomp_size, b->comp_size,
1428 cram_block_method2str(b->method));
1429
1430 if (b->method == RANS1)
1431 b->method = RANS0; // Spec just has RANS (not 0/1) with auto-sensing
1432
1433 return 0;
1434 }
1435
cram_new_metrics(void)1436 cram_metrics *cram_new_metrics(void) {
1437 cram_metrics *m = calloc(1, sizeof(*m));
1438 if (!m)
1439 return NULL;
1440 m->trial = NTRIALS-1;
1441 m->next_trial = TRIAL_SPAN;
1442 m->method = RAW;
1443 m->strat = 0;
1444 m->revised_method = 0;
1445
1446 return m;
1447 }
1448
cram_block_method2str(enum cram_block_method m)1449 char *cram_block_method2str(enum cram_block_method m) {
1450 switch(m) {
1451 case RAW: return "RAW";
1452 case GZIP: return "GZIP";
1453 case BZIP2: return "BZIP2";
1454 case LZMA: return "LZMA";
1455 case RANS0: return "RANS0";
1456 case RANS1: return "RANS1";
1457 case GZIP_RLE: return "GZIP_RLE";
1458 case BM_ERROR: break;
1459 }
1460 return "?";
1461 }
1462
cram_content_type2str(enum cram_content_type t)1463 char *cram_content_type2str(enum cram_content_type t) {
1464 switch (t) {
1465 case FILE_HEADER: return "FILE_HEADER";
1466 case COMPRESSION_HEADER: return "COMPRESSION_HEADER";
1467 case MAPPED_SLICE: return "MAPPED_SLICE";
1468 case UNMAPPED_SLICE: return "UNMAPPED_SLICE";
1469 case EXTERNAL: return "EXTERNAL";
1470 case CORE: return "CORE";
1471 case CT_ERROR: break;
1472 }
1473 return "?";
1474 }
1475
1476 /* ----------------------------------------------------------------------
1477 * Reference sequence handling
1478 *
1479 * These revolve around the refs_t structure, which may potentially be
1480 * shared between multiple cram_fd.
1481 *
1482 * We start with refs_create() to allocate an empty refs_t and then
1483 * populate it with @SQ line data using refs_from_header(). This is done on
1484 * cram_open(). Also at start up we can call cram_load_reference() which
1485 * is used with "scramble -r foo.fa". This replaces the fd->refs with the
1486 * new one specified. In either case refs2id() is then called which
1487 * maps ref_entry names to @SQ ids (refs_t->ref_id[]).
1488 *
1489 * Later, possibly within a thread, we will want to know the actual ref
1490 * seq itself, obtained by calling cram_get_ref(). This may use the
1491 * UR: or M5: fields or the filename specified in the original
1492 * cram_load_reference() call.
1493 *
1494 * Given the potential for multi-threaded reference usage, we have
1495 * reference counting (sorry for the confusing double use of "ref") to
1496 * track the number of callers interested in any specific reference.
1497 */
1498
1499 /*
1500 * Frees/unmaps a reference sequence and associated file handles.
1501 */
ref_entry_free_seq(ref_entry * e)1502 static void ref_entry_free_seq(ref_entry *e) {
1503 if (e->mf)
1504 mfclose(e->mf);
1505 if (e->seq && !e->mf)
1506 free(e->seq);
1507
1508 e->seq = NULL;
1509 e->mf = NULL;
1510 }
1511
refs_free(refs_t * r)1512 void refs_free(refs_t *r) {
1513 RP("refs_free()\n");
1514
1515 if (--r->count > 0)
1516 return;
1517
1518 if (!r)
1519 return;
1520
1521 if (r->pool)
1522 string_pool_destroy(r->pool);
1523
1524 if (r->h_meta) {
1525 khint_t k;
1526
1527 for (k = kh_begin(r->h_meta); k != kh_end(r->h_meta); k++) {
1528 ref_entry *e;
1529
1530 if (!kh_exist(r->h_meta, k))
1531 continue;
1532 if (!(e = kh_val(r->h_meta, k)))
1533 continue;
1534 ref_entry_free_seq(e);
1535 free(e);
1536 }
1537
1538 kh_destroy(refs, r->h_meta);
1539 }
1540
1541 if (r->ref_id)
1542 free(r->ref_id);
1543
1544 if (r->fp)
1545 bgzf_close(r->fp);
1546
1547 pthread_mutex_destroy(&r->lock);
1548
1549 free(r);
1550 }
1551
refs_create(void)1552 static refs_t *refs_create(void) {
1553 refs_t *r = calloc(1, sizeof(*r));
1554
1555 RP("refs_create()\n");
1556
1557 if (!r)
1558 return NULL;
1559
1560 if (!(r->pool = string_pool_create(8192)))
1561 goto err;
1562
1563 r->ref_id = NULL; // see refs2id() to populate.
1564 r->count = 1;
1565 r->last = NULL;
1566 r->last_id = -1;
1567
1568 if (!(r->h_meta = kh_init(refs)))
1569 goto err;
1570
1571 pthread_mutex_init(&r->lock, NULL);
1572
1573 return r;
1574
1575 err:
1576 refs_free(r);
1577 return NULL;
1578 }
1579
1580 /*
1581 * Opens a reference fasta file as a BGZF stream, allowing for
1582 * compressed files. It automatically builds a .fai file if
1583 * required and if compressed a .gzi bgzf index too.
1584 *
1585 * Returns a BGZF handle on success;
1586 * NULL on failure.
1587 */
bgzf_open_ref(char * fn,char * mode,int is_md5)1588 static BGZF *bgzf_open_ref(char *fn, char *mode, int is_md5) {
1589 BGZF *fp;
1590
1591 if (!is_md5) {
1592 char fai_file[PATH_MAX];
1593
1594 snprintf(fai_file, PATH_MAX, "%s.fai", fn);
1595 if (access(fai_file, R_OK) != 0)
1596 if (fai_build(fn) != 0)
1597 return NULL;
1598 }
1599
1600 if (!(fp = bgzf_open(fn, mode))) {
1601 perror(fn);
1602 return NULL;
1603 }
1604
1605 if (fp->is_compressed == 1 && bgzf_index_load(fp, fn, ".gzi") < 0) {
1606 hts_log_error("Unable to load .gzi index '%s.gzi'", fn);
1607 bgzf_close(fp);
1608 return NULL;
1609 }
1610
1611 return fp;
1612 }
1613
1614 /*
1615 * Loads a FAI file for a reference.fasta.
1616 * "is_err" indicates whether failure to load is worthy of emitting an
1617 * error message. In some cases (eg with embedded references) we
1618 * speculatively load, just incase, and silently ignore errors.
1619 *
1620 * Returns the refs_t struct on success (maybe newly allocated);
1621 * NULL on failure
1622 */
refs_load_fai(refs_t * r_orig,char * fn,int is_err)1623 static refs_t *refs_load_fai(refs_t *r_orig, char *fn, int is_err) {
1624 struct stat sb;
1625 FILE *fp = NULL;
1626 char fai_fn[PATH_MAX];
1627 char line[8192];
1628 refs_t *r = r_orig;
1629 size_t fn_l = strlen(fn);
1630 int id = 0, id_alloc = 0;
1631
1632 RP("refs_load_fai %s\n", fn);
1633
1634 if (!r)
1635 if (!(r = refs_create()))
1636 goto err;
1637
1638 /* Open reference, for later use */
1639 if (stat(fn, &sb) != 0) {
1640 if (is_err)
1641 perror(fn);
1642 goto err;
1643 }
1644
1645 if (r->fp)
1646 if (bgzf_close(r->fp) != 0)
1647 goto err;
1648 r->fp = NULL;
1649
1650 if (!(r->fn = string_dup(r->pool, fn)))
1651 goto err;
1652
1653 if (fn_l > 4 && strcmp(&fn[fn_l-4], ".fai") == 0)
1654 r->fn[fn_l-4] = 0;
1655
1656 if (!(r->fp = bgzf_open_ref(r->fn, "r", 0)))
1657 goto err;
1658
1659 /* Parse .fai file and load meta-data */
1660 sprintf(fai_fn, "%.*s.fai", PATH_MAX-5, r->fn);
1661
1662 if (stat(fai_fn, &sb) != 0) {
1663 if (is_err)
1664 perror(fai_fn);
1665 goto err;
1666 }
1667 if (!(fp = fopen(fai_fn, "r"))) {
1668 if (is_err)
1669 perror(fai_fn);
1670 goto err;
1671 }
1672 while (fgets(line, 8192, fp) != NULL) {
1673 ref_entry *e = malloc(sizeof(*e));
1674 char *cp;
1675 int n;
1676 khint_t k;
1677
1678 if (!e)
1679 return NULL;
1680
1681 // id
1682 for (cp = line; *cp && !isspace_c(*cp); cp++)
1683 ;
1684 *cp++ = 0;
1685 e->name = string_dup(r->pool, line);
1686
1687 // length
1688 while (*cp && isspace_c(*cp))
1689 cp++;
1690 e->length = strtoll(cp, &cp, 10);
1691
1692 // offset
1693 while (*cp && isspace_c(*cp))
1694 cp++;
1695 e->offset = strtoll(cp, &cp, 10);
1696
1697 // bases per line
1698 while (*cp && isspace_c(*cp))
1699 cp++;
1700 e->bases_per_line = strtol(cp, &cp, 10);
1701
1702 // line length
1703 while (*cp && isspace_c(*cp))
1704 cp++;
1705 e->line_length = strtol(cp, &cp, 10);
1706
1707 // filename
1708 e->fn = r->fn;
1709
1710 e->count = 0;
1711 e->seq = NULL;
1712 e->mf = NULL;
1713 e->is_md5 = 0;
1714
1715 k = kh_put(refs, r->h_meta, e->name, &n);
1716 if (-1 == n) {
1717 free(e);
1718 return NULL;
1719 }
1720
1721 if (n) {
1722 kh_val(r->h_meta, k) = e;
1723 } else {
1724 ref_entry *re = kh_val(r->h_meta, k);
1725 if (re && (re->count != 0 || re->length != 0)) {
1726 /* Keep old */
1727 free(e);
1728 } else {
1729 /* Replace old */
1730 if (re)
1731 free(re);
1732 kh_val(r->h_meta, k) = e;
1733 }
1734 }
1735
1736 if (id >= id_alloc) {
1737 int x;
1738
1739 id_alloc = id_alloc ?id_alloc*2 : 16;
1740 r->ref_id = realloc(r->ref_id, id_alloc * sizeof(*r->ref_id));
1741
1742 for (x = id; x < id_alloc; x++)
1743 r->ref_id[x] = NULL;
1744 }
1745 r->ref_id[id] = e;
1746 r->nref = ++id;
1747 }
1748
1749 fclose(fp);
1750 return r;
1751
1752 err:
1753 if (fp)
1754 fclose(fp);
1755
1756 if (!r_orig)
1757 refs_free(r);
1758
1759 return NULL;
1760 }
1761
1762 /*
1763 * Verifies that the CRAM @SQ lines and .fai files match.
1764 */
sanitise_SQ_lines(cram_fd * fd)1765 static void sanitise_SQ_lines(cram_fd *fd) {
1766 int i;
1767
1768 if (!fd->header)
1769 return;
1770
1771 if (!fd->refs || !fd->refs->h_meta)
1772 return;
1773
1774 for (i = 0; i < fd->header->nref; i++) {
1775 char *name = fd->header->ref[i].name;
1776 khint_t k = kh_get(refs, fd->refs->h_meta, name);
1777 ref_entry *r;
1778
1779 // We may have @SQ lines which have no known .fai, but do not
1780 // in themselves pose a problem because they are unused in the file.
1781 if (k == kh_end(fd->refs->h_meta))
1782 continue;
1783
1784 if (!(r = (ref_entry *)kh_val(fd->refs->h_meta, k)))
1785 continue;
1786
1787 if (r->length && r->length != fd->header->ref[i].len) {
1788 assert(strcmp(r->name, fd->header->ref[i].name) == 0);
1789
1790 // Should we also check MD5sums here to ensure the correct
1791 // reference was given?
1792 hts_log_warning("Header @SQ length mismatch for ref %s, %d vs %d",
1793 r->name, fd->header->ref[i].len, (int)r->length);
1794
1795 // Fixing the parsed @SQ header will make MD:Z: strings work
1796 // and also stop it producing N for the sequence.
1797 fd->header->ref[i].len = r->length;
1798 }
1799 }
1800 }
1801
1802 /*
1803 * Indexes references by the order they appear in a BAM file. This may not
1804 * necessarily be the same order they appear in the fasta reference file.
1805 *
1806 * Returns 0 on success
1807 * -1 on failure
1808 */
refs2id(refs_t * r,SAM_hdr * h)1809 int refs2id(refs_t *r, SAM_hdr *h) {
1810 int i;
1811
1812 if (r->ref_id)
1813 free(r->ref_id);
1814 if (r->last)
1815 r->last = NULL;
1816
1817 r->ref_id = calloc(h->nref, sizeof(*r->ref_id));
1818 if (!r->ref_id)
1819 return -1;
1820
1821 r->nref = h->nref;
1822 for (i = 0; i < h->nref; i++) {
1823 khint_t k = kh_get(refs, r->h_meta, h->ref[i].name);
1824 if (k != kh_end(r->h_meta)) {
1825 r->ref_id[i] = kh_val(r->h_meta, k);
1826 } else {
1827 hts_log_warning("Unable to find ref name '%s'", h->ref[i].name);
1828 }
1829 }
1830
1831 return 0;
1832 }
1833
1834 /*
1835 * Generates refs_t entries based on @SQ lines in the header.
1836 * Returns 0 on success
1837 * -1 on failure
1838 */
refs_from_header(refs_t * r,cram_fd * fd,SAM_hdr * h)1839 static int refs_from_header(refs_t *r, cram_fd *fd, SAM_hdr *h) {
1840 int i, j;
1841
1842 if (!r)
1843 return -1;
1844
1845 if (!h || h->nref == 0)
1846 return 0;
1847
1848 //fprintf(stderr, "refs_from_header for %p mode %c\n", fd, fd->mode);
1849
1850 /* Existing refs are fine, as long as they're compatible with the hdr. */
1851 if (!(r->ref_id = realloc(r->ref_id, (r->nref + h->nref) * sizeof(*r->ref_id))))
1852 return -1;
1853
1854 /* Copy info from h->ref[i] over to r */
1855 for (i = 0, j = r->nref; i < h->nref; i++) {
1856 SAM_hdr_type *ty;
1857 SAM_hdr_tag *tag;
1858 khint_t k;
1859 int n;
1860
1861 k = kh_get(refs, r->h_meta, h->ref[i].name);
1862 if (k != kh_end(r->h_meta))
1863 // Ref already known about
1864 continue;
1865
1866 if (!(r->ref_id[j] = calloc(1, sizeof(ref_entry))))
1867 return -1;
1868
1869 if (!h->ref[i].name)
1870 return -1;
1871
1872 r->ref_id[j]->name = string_dup(r->pool, h->ref[i].name);
1873 r->ref_id[j]->length = 0; // marker for not yet loaded
1874
1875 /* Initialise likely filename if known */
1876 if ((ty = sam_hdr_find(h, "SQ", "SN", h->ref[i].name))) {
1877 if ((tag = sam_hdr_find_key(h, ty, "M5", NULL))) {
1878 r->ref_id[j]->fn = string_dup(r->pool, tag->str+3);
1879 //fprintf(stderr, "Tagging @SQ %s / %s\n", r->ref_id[h]->name, r->ref_id[h]->fn);
1880 }
1881 }
1882
1883 k = kh_put(refs, r->h_meta, r->ref_id[j]->name, &n);
1884 if (n <= 0) // already exists or error
1885 return -1;
1886 kh_val(r->h_meta, k) = r->ref_id[j];
1887
1888 j++;
1889 }
1890 r->nref = j;
1891
1892 return 0;
1893 }
1894
1895 /*
1896 * Attaches a header to a cram_fd.
1897 *
1898 * This should be used when creating a new cram_fd for writing where
1899 * we have an SAM_hdr already constructed (eg from a file we've read
1900 * in).
1901 */
cram_set_header(cram_fd * fd,SAM_hdr * hdr)1902 int cram_set_header(cram_fd *fd, SAM_hdr *hdr) {
1903 if (fd->header)
1904 sam_hdr_free(fd->header);
1905 fd->header = hdr;
1906 return refs_from_header(fd->refs, fd, hdr);
1907 }
1908
1909 /*
1910 * Converts a directory and a filename into an expanded path, replacing %s
1911 * in directory with the filename and %[0-9]+s with portions of the filename
1912 * Any remaining parts of filename are added to the end with /%s.
1913 */
expand_cache_path(char * path,char * dir,char * fn)1914 int expand_cache_path(char *path, char *dir, char *fn) {
1915 char *cp, *start = path;
1916 size_t len;
1917 size_t sz = PATH_MAX;
1918
1919 while ((cp = strchr(dir, '%'))) {
1920 if (cp-dir >= sz) return -1;
1921 strncpy(path, dir, cp-dir);
1922 path += cp-dir;
1923 sz -= cp-dir;
1924
1925 if (*++cp == 's') {
1926 len = strlen(fn);
1927 if (len >= sz) return -1;
1928 strcpy(path, fn);
1929 path += len;
1930 sz -= len;
1931 fn += len;
1932 cp++;
1933 } else if (*cp >= '0' && *cp <= '9') {
1934 char *endp;
1935 long l;
1936
1937 l = strtol(cp, &endp, 10);
1938 l = MIN(l, strlen(fn));
1939 if (*endp == 's') {
1940 if (l >= sz) return -1;
1941 strncpy(path, fn, l);
1942 path += l;
1943 fn += l;
1944 sz -= l;
1945 *path = 0;
1946 cp = endp+1;
1947 } else {
1948 if (sz < 3) return -1;
1949 *path++ = '%';
1950 *path++ = *cp++;
1951 }
1952 } else {
1953 if (sz < 3) return -1;
1954 *path++ = '%';
1955 *path++ = *cp++;
1956 }
1957 dir = cp;
1958 }
1959
1960 len = strlen(dir);
1961 if (len >= sz) return -1;
1962 strcpy(path, dir);
1963 path += len;
1964 sz -= len;
1965
1966 len = strlen(fn) + ((*fn && path > start && path[-1] != '/') ? 1 : 0);
1967 if (len >= sz) return -1;
1968 if (*fn && path > start && path[-1] != '/')
1969 *path++ = '/';
1970 strcpy(path, fn);
1971 return 0;
1972 }
1973
1974 /*
1975 * Make the directory containing path and any prefix directories.
1976 */
mkdir_prefix(char * path,int mode)1977 void mkdir_prefix(char *path, int mode) {
1978 char *cp = strrchr(path, '/');
1979 if (!cp)
1980 return;
1981
1982 *cp = 0;
1983 if (is_directory(path)) {
1984 *cp = '/';
1985 return;
1986 }
1987
1988 if (mkdir(path, mode) == 0) {
1989 chmod(path, mode);
1990 *cp = '/';
1991 return;
1992 }
1993
1994 mkdir_prefix(path, mode);
1995 mkdir(path, mode);
1996 chmod(path, mode);
1997 *cp = '/';
1998 }
1999
2000 /*
2001 * Return the cache directory to use, based on the first of these
2002 * environment variables to be set to a non-empty value.
2003 */
get_cache_basedir(const char ** extra)2004 static const char *get_cache_basedir(const char **extra) {
2005 char *base;
2006
2007 *extra = "";
2008
2009 base = getenv("XDG_CACHE_HOME");
2010 if (base && *base) return base;
2011
2012 base = getenv("HOME");
2013 if (base && *base) { *extra = "/.cache"; return base; }
2014
2015 base = getenv("TMPDIR");
2016 if (base && *base) return base;
2017
2018 base = getenv("TEMP");
2019 if (base && *base) return base;
2020
2021 return "/tmp";
2022 }
2023
2024 /*
2025 * Return an integer representation of pthread_self().
2026 */
get_int_threadid()2027 static unsigned get_int_threadid() {
2028 pthread_t pt = pthread_self();
2029 unsigned char *s = (unsigned char *) &pt;
2030 size_t i;
2031 unsigned h = 0;
2032 for (i = 0; i < sizeof(pthread_t); i++)
2033 h = (h << 5) - h + s[i];
2034 return h;
2035 }
2036
2037 /*
2038 * Queries the M5 string from the header and attempts to populate the
2039 * reference from this using the REF_PATH environment.
2040 *
2041 * Returns 0 on sucess
2042 * -1 on failure
2043 */
cram_populate_ref(cram_fd * fd,int id,ref_entry * r)2044 static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) {
2045 char *ref_path = getenv("REF_PATH");
2046 SAM_hdr_type *ty;
2047 SAM_hdr_tag *tag;
2048 char path[PATH_MAX], path_tmp[PATH_MAX + 64];
2049 char cache[PATH_MAX], cache_root[PATH_MAX];
2050 char *local_cache = getenv("REF_CACHE");
2051 mFILE *mf;
2052 int local_path = 0;
2053
2054 hts_log_info("Running cram_populate_ref on fd %p, id %d", (void *)fd, id);
2055
2056 cache_root[0] = '\0';
2057
2058 if (!ref_path || *ref_path == '\0') {
2059 /*
2060 * If we have no ref path, we use the EBI server.
2061 * However to avoid spamming it we require a local ref cache too.
2062 */
2063 ref_path = "https://www.ebi.ac.uk/ena/cram/md5/%s";
2064 if (!local_cache || *local_cache == '\0') {
2065 const char *extra;
2066 const char *base = get_cache_basedir(&extra);
2067 snprintf(cache_root, PATH_MAX, "%s%s/hts-ref", base, extra);
2068 snprintf(cache,PATH_MAX, "%s%s/hts-ref/%%2s/%%2s/%%s", base, extra);
2069 local_cache = cache;
2070 hts_log_info("Populating local cache: %s", local_cache);
2071 }
2072 }
2073
2074 if (!r->name)
2075 return -1;
2076
2077 if (!(ty = sam_hdr_find(fd->header, "SQ", "SN", r->name)))
2078 return -1;
2079
2080 if (!(tag = sam_hdr_find_key(fd->header, ty, "M5", NULL)))
2081 goto no_M5;
2082
2083 hts_log_info("Querying ref %s", tag->str+3);
2084
2085 /* Use cache if available */
2086 if (local_cache && *local_cache) {
2087 if (expand_cache_path(path, local_cache, tag->str+3) == 0)
2088 local_path = 1;
2089 }
2090
2091 #ifndef HAVE_MMAP
2092 char *path2;
2093 /* Search local files in REF_PATH; we can open them and return as above */
2094 if (!local_path && (path2 = find_path(tag->str+3, ref_path))) {
2095 strncpy(path, path2, PATH_MAX);
2096 free(path2);
2097 if (is_file(path)) // incase it's too long
2098 local_path = 1;
2099 }
2100 #endif
2101
2102 /* Found via REF_CACHE or local REF_PATH file */
2103 if (local_path) {
2104 struct stat sb;
2105 BGZF *fp;
2106
2107 if (0 == stat(path, &sb) && (fp = bgzf_open(path, "r"))) {
2108 r->length = sb.st_size;
2109 r->offset = r->line_length = r->bases_per_line = 0;
2110
2111 r->fn = string_dup(fd->refs->pool, path);
2112
2113 if (fd->refs->fp)
2114 if (bgzf_close(fd->refs->fp) != 0)
2115 return -1;
2116 fd->refs->fp = fp;
2117 fd->refs->fn = r->fn;
2118 r->is_md5 = 1;
2119
2120 // Fall back to cram_get_ref() where it'll do the actual
2121 // reading of the file.
2122 return 0;
2123 }
2124 }
2125
2126
2127 /* Otherwise search full REF_PATH; slower as loads entire file */
2128 if ((mf = open_path_mfile(tag->str+3, ref_path, NULL))) {
2129 size_t sz;
2130 r->seq = mfsteal(mf, &sz);
2131 if (r->seq) {
2132 r->mf = NULL;
2133 } else {
2134 // keep mf around as we couldn't detach
2135 r->seq = mf->data;
2136 r->mf = mf;
2137 }
2138 r->length = sz;
2139 r->is_md5 = 1;
2140 } else {
2141 refs_t *refs;
2142 char *fn;
2143
2144 no_M5:
2145 /* Failed to find in search path or M5 cache, see if @SQ UR: tag? */
2146 if (!(tag = sam_hdr_find_key(fd->header, ty, "UR", NULL)))
2147 return -1;
2148
2149 fn = (strncmp(tag->str+3, "file:", 5) == 0)
2150 ? tag->str+8
2151 : tag->str+3;
2152
2153 if (fd->refs->fp) {
2154 if (bgzf_close(fd->refs->fp) != 0)
2155 return -1;
2156 fd->refs->fp = NULL;
2157 }
2158 if (!(refs = refs_load_fai(fd->refs, fn, 0)))
2159 return -1;
2160 sanitise_SQ_lines(fd);
2161
2162 fd->refs = refs;
2163 if (fd->refs->fp) {
2164 if (bgzf_close(fd->refs->fp) != 0)
2165 return -1;
2166 fd->refs->fp = NULL;
2167 }
2168
2169 if (!fd->refs->fn)
2170 return -1;
2171
2172 if (-1 == refs2id(fd->refs, fd->header))
2173 return -1;
2174 if (!fd->refs->ref_id || !fd->refs->ref_id[id])
2175 return -1;
2176
2177 // Local copy already, so fall back to cram_get_ref().
2178 return 0;
2179 }
2180
2181 /* Populate the local disk cache if required */
2182 if (local_cache && *local_cache) {
2183 int pid = (int) getpid();
2184 unsigned thrid = get_int_threadid();
2185 hFILE *fp;
2186
2187 if (*cache_root && !is_directory(cache_root)) {
2188 hts_log_warning("Creating reference cache directory %s\n"
2189 "This may become large; see the samtools(1) manual page REF_CACHE discussion",
2190 cache_root);
2191 }
2192
2193 if (expand_cache_path(path, local_cache, tag->str+3) < 0) {
2194 return 0; // Not fatal - we have the data already so keep going.
2195 }
2196 hts_log_info("Writing cache file '%s'", path);
2197 mkdir_prefix(path, 01777);
2198
2199 do {
2200 // Attempt to further uniquify the temporary filename
2201 unsigned t = ((unsigned) time(NULL)) ^ ((unsigned) clock());
2202 thrid++; // Ensure filename changes even if time/clock haven't
2203
2204 snprintf(path_tmp, sizeof(path_tmp), "%s.tmp_%d_%u_%u",
2205 path, pid, thrid, t);
2206 fp = hopen(path_tmp, "wx");
2207 } while (fp == NULL && errno == EEXIST);
2208 if (!fp) {
2209 perror(path_tmp);
2210
2211 // Not fatal - we have the data already so keep going.
2212 return 0;
2213 }
2214
2215 // Check md5sum
2216 hts_md5_context *md5;
2217 char unsigned md5_buf1[16];
2218 char md5_buf2[33];
2219
2220 if (!(md5 = hts_md5_init())) {
2221 hclose_abruptly(fp);
2222 unlink(path_tmp);
2223 return -1;
2224 }
2225 hts_md5_update(md5, r->seq, r->length);
2226 hts_md5_final(md5_buf1, md5);
2227 hts_md5_destroy(md5);
2228 hts_md5_hex(md5_buf2, md5_buf1);
2229
2230 if (strncmp(tag->str+3, md5_buf2, 32) != 0) {
2231 hts_log_error("Mismatching md5sum for downloaded reference");
2232 hclose_abruptly(fp);
2233 unlink(path_tmp);
2234 return -1;
2235 }
2236
2237 if (hwrite(fp, r->seq, r->length) != r->length) {
2238 perror(path);
2239 }
2240 if (hclose(fp) < 0) {
2241 unlink(path_tmp);
2242 } else {
2243 if (0 == chmod(path_tmp, 0444))
2244 rename(path_tmp, path);
2245 else
2246 unlink(path_tmp);
2247 }
2248 }
2249
2250 return 0;
2251 }
2252
cram_ref_incr_locked(refs_t * r,int id)2253 static void cram_ref_incr_locked(refs_t *r, int id) {
2254 RP("%d INC REF %d, %d %p\n", gettid(), id, (int)(id>=0?r->ref_id[id]->count+1:-999), id>=0?r->ref_id[id]->seq:(char *)1);
2255
2256 if (id < 0 || !r->ref_id[id] || !r->ref_id[id]->seq)
2257 return;
2258
2259 if (r->last_id == id)
2260 r->last_id = -1;
2261
2262 ++r->ref_id[id]->count;
2263 }
2264
cram_ref_incr(refs_t * r,int id)2265 void cram_ref_incr(refs_t *r, int id) {
2266 pthread_mutex_lock(&r->lock);
2267 cram_ref_incr_locked(r, id);
2268 pthread_mutex_unlock(&r->lock);
2269 }
2270
cram_ref_decr_locked(refs_t * r,int id)2271 static void cram_ref_decr_locked(refs_t *r, int id) {
2272 RP("%d DEC REF %d, %d %p\n", gettid(), id, (int)(id>=0?r->ref_id[id]->count-1:-999), id>=0?r->ref_id[id]->seq:(char *)1);
2273
2274 if (id < 0 || !r->ref_id[id] || !r->ref_id[id]->seq) {
2275 assert(r->ref_id[id]->count >= 0);
2276 return;
2277 }
2278
2279 if (--r->ref_id[id]->count <= 0) {
2280 assert(r->ref_id[id]->count == 0);
2281 if (r->last_id >= 0) {
2282 if (r->ref_id[r->last_id]->count <= 0 &&
2283 r->ref_id[r->last_id]->seq) {
2284 RP("%d FREE REF %d (%p)\n", gettid(),
2285 r->last_id, r->ref_id[r->last_id]->seq);
2286 ref_entry_free_seq(r->ref_id[r->last_id]);
2287 if (r->ref_id[r->last_id]->is_md5) r->ref_id[r->last_id]->length = 0;
2288 }
2289 }
2290 r->last_id = id;
2291 }
2292 }
2293
cram_ref_decr(refs_t * r,int id)2294 void cram_ref_decr(refs_t *r, int id) {
2295 pthread_mutex_lock(&r->lock);
2296 cram_ref_decr_locked(r, id);
2297 pthread_mutex_unlock(&r->lock);
2298 }
2299
2300 /*
2301 * Used by cram_ref_load and cram_ref_get. The file handle will have
2302 * already been opened, so we can catch it. The ref_entry *e informs us
2303 * of whether this is a multi-line fasta file or a raw MD5 style file.
2304 * Either way we create a single contiguous sequence.
2305 *
2306 * Returns all or part of a reference sequence on success (malloced);
2307 * NULL on failure.
2308 */
load_ref_portion(BGZF * fp,ref_entry * e,int start,int end)2309 static char *load_ref_portion(BGZF *fp, ref_entry *e, int start, int end) {
2310 off_t offset, len;
2311 char *seq;
2312
2313 if (end < start)
2314 end = start;
2315
2316 /*
2317 * Compute locations in file. This is trivial for the MD5 files, but
2318 * is still necessary for the fasta variants.
2319 */
2320 offset = e->line_length
2321 ? e->offset + (start-1)/e->bases_per_line * e->line_length +
2322 (start-1) % e->bases_per_line
2323 : start-1;
2324
2325 len = (e->line_length
2326 ? e->offset + (end-1)/e->bases_per_line * e->line_length +
2327 (end-1) % e->bases_per_line
2328 : end-1) - offset + 1;
2329
2330 if (bgzf_useek(fp, offset, SEEK_SET) < 0) {
2331 perror("bgzf_useek() on reference file");
2332 return NULL;
2333 }
2334
2335 if (len == 0 || !(seq = malloc(len))) {
2336 return NULL;
2337 }
2338
2339 if (len != bgzf_read(fp, seq, len)) {
2340 perror("bgzf_read() on reference file");
2341 free(seq);
2342 return NULL;
2343 }
2344
2345 /* Strip white-space if required. */
2346 if (len != end-start+1) {
2347 int i, j;
2348 char *cp = seq;
2349 char *cp_to;
2350
2351 for (i = j = 0; i < len; i++) {
2352 if (cp[i] >= '!' && cp[i] <= '~')
2353 cp[j++] = toupper_c(cp[i]);
2354 }
2355 cp_to = cp+j;
2356
2357 if (cp_to - seq != end-start+1) {
2358 hts_log_error("Malformed reference file");
2359 free(seq);
2360 return NULL;
2361 }
2362 } else {
2363 int i;
2364 for (i = 0; i < len; i++) {
2365 seq[i] = toupper_c(seq[i]);
2366 }
2367 }
2368
2369 return seq;
2370 }
2371
2372 /*
2373 * Load the entire reference 'id'.
2374 * This also increments the reference count by 1.
2375 *
2376 * Returns ref_entry on success;
2377 * NULL on failure
2378 */
cram_ref_load(refs_t * r,int id,int is_md5)2379 ref_entry *cram_ref_load(refs_t *r, int id, int is_md5) {
2380 ref_entry *e = r->ref_id[id];
2381 int start = 1, end = e->length;
2382 char *seq;
2383
2384 if (e->seq) {
2385 return e;
2386 }
2387
2388 assert(e->count == 0);
2389
2390 if (r->last) {
2391 #ifdef REF_DEBUG
2392 int idx = 0;
2393 for (idx = 0; idx < r->nref; idx++)
2394 if (r->last == r->ref_id[idx])
2395 break;
2396 RP("%d cram_ref_load DECR %d\n", gettid(), idx);
2397 #endif
2398 assert(r->last->count > 0);
2399 if (--r->last->count <= 0) {
2400 RP("%d FREE REF %d (%p)\n", gettid(), id, r->ref_id[id]->seq);
2401 if (r->last->seq)
2402 ref_entry_free_seq(r->last);
2403 }
2404 }
2405
2406 /* Open file if it's not already the current open reference */
2407 if (strcmp(r->fn, e->fn) || r->fp == NULL) {
2408 if (r->fp)
2409 if (bgzf_close(r->fp) != 0)
2410 return NULL;
2411 r->fn = e->fn;
2412 if (!(r->fp = bgzf_open_ref(r->fn, "r", is_md5)))
2413 return NULL;
2414 }
2415
2416 RP("%d Loading ref %d (%d..%d)\n", gettid(), id, start, end);
2417
2418 if (!(seq = load_ref_portion(r->fp, e, start, end))) {
2419 return NULL;
2420 }
2421
2422 RP("%d Loaded ref %d (%d..%d) = %p\n", gettid(), id, start, end, seq);
2423
2424 RP("%d INC REF %d, %d\n", gettid(), id, (int)(e->count+1));
2425 e->seq = seq;
2426 e->mf = NULL;
2427 e->count++;
2428
2429 /*
2430 * Also keep track of last used ref so incr/decr loops on the same
2431 * sequence don't cause load/free loops.
2432 */
2433 RP("%d cram_ref_load INCR %d => %d\n", gettid(), id, e->count+1);
2434 r->last = e;
2435 e->count++;
2436
2437 return e;
2438 }
2439
2440 /*
2441 * Returns a portion of a reference sequence from start to end inclusive.
2442 * The returned pointer is owned by either the cram_file fd or by the
2443 * internal refs_t structure and should not be freed by the caller.
2444 *
2445 * The difference is whether or not this refs_t is in use by just the one
2446 * cram_fd or by multiples, or whether we have multiple threads accessing
2447 * references. In either case fd->shared will be true and we start using
2448 * reference counting to track the number of users of a specific reference
2449 * sequence.
2450 *
2451 * Otherwise the ref seq returned is allocated as part of cram_fd itself
2452 * and will be freed up on the next call to cram_get_ref or cram_close.
2453 *
2454 * To return the entire reference sequence, specify start as 1 and end
2455 * as 0.
2456 *
2457 * To cease using a reference, call cram_ref_decr().
2458 *
2459 * Returns reference on success,
2460 * NULL on failure
2461 */
cram_get_ref(cram_fd * fd,int id,int start,int end)2462 char *cram_get_ref(cram_fd *fd, int id, int start, int end) {
2463 ref_entry *r;
2464 char *seq;
2465 int ostart = start;
2466
2467 if (id == -1)
2468 return NULL;
2469
2470 /* FIXME: axiomatic query of r->seq being true?
2471 * Or shortcut for unsorted data where we load once and never free?
2472 */
2473
2474 //fd->shared_ref = 1; // hard code for now to simplify things
2475
2476 pthread_mutex_lock(&fd->ref_lock);
2477
2478 RP("%d cram_get_ref on fd %p, id %d, range %d..%d\n", gettid(), fd, id, start, end);
2479
2480 /*
2481 * Unsorted data implies we want to fetch an entire reference at a time.
2482 * We just deal with this at the moment by claiming we're sharing
2483 * references instead, which has the same requirement.
2484 */
2485 if (fd->unsorted)
2486 fd->shared_ref = 1;
2487
2488
2489 /* Sanity checking: does this ID exist? */
2490 if (id >= fd->refs->nref) {
2491 hts_log_error("No reference found for id %d", id);
2492 pthread_mutex_unlock(&fd->ref_lock);
2493 return NULL;
2494 }
2495
2496 if (!fd->refs || !fd->refs->ref_id[id]) {
2497 hts_log_error("No reference found for id %d", id);
2498 pthread_mutex_unlock(&fd->ref_lock);
2499 return NULL;
2500 }
2501
2502 if (!(r = fd->refs->ref_id[id])) {
2503 hts_log_error("No reference found for id %d", id);
2504 pthread_mutex_unlock(&fd->ref_lock);
2505 return NULL;
2506 }
2507
2508
2509 /*
2510 * It has an entry, but may not have been populated yet.
2511 * Any manually loaded .fai files have their lengths known.
2512 * A ref entry computed from @SQ lines (M5 or UR field) will have
2513 * r->length == 0 unless it's been loaded once and verified that we have
2514 * an on-disk filename for it.
2515 *
2516 * 19 Sep 2013: Moved the lock here as the cram_populate_ref code calls
2517 * open_path_mfile and libcurl, which isn't multi-thread safe unless I
2518 * rewrite my code to have one curl handle per thread.
2519 */
2520 pthread_mutex_lock(&fd->refs->lock);
2521 if (r->length == 0) {
2522 if (cram_populate_ref(fd, id, r) == -1) {
2523 hts_log_error("Failed to populate reference for id %d", id);
2524 pthread_mutex_unlock(&fd->refs->lock);
2525 pthread_mutex_unlock(&fd->ref_lock);
2526 return NULL;
2527 }
2528 r = fd->refs->ref_id[id];
2529 if (fd->unsorted)
2530 cram_ref_incr_locked(fd->refs, id);
2531 }
2532
2533
2534 /*
2535 * We now know that we the filename containing the reference, so check
2536 * for limits. If it's over half the reference we'll load all of it in
2537 * memory as this will speed up subsequent calls.
2538 */
2539 if (end < 1)
2540 end = r->length;
2541 if (end >= r->length)
2542 end = r->length;
2543 if (start < 1)
2544 return NULL;
2545
2546 if (end - start >= 0.5*r->length || fd->shared_ref) {
2547 start = 1;
2548 end = r->length;
2549 }
2550
2551 /*
2552 * Maybe we have it cached already? If so use it.
2553 *
2554 * Alternatively if we don't have the sequence but we're sharing
2555 * references and/or are asking for the entire length of it, then
2556 * load the full reference into the refs structure and return
2557 * a pointer to that one instead.
2558 */
2559 if (fd->shared_ref || r->seq || (start == 1 && end == r->length)) {
2560 char *cp;
2561
2562 if (id >= 0) {
2563 if (r->seq) {
2564 cram_ref_incr_locked(fd->refs, id);
2565 } else {
2566 ref_entry *e;
2567 if (!(e = cram_ref_load(fd->refs, id, r->is_md5))) {
2568 pthread_mutex_unlock(&fd->refs->lock);
2569 pthread_mutex_unlock(&fd->ref_lock);
2570 return NULL;
2571 }
2572
2573 /* unsorted data implies cache ref indefinitely, to avoid
2574 * continually loading and unloading.
2575 */
2576 if (fd->unsorted)
2577 cram_ref_incr_locked(fd->refs, id);
2578 }
2579
2580 fd->ref = NULL; /* We never access it directly */
2581 fd->ref_start = 1;
2582 fd->ref_end = r->length;
2583 fd->ref_id = id;
2584
2585 cp = fd->refs->ref_id[id]->seq + ostart-1;
2586 } else {
2587 fd->ref = NULL;
2588 cp = NULL;
2589 }
2590
2591 RP("%d cram_get_ref returning for id %d, count %d\n", gettid(), id, (int)r->count);
2592
2593 pthread_mutex_unlock(&fd->refs->lock);
2594 pthread_mutex_unlock(&fd->ref_lock);
2595 return cp;
2596 }
2597
2598 /*
2599 * Otherwise we're not sharing, we don't have a copy of it already and
2600 * we're only asking for a small portion of it.
2601 *
2602 * In this case load up just that segment ourselves, freeing any old
2603 * small segments in the process.
2604 */
2605
2606 /* Unmapped ref ID */
2607 if (id < 0) {
2608 if (fd->ref_free) {
2609 free(fd->ref_free);
2610 fd->ref_free = NULL;
2611 }
2612 fd->ref = NULL;
2613 fd->ref_id = id;
2614 pthread_mutex_unlock(&fd->refs->lock);
2615 pthread_mutex_unlock(&fd->ref_lock);
2616 return NULL;
2617 }
2618
2619 /* Open file if it's not already the current open reference */
2620 if (strcmp(fd->refs->fn, r->fn) || fd->refs->fp == NULL) {
2621 if (fd->refs->fp)
2622 if (bgzf_close(fd->refs->fp) != 0)
2623 return NULL;
2624 fd->refs->fn = r->fn;
2625 if (!(fd->refs->fp = bgzf_open_ref(fd->refs->fn, "r", r->is_md5))) {
2626 pthread_mutex_unlock(&fd->refs->lock);
2627 pthread_mutex_unlock(&fd->ref_lock);
2628 return NULL;
2629 }
2630 }
2631
2632 if (!(fd->ref = load_ref_portion(fd->refs->fp, r, start, end))) {
2633 pthread_mutex_unlock(&fd->refs->lock);
2634 pthread_mutex_unlock(&fd->ref_lock);
2635 return NULL;
2636 }
2637
2638 if (fd->ref_free)
2639 free(fd->ref_free);
2640
2641 fd->ref_id = id;
2642 fd->ref_start = start;
2643 fd->ref_end = end;
2644 fd->ref_free = fd->ref;
2645 seq = fd->ref;
2646
2647 pthread_mutex_unlock(&fd->refs->lock);
2648 pthread_mutex_unlock(&fd->ref_lock);
2649
2650 return seq + ostart - start;
2651 }
2652
2653 /*
2654 * If fd has been opened for reading, it may be permitted to specify 'fn'
2655 * as NULL and let the code auto-detect the reference by parsing the
2656 * SAM header @SQ lines.
2657 */
cram_load_reference(cram_fd * fd,char * fn)2658 int cram_load_reference(cram_fd *fd, char *fn) {
2659 int ret = 0;
2660
2661 if (fn) {
2662 fd->refs = refs_load_fai(fd->refs, fn,
2663 !(fd->embed_ref && fd->mode == 'r'));
2664 fn = fd->refs ? fd->refs->fn : NULL;
2665 if (!fn)
2666 ret = -1;
2667 sanitise_SQ_lines(fd);
2668 }
2669 fd->ref_fn = fn;
2670
2671 if ((!fd->refs || (fd->refs->nref == 0 && !fn)) && fd->header) {
2672 if (fd->refs)
2673 refs_free(fd->refs);
2674 if (!(fd->refs = refs_create()))
2675 return -1;
2676 if (-1 == refs_from_header(fd->refs, fd, fd->header))
2677 return -1;
2678 }
2679
2680 if (fd->header)
2681 if (-1 == refs2id(fd->refs, fd->header))
2682 return -1;
2683
2684 return ret;
2685 }
2686
2687 /* ----------------------------------------------------------------------
2688 * Containers
2689 */
2690
2691 /*
2692 * Creates a new container, specifying the maximum number of slices
2693 * and records permitted.
2694 *
2695 * Returns cram_container ptr on success
2696 * NULL on failure
2697 */
cram_new_container(int nrec,int nslice)2698 cram_container *cram_new_container(int nrec, int nslice) {
2699 cram_container *c = calloc(1, sizeof(*c));
2700 enum cram_DS_ID id;
2701
2702 if (!c)
2703 return NULL;
2704
2705 c->curr_ref = -2;
2706
2707 c->max_c_rec = nrec * nslice;
2708 c->curr_c_rec = 0;
2709
2710 c->max_rec = nrec;
2711 c->record_counter = 0;
2712 c->num_bases = 0;
2713 c->s_num_bases = 0;
2714
2715 c->max_slice = nslice;
2716 c->curr_slice = 0;
2717
2718 c->pos_sorted = 1;
2719 c->max_apos = 0;
2720 c->multi_seq = 0;
2721
2722 c->bams = NULL;
2723
2724 if (!(c->slices = (cram_slice **)calloc(nslice, sizeof(cram_slice *))))
2725 goto err;
2726 c->slice = NULL;
2727
2728 if (!(c->comp_hdr = cram_new_compression_header()))
2729 goto err;
2730 c->comp_hdr_block = NULL;
2731
2732 for (id = DS_RN; id < DS_TN; id++)
2733 if (!(c->stats[id] = cram_stats_create())) goto err;
2734
2735 //c->aux_B_stats = cram_stats_create();
2736
2737 if (!(c->tags_used = kh_init(m_tagmap)))
2738 goto err;
2739 c->refs_used = 0;
2740
2741 return c;
2742
2743 err:
2744 if (c) {
2745 if (c->slices)
2746 free(c->slices);
2747 free(c);
2748 }
2749 return NULL;
2750 }
2751
cram_free_container(cram_container * c)2752 void cram_free_container(cram_container *c) {
2753 enum cram_DS_ID id;
2754 int i;
2755
2756 if (!c)
2757 return;
2758
2759 if (c->refs_used)
2760 free(c->refs_used);
2761
2762 if (c->landmark)
2763 free(c->landmark);
2764
2765 if (c->comp_hdr)
2766 cram_free_compression_header(c->comp_hdr);
2767
2768 if (c->comp_hdr_block)
2769 cram_free_block(c->comp_hdr_block);
2770
2771 // Free the slices; filled out by encoder only
2772 if (c->slices) {
2773 for (i = 0; i < c->max_slice; i++) {
2774 if (c->slices[i])
2775 cram_free_slice(c->slices[i]);
2776 if (c->slices[i] == c->slice)
2777 c->slice = NULL;
2778 }
2779 free(c->slices);
2780 }
2781
2782 // Free the current slice; set by both encoder & decoder
2783 if (c->slice) {
2784 cram_free_slice(c->slice);
2785 c->slice = NULL;
2786 }
2787
2788 for (id = DS_RN; id < DS_TN; id++)
2789 if (c->stats[id]) cram_stats_free(c->stats[id]);
2790
2791 //if (c->aux_B_stats) cram_stats_free(c->aux_B_stats);
2792
2793 if (c->tags_used) {
2794 khint_t k;
2795
2796 for (k = kh_begin(c->tags_used); k != kh_end(c->tags_used); k++) {
2797 if (!kh_exist(c->tags_used, k))
2798 continue;
2799
2800 cram_tag_map *tm = (cram_tag_map *)kh_val(c->tags_used, k);
2801 cram_codec *c = tm->codec;
2802
2803 if (c) c->free(c);
2804 free(tm);
2805 }
2806
2807 kh_destroy(m_tagmap, c->tags_used);
2808 }
2809
2810 free(c);
2811 }
2812
2813 /*
2814 * Reads a container header.
2815 *
2816 * Returns cram_container on success
2817 * NULL on failure or no container left (fd->err == 0).
2818 */
cram_read_container(cram_fd * fd)2819 cram_container *cram_read_container(cram_fd *fd) {
2820 cram_container c2, *c;
2821 int i, s;
2822 size_t rd = 0;
2823 uint32_t crc = 0;
2824
2825 fd->err = 0;
2826 fd->eof = 0;
2827
2828 memset(&c2, 0, sizeof(c2));
2829 if (CRAM_MAJOR_VERS(fd->version) == 1) {
2830 if ((s = itf8_decode_crc(fd, &c2.length, &crc)) == -1) {
2831 fd->eof = fd->empty_container ? 1 : 2;
2832 return NULL;
2833 } else {
2834 rd+=s;
2835 }
2836 } else {
2837 uint32_t len;
2838 if ((s = int32_decode(fd, &c2.length)) == -1) {
2839 if (CRAM_MAJOR_VERS(fd->version) == 2 &&
2840 CRAM_MINOR_VERS(fd->version) == 0)
2841 fd->eof = 1; // EOF blocks arrived in v2.1
2842 else
2843 fd->eof = fd->empty_container ? 1 : 2;
2844 return NULL;
2845 } else {
2846 rd+=s;
2847 }
2848 len = le_int4(c2.length);
2849 crc = crc32(0L, (unsigned char *)&len, 4);
2850 }
2851 if ((s = itf8_decode_crc(fd, &c2.ref_seq_id, &crc)) == -1) return NULL; else rd+=s;
2852 if ((s = itf8_decode_crc(fd, &c2.ref_seq_start, &crc))== -1) return NULL; else rd+=s;
2853 if ((s = itf8_decode_crc(fd, &c2.ref_seq_span, &crc)) == -1) return NULL; else rd+=s;
2854 if ((s = itf8_decode_crc(fd, &c2.num_records, &crc)) == -1) return NULL; else rd+=s;
2855
2856 if (CRAM_MAJOR_VERS(fd->version) == 1) {
2857 c2.record_counter = 0;
2858 c2.num_bases = 0;
2859 } else {
2860 if (CRAM_MAJOR_VERS(fd->version) >= 3) {
2861 if ((s = ltf8_decode_crc(fd, &c2.record_counter, &crc)) == -1)
2862 return NULL;
2863 else
2864 rd += s;
2865 } else {
2866 int32_t i32;
2867 if ((s = itf8_decode_crc(fd, &i32, &crc)) == -1)
2868 return NULL;
2869 else
2870 rd += s;
2871 c2.record_counter = i32;
2872 }
2873
2874 if ((s = ltf8_decode_crc(fd, &c2.num_bases, &crc))== -1)
2875 return NULL;
2876 else
2877 rd += s;
2878 }
2879 if ((s = itf8_decode_crc(fd, &c2.num_blocks, &crc)) == -1) return NULL; else rd+=s;
2880 if ((s = itf8_decode_crc(fd, &c2.num_landmarks, &crc))== -1) return NULL; else rd+=s;
2881
2882 if (c2.num_landmarks < 0 || c2.num_landmarks >= SIZE_MAX / sizeof(int32_t))
2883 return NULL;
2884
2885 if (!(c = calloc(1, sizeof(*c))))
2886 return NULL;
2887
2888 *c = c2;
2889
2890 if (!(c->landmark = malloc(c->num_landmarks * sizeof(int32_t))) &&
2891 c->num_landmarks) {
2892 fd->err = errno;
2893 cram_free_container(c);
2894 return NULL;
2895 }
2896 for (i = 0; i < c->num_landmarks; i++) {
2897 if ((s = itf8_decode_crc(fd, &c->landmark[i], &crc)) == -1) {
2898 cram_free_container(c);
2899 return NULL;
2900 } else {
2901 rd += s;
2902 }
2903 }
2904
2905 if (CRAM_MAJOR_VERS(fd->version) >= 3) {
2906 if (-1 == int32_decode(fd, (int32_t *)&c->crc32))
2907 return NULL;
2908 else
2909 rd+=4;
2910
2911 if (crc != c->crc32) {
2912 hts_log_error("Container header CRC32 failure");
2913 cram_free_container(c);
2914 return NULL;
2915 }
2916 }
2917
2918 c->offset = rd;
2919 c->slices = NULL;
2920 c->slice = NULL;
2921 c->curr_slice = 0;
2922 c->max_slice = c->num_landmarks;
2923 c->slice_rec = 0;
2924 c->curr_rec = 0;
2925 c->max_rec = 0;
2926
2927 if (c->ref_seq_id == -2) {
2928 c->multi_seq = 1;
2929 fd->multi_seq = 1;
2930 }
2931
2932 fd->empty_container =
2933 (c->num_records == 0 &&
2934 c->ref_seq_id == -1 &&
2935 c->ref_seq_start == 0x454f46 /* EOF */) ? 1 : 0;
2936
2937 return c;
2938 }
2939
2940
2941 /* MAXIMUM storage size needed for the container. */
cram_container_size(cram_container * c)2942 int cram_container_size(cram_container *c) {
2943 return 55 + 5*c->num_landmarks;
2944 }
2945
2946
2947 /*
2948 * Stores the container structure in dat and returns *size as the
2949 * number of bytes written to dat[]. The input size of dat is also
2950 * held in *size and should be initialised to cram_container_size(c).
2951 *
2952 * Returns 0 on success;
2953 * -1 on failure
2954 */
cram_store_container(cram_fd * fd,cram_container * c,char * dat,int * size)2955 int cram_store_container(cram_fd *fd, cram_container *c, char *dat, int *size)
2956 {
2957 unsigned char *cp = (unsigned char *)dat;
2958 int i;
2959
2960 // Check the input buffer is large enough according to our stated
2961 // requirements. (NOTE: it may actually take less.)
2962 if (cram_container_size(c) > *size)
2963 return -1;
2964
2965 if (CRAM_MAJOR_VERS(fd->version) == 1) {
2966 cp += itf8_put((char*)cp, c->length);
2967 } else {
2968 *(int32_t *)cp = le_int4(c->length);
2969 cp += 4;
2970 }
2971 if (c->multi_seq) {
2972 cp += itf8_put((char*)cp, -2);
2973 cp += itf8_put((char*)cp, 0);
2974 cp += itf8_put((char*)cp, 0);
2975 } else {
2976 cp += itf8_put((char*)cp, c->ref_seq_id);
2977 cp += itf8_put((char*)cp, c->ref_seq_start);
2978 cp += itf8_put((char*)cp, c->ref_seq_span);
2979 }
2980 cp += itf8_put((char*)cp, c->num_records);
2981 if (CRAM_MAJOR_VERS(fd->version) == 2) {
2982 cp += itf8_put((char*)cp, c->record_counter);
2983 cp += ltf8_put((char*)cp, c->num_bases);
2984 } else if (CRAM_MAJOR_VERS(fd->version) >= 3) {
2985 cp += ltf8_put((char*)cp, c->record_counter);
2986 cp += ltf8_put((char*)cp, c->num_bases);
2987 }
2988
2989 cp += itf8_put((char*)cp, c->num_blocks);
2990 cp += itf8_put((char*)cp, c->num_landmarks);
2991 for (i = 0; i < c->num_landmarks; i++)
2992 cp += itf8_put((char*)cp, c->landmark[i]);
2993
2994 if (CRAM_MAJOR_VERS(fd->version) >= 3) {
2995 c->crc32 = crc32(0L, (uc *)dat, (char*)cp-dat);
2996 cp[0] = c->crc32 & 0xff;
2997 cp[1] = (c->crc32 >> 8) & 0xff;
2998 cp[2] = (c->crc32 >> 16) & 0xff;
2999 cp[3] = (c->crc32 >> 24) & 0xff;
3000 cp += 4;
3001 }
3002
3003 *size = (char *)cp-dat; // actual used size
3004
3005 return 0;
3006 }
3007
3008
3009 /*
3010 * Writes a container structure.
3011 *
3012 * Returns 0 on success
3013 * -1 on failure
3014 */
cram_write_container(cram_fd * fd,cram_container * c)3015 int cram_write_container(cram_fd *fd, cram_container *c) {
3016 char buf_a[1024], *buf = buf_a;
3017 unsigned char *cp;
3018 int i;
3019
3020 if (55 + c->num_landmarks * 5 >= 1024)
3021 buf = malloc(55 + c->num_landmarks * 5);
3022 cp = (unsigned char *)buf;
3023
3024 if (CRAM_MAJOR_VERS(fd->version) == 1) {
3025 cp += itf8_put((char*)cp, c->length);
3026 } else {
3027 *(int32_t *)cp = le_int4(c->length);
3028 cp += 4;
3029 }
3030 if (c->multi_seq) {
3031 cp += itf8_put((char*)cp, -2);
3032 cp += itf8_put((char*)cp, 0);
3033 cp += itf8_put((char*)cp, 0);
3034 } else {
3035 cp += itf8_put((char*)cp, c->ref_seq_id);
3036 cp += itf8_put((char*)cp, c->ref_seq_start);
3037 cp += itf8_put((char*)cp, c->ref_seq_span);
3038 }
3039 cp += itf8_put((char*)cp, c->num_records);
3040 if (CRAM_MAJOR_VERS(fd->version) == 2) {
3041 cp += itf8_put((char*)cp, c->record_counter);
3042 cp += ltf8_put((char*)cp, c->num_bases);
3043 } else if (CRAM_MAJOR_VERS(fd->version) >= 3) {
3044 cp += ltf8_put((char*)cp, c->record_counter);
3045 cp += ltf8_put((char*)cp, c->num_bases);
3046 }
3047
3048 cp += itf8_put((char*)cp, c->num_blocks);
3049 cp += itf8_put((char*)cp, c->num_landmarks);
3050 for (i = 0; i < c->num_landmarks; i++)
3051 cp += itf8_put((char*)cp, c->landmark[i]);
3052
3053 if (CRAM_MAJOR_VERS(fd->version) >= 3) {
3054 c->crc32 = crc32(0L, (uc *)buf, (char*)cp-buf);
3055 cp[0] = c->crc32 & 0xff;
3056 cp[1] = (c->crc32 >> 8) & 0xff;
3057 cp[2] = (c->crc32 >> 16) & 0xff;
3058 cp[3] = (c->crc32 >> 24) & 0xff;
3059 cp += 4;
3060 }
3061
3062 if ((char*)cp-buf != hwrite(fd->fp, buf, (char*)cp-buf)) {
3063 if (buf != buf_a)
3064 free(buf);
3065 return -1;
3066 }
3067
3068 if (buf != buf_a)
3069 free(buf);
3070
3071 return 0;
3072 }
3073
3074 // common component shared by cram_flush_container{,_mt}
cram_flush_container2(cram_fd * fd,cram_container * c)3075 static int cram_flush_container2(cram_fd *fd, cram_container *c) {
3076 int i, j;
3077
3078 if (c->curr_slice > 0 && !c->slices)
3079 return -1;
3080
3081 //fprintf(stderr, "Writing container %d, sum %u\n", c->record_counter, sum);
3082
3083 /* Write the container struct itself */
3084 if (0 != cram_write_container(fd, c))
3085 return -1;
3086
3087 /* And the compression header */
3088 if (0 != cram_write_block(fd, c->comp_hdr_block))
3089 return -1;
3090
3091 /* Followed by the slice blocks */
3092 for (i = 0; i < c->curr_slice; i++) {
3093 cram_slice *s = c->slices[i];
3094
3095 if (0 != cram_write_block(fd, s->hdr_block))
3096 return -1;
3097
3098 for (j = 0; j < s->hdr->num_blocks; j++) {
3099 if (0 != cram_write_block(fd, s->block[j]))
3100 return -1;
3101 }
3102 }
3103
3104 return hflush(fd->fp) == 0 ? 0 : -1;
3105 }
3106
3107 /*
3108 * Flushes a completely or partially full container to disk, writing
3109 * container structure, header and blocks. This also calls the encoder
3110 * functions.
3111 *
3112 * Returns 0 on success
3113 * -1 on failure
3114 */
cram_flush_container(cram_fd * fd,cram_container * c)3115 int cram_flush_container(cram_fd *fd, cram_container *c) {
3116 /* Encode the container blocks and generate compression header */
3117 if (0 != cram_encode_container(fd, c))
3118 return -1;
3119
3120 return cram_flush_container2(fd, c);
3121 }
3122
3123 typedef struct {
3124 cram_fd *fd;
3125 cram_container *c;
3126 } cram_job;
3127
cram_flush_thread(void * arg)3128 void *cram_flush_thread(void *arg) {
3129 cram_job *j = (cram_job *)arg;
3130
3131 /* Encode the container blocks and generate compression header */
3132 if (0 != cram_encode_container(j->fd, j->c)) {
3133 hts_log_error("Call to cram_encode_container failed");
3134 return NULL;
3135 }
3136
3137 return arg;
3138 }
3139
cram_flush_result(cram_fd * fd)3140 static int cram_flush_result(cram_fd *fd) {
3141 int i, ret = 0;
3142 hts_tpool_result *r;
3143 cram_container *lc = NULL;
3144
3145 // NB: we can have one result per slice, not per container,
3146 // so we need to free the container only after all slices
3147 // within it have been freed. (Automatic via reference counting.)
3148 while ((r = hts_tpool_next_result(fd->rqueue))) {
3149 cram_job *j = (cram_job *)hts_tpool_result_data(r);
3150 cram_container *c;
3151
3152 if (!j) {
3153 hts_tpool_delete_result(r, 0);
3154 return -1;
3155 }
3156
3157 fd = j->fd;
3158 c = j->c;
3159
3160 if (fd->mode == 'w')
3161 if (0 != cram_flush_container2(fd, c))
3162 return -1;
3163
3164 // Free the slices; filled out by encoder only
3165 if (c->slices) {
3166 for (i = 0; i < c->max_slice; i++) {
3167 if (c->slices[i])
3168 cram_free_slice(c->slices[i]);
3169 if (c->slices[i] == c->slice)
3170 c->slice = NULL;
3171 c->slices[i] = NULL;
3172 }
3173 }
3174
3175 // Free the current slice; set by both encoder & decoder
3176 if (c->slice) {
3177 cram_free_slice(c->slice);
3178 c->slice = NULL;
3179 }
3180 c->curr_slice = 0;
3181
3182 // Our jobs will be in order, so we free the last
3183 // container when our job has switched to a new one.
3184 if (c != lc) {
3185 if (lc) {
3186 if (fd->ctr == lc)
3187 fd->ctr = NULL;
3188 if (fd->ctr_mt == lc)
3189 fd->ctr_mt = NULL;
3190 cram_free_container(lc);
3191 }
3192 lc = c;
3193 }
3194
3195 if (fd->mode == 'w')
3196 ret |= hflush(fd->fp) == 0 ? 0 : -1;
3197
3198 hts_tpool_delete_result(r, 1);
3199 }
3200 if (lc) {
3201 if (fd->ctr == lc)
3202 fd->ctr = NULL;
3203 if (fd->ctr_mt == lc)
3204 fd->ctr_mt = NULL;
3205 cram_free_container(lc);
3206 }
3207
3208 return ret;
3209 }
3210
cram_flush_container_mt(cram_fd * fd,cram_container * c)3211 int cram_flush_container_mt(cram_fd *fd, cram_container *c) {
3212 cram_job *j;
3213
3214 if (!fd->pool)
3215 return cram_flush_container(fd, c);
3216
3217 if (!(j = malloc(sizeof(*j))))
3218 return -1;
3219 j->fd = fd;
3220 j->c = c;
3221
3222 // Flush the job. Note our encoder queue may be full, so we
3223 // either have to keep trying in non-blocking mode (what we do) or
3224 // use a dedicated separate thread for draining the queue.
3225 for (;;) {
3226 errno = 0;
3227 hts_tpool_dispatch2(fd->pool, fd->rqueue, cram_flush_thread, j, 1);
3228 int pending = (errno == EAGAIN);
3229 if (cram_flush_result(fd) != 0)
3230 return -1;
3231 if (!pending)
3232 break;
3233
3234 usleep(1000);
3235 }
3236
3237 return 0;
3238 }
3239
3240 /* ----------------------------------------------------------------------
3241 * Compression headers; the first part of the container
3242 */
3243
3244 /*
3245 * Creates a new blank container compression header
3246 *
3247 * Returns header ptr on success
3248 * NULL on failure
3249 */
cram_new_compression_header(void)3250 cram_block_compression_hdr *cram_new_compression_header(void) {
3251 cram_block_compression_hdr *hdr = calloc(1, sizeof(*hdr));
3252 if (!hdr)
3253 return NULL;
3254
3255 if (!(hdr->TD_blk = cram_new_block(CORE, 0))) {
3256 free(hdr);
3257 return NULL;
3258 }
3259
3260 if (!(hdr->TD_hash = kh_init(m_s2i))) {
3261 cram_free_block(hdr->TD_blk);
3262 free(hdr);
3263 return NULL;
3264 }
3265
3266 if (!(hdr->TD_keys = string_pool_create(8192))) {
3267 kh_destroy(m_s2i, hdr->TD_hash);
3268 cram_free_block(hdr->TD_blk);
3269 free(hdr);
3270 return NULL;
3271 }
3272
3273 return hdr;
3274 }
3275
cram_free_compression_header(cram_block_compression_hdr * hdr)3276 void cram_free_compression_header(cram_block_compression_hdr *hdr) {
3277 int i;
3278
3279 if (hdr->landmark)
3280 free(hdr->landmark);
3281
3282 if (hdr->preservation_map)
3283 kh_destroy(map, hdr->preservation_map);
3284
3285 for (i = 0; i < CRAM_MAP_HASH; i++) {
3286 cram_map *m, *m2;
3287 for (m = hdr->rec_encoding_map[i]; m; m = m2) {
3288 m2 = m->next;
3289 if (m->codec)
3290 m->codec->free(m->codec);
3291 free(m);
3292 }
3293 }
3294
3295 for (i = 0; i < CRAM_MAP_HASH; i++) {
3296 cram_map *m, *m2;
3297 for (m = hdr->tag_encoding_map[i]; m; m = m2) {
3298 m2 = m->next;
3299 if (m->codec)
3300 m->codec->free(m->codec);
3301 free(m);
3302 }
3303 }
3304
3305 for (i = 0; i < DS_END; i++) {
3306 if (hdr->codecs[i])
3307 hdr->codecs[i]->free(hdr->codecs[i]);
3308 }
3309
3310 if (hdr->TL)
3311 free(hdr->TL);
3312 if (hdr->TD_blk)
3313 cram_free_block(hdr->TD_blk);
3314 if (hdr->TD_hash)
3315 kh_destroy(m_s2i, hdr->TD_hash);
3316 if (hdr->TD_keys)
3317 string_pool_destroy(hdr->TD_keys);
3318
3319 free(hdr);
3320 }
3321
3322
3323 /* ----------------------------------------------------------------------
3324 * Slices and slice headers
3325 */
3326
cram_free_slice_header(cram_block_slice_hdr * hdr)3327 void cram_free_slice_header(cram_block_slice_hdr *hdr) {
3328 if (!hdr)
3329 return;
3330
3331 if (hdr->block_content_ids)
3332 free(hdr->block_content_ids);
3333
3334 free(hdr);
3335
3336 return;
3337 }
3338
cram_free_slice(cram_slice * s)3339 void cram_free_slice(cram_slice *s) {
3340 if (!s)
3341 return;
3342
3343 if (s->hdr_block)
3344 cram_free_block(s->hdr_block);
3345
3346 if (s->block) {
3347 int i;
3348
3349 if (s->hdr) {
3350 for (i = 0; i < s->hdr->num_blocks; i++) {
3351 cram_free_block(s->block[i]);
3352 }
3353 }
3354 free(s->block);
3355 }
3356
3357 if (s->block_by_id)
3358 free(s->block_by_id);
3359
3360 if (s->hdr)
3361 cram_free_slice_header(s->hdr);
3362
3363 if (s->seqs_blk)
3364 cram_free_block(s->seqs_blk);
3365
3366 if (s->qual_blk)
3367 cram_free_block(s->qual_blk);
3368
3369 if (s->name_blk)
3370 cram_free_block(s->name_blk);
3371
3372 if (s->aux_blk)
3373 cram_free_block(s->aux_blk);
3374
3375 if (s->base_blk)
3376 cram_free_block(s->base_blk);
3377
3378 if (s->soft_blk)
3379 cram_free_block(s->soft_blk);
3380
3381 if (s->cigar)
3382 free(s->cigar);
3383
3384 if (s->crecs)
3385 free(s->crecs);
3386
3387 if (s->features)
3388 free(s->features);
3389
3390 if (s->TN)
3391 free(s->TN);
3392
3393 if (s->pair_keys)
3394 string_pool_destroy(s->pair_keys);
3395
3396 if (s->pair[0])
3397 kh_destroy(m_s2i, s->pair[0]);
3398 if (s->pair[1])
3399 kh_destroy(m_s2i, s->pair[1]);
3400
3401 if (s->aux_block)
3402 free(s->aux_block);
3403
3404 free(s);
3405 }
3406
3407 /*
3408 * Creates a new empty slice in memory, for subsequent writing to
3409 * disk.
3410 *
3411 * Returns cram_slice ptr on success
3412 * NULL on failure
3413 */
cram_new_slice(enum cram_content_type type,int nrecs)3414 cram_slice *cram_new_slice(enum cram_content_type type, int nrecs) {
3415 cram_slice *s = calloc(1, sizeof(*s));
3416 if (!s)
3417 return NULL;
3418
3419 if (!(s->hdr = (cram_block_slice_hdr *)calloc(1, sizeof(*s->hdr))))
3420 goto err;
3421 s->hdr->content_type = type;
3422
3423 s->hdr_block = NULL;
3424 s->block = NULL;
3425 s->block_by_id = NULL;
3426 s->last_apos = 0;
3427 if (!(s->crecs = malloc(nrecs * sizeof(cram_record)))) goto err;
3428 s->cigar = NULL;
3429 s->cigar_alloc = 0;
3430 s->ncigar = 0;
3431
3432 if (!(s->seqs_blk = cram_new_block(EXTERNAL, 0))) goto err;
3433 if (!(s->qual_blk = cram_new_block(EXTERNAL, DS_QS))) goto err;
3434 if (!(s->name_blk = cram_new_block(EXTERNAL, DS_RN))) goto err;
3435 if (!(s->aux_blk = cram_new_block(EXTERNAL, DS_aux))) goto err;
3436 if (!(s->base_blk = cram_new_block(EXTERNAL, DS_IN))) goto err;
3437 if (!(s->soft_blk = cram_new_block(EXTERNAL, DS_SC))) goto err;
3438
3439 s->features = NULL;
3440 s->nfeatures = s->afeatures = 0;
3441
3442 #ifndef TN_external
3443 s->TN = NULL;
3444 s->nTN = s->aTN = 0;
3445 #endif
3446
3447 // Volatile keys as we do realloc in dstring
3448 if (!(s->pair_keys = string_pool_create(8192))) goto err;
3449 if (!(s->pair[0] = kh_init(m_s2i))) goto err;
3450 if (!(s->pair[1] = kh_init(m_s2i))) goto err;
3451
3452 #ifdef BA_external
3453 s->BA_len = 0;
3454 #endif
3455
3456 return s;
3457
3458 err:
3459 if (s)
3460 cram_free_slice(s);
3461
3462 return NULL;
3463 }
3464
3465 /*
3466 * Loads an entire slice.
3467 * FIXME: In 1.0 the native unit of slices within CRAM is broken
3468 * as slices contain references to objects in other slices.
3469 * To work around this while keeping the slice oriented outer loop
3470 * we read all slices and stitch them together into a fake large
3471 * slice instead.
3472 *
3473 * Returns cram_slice ptr on success
3474 * NULL on failure
3475 */
cram_read_slice(cram_fd * fd)3476 cram_slice *cram_read_slice(cram_fd *fd) {
3477 cram_block *b = cram_read_block(fd);
3478 cram_slice *s = calloc(1, sizeof(*s));
3479 int i, n, max_id, min_id;
3480
3481 if (!b || !s)
3482 goto err;
3483
3484 s->hdr_block = b;
3485 switch (b->content_type) {
3486 case MAPPED_SLICE:
3487 case UNMAPPED_SLICE:
3488 if (!(s->hdr = cram_decode_slice_header(fd, b)))
3489 goto err;
3490 break;
3491
3492 default:
3493 hts_log_error("Unexpected block of type %s",
3494 cram_content_type2str(b->content_type));
3495 goto err;
3496 }
3497
3498 if (s->hdr->num_blocks < 1) {
3499 hts_log_error("Slice does not include any data blocks");
3500 goto err;
3501 }
3502
3503 s->block = calloc(n = s->hdr->num_blocks, sizeof(*s->block));
3504 if (!s->block)
3505 goto err;
3506
3507 for (max_id = i = 0, min_id = INT_MAX; i < n; i++) {
3508 if (!(s->block[i] = cram_read_block(fd)))
3509 goto err;
3510
3511 if (s->block[i]->content_type == EXTERNAL) {
3512 if (max_id < s->block[i]->content_id)
3513 max_id = s->block[i]->content_id;
3514 if (min_id > s->block[i]->content_id)
3515 min_id = s->block[i]->content_id;
3516 }
3517 }
3518
3519 if (!(s->block_by_id = calloc(512, sizeof(s->block[0]))))
3520 goto err;
3521
3522 for (i = 0; i < n; i++) {
3523 if (s->block[i]->content_type != EXTERNAL)
3524 continue;
3525 int v = s->block[i]->content_id;
3526 if (v < 0 || v >= 256)
3527 v = 256 + (v > 0 ? v % 251 : (-v) % 251);
3528 s->block_by_id[v] = s->block[i];
3529 }
3530
3531 /* Initialise encoding/decoding tables */
3532 s->cigar = NULL;
3533 s->cigar_alloc = 0;
3534 s->ncigar = 0;
3535
3536 if (!(s->seqs_blk = cram_new_block(EXTERNAL, 0))) goto err;
3537 if (!(s->qual_blk = cram_new_block(EXTERNAL, DS_QS))) goto err;
3538 if (!(s->name_blk = cram_new_block(EXTERNAL, DS_RN))) goto err;
3539 if (!(s->aux_blk = cram_new_block(EXTERNAL, DS_aux))) goto err;
3540 if (!(s->base_blk = cram_new_block(EXTERNAL, DS_IN))) goto err;
3541 if (!(s->soft_blk = cram_new_block(EXTERNAL, DS_SC))) goto err;
3542
3543 s->crecs = NULL;
3544
3545 s->last_apos = s->hdr->ref_seq_start;
3546 s->decode_md = fd->decode_md;
3547
3548 return s;
3549
3550 err:
3551 if (b)
3552 cram_free_block(b);
3553 if (s) {
3554 s->hdr_block = NULL;
3555 cram_free_slice(s);
3556 }
3557 return NULL;
3558 }
3559
3560
3561 /* ----------------------------------------------------------------------
3562 * CRAM file definition (header)
3563 */
3564
3565 /*
3566 * Reads a CRAM file definition structure.
3567 * Returns file_def ptr on success
3568 * NULL on failure
3569 */
cram_read_file_def(cram_fd * fd)3570 cram_file_def *cram_read_file_def(cram_fd *fd) {
3571 cram_file_def *def = malloc(sizeof(*def));
3572 if (!def)
3573 return NULL;
3574
3575 if (26 != hread(fd->fp, &def->magic[0], 26)) {
3576 free(def);
3577 return NULL;
3578 }
3579
3580 if (memcmp(def->magic, "CRAM", 4) != 0) {
3581 free(def);
3582 return NULL;
3583 }
3584
3585 if (def->major_version > 3) {
3586 hts_log_error("CRAM version number mismatch. Expected 1.x, 2.x or 3.x, got %d.%d",
3587 def->major_version, def->minor_version);
3588 free(def);
3589 return NULL;
3590 }
3591
3592 fd->first_container += 26;
3593 fd->last_slice = 0;
3594
3595 return def;
3596 }
3597
3598 /*
3599 * Writes a cram_file_def structure to cram_fd.
3600 * Returns 0 on success
3601 * -1 on failure
3602 */
cram_write_file_def(cram_fd * fd,cram_file_def * def)3603 int cram_write_file_def(cram_fd *fd, cram_file_def *def) {
3604 return (hwrite(fd->fp, &def->magic[0], 26) == 26) ? 0 : -1;
3605 }
3606
cram_free_file_def(cram_file_def * def)3607 void cram_free_file_def(cram_file_def *def) {
3608 if (def) free(def);
3609 }
3610
3611 /* ----------------------------------------------------------------------
3612 * SAM header I/O
3613 */
3614
3615
3616 /*
3617 * Reads the SAM header from the first CRAM data block.
3618 * Also performs minimal parsing to extract read-group
3619 * and sample information.
3620
3621 * Returns SAM hdr ptr on success
3622 * NULL on failure
3623 */
cram_read_SAM_hdr(cram_fd * fd)3624 SAM_hdr *cram_read_SAM_hdr(cram_fd *fd) {
3625 int32_t header_len;
3626 char *header;
3627 SAM_hdr *hdr;
3628
3629 /* 1.1 onwards stores the header in the first block of a container */
3630 if (CRAM_MAJOR_VERS(fd->version) == 1) {
3631 /* Length */
3632 if (-1 == int32_decode(fd, &header_len))
3633 return NULL;
3634
3635 /* Alloc and read */
3636 if (header_len < 0 || NULL == (header = malloc((size_t) header_len+1)))
3637 return NULL;
3638
3639 if (header_len != hread(fd->fp, header, header_len))
3640 return NULL;
3641 header[header_len] = '\0';
3642
3643 fd->first_container += 4 + header_len;
3644 } else {
3645 cram_container *c = cram_read_container(fd);
3646 cram_block *b;
3647 int i;
3648 int64_t len;
3649
3650 if (!c)
3651 return NULL;
3652
3653 fd->first_container += c->length + c->offset;
3654
3655 if (c->num_blocks < 1) {
3656 cram_free_container(c);
3657 return NULL;
3658 }
3659
3660 if (!(b = cram_read_block(fd))) {
3661 cram_free_container(c);
3662 return NULL;
3663 }
3664 if (cram_uncompress_block(b) != 0) {
3665 cram_free_container(c);
3666 cram_free_block(b);
3667 return NULL;
3668 }
3669
3670 len = b->comp_size + 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
3671 itf8_size(b->content_id) +
3672 itf8_size(b->uncomp_size) +
3673 itf8_size(b->comp_size);
3674
3675 /* Extract header from 1st block */
3676 if (-1 == int32_get_blk(b, &header_len) ||
3677 header_len < 0 || /* Spec. says signed... why? */
3678 b->uncomp_size - 4 < header_len) {
3679 cram_free_container(c);
3680 cram_free_block(b);
3681 return NULL;
3682 }
3683 if (NULL == (header = malloc((size_t) header_len+1))) {
3684 cram_free_container(c);
3685 cram_free_block(b);
3686 return NULL;
3687 }
3688 memcpy(header, BLOCK_END(b), header_len);
3689 header[header_len] = '\0';
3690 cram_free_block(b);
3691
3692 /* Consume any remaining blocks */
3693 for (i = 1; i < c->num_blocks; i++) {
3694 if (!(b = cram_read_block(fd))) {
3695 cram_free_container(c);
3696 return NULL;
3697 }
3698 len += b->comp_size + 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
3699 itf8_size(b->content_id) +
3700 itf8_size(b->uncomp_size) +
3701 itf8_size(b->comp_size);
3702 cram_free_block(b);
3703 }
3704
3705 if (c->length > 0 && len > 0 && c->length > len) {
3706 // Consume padding
3707 char *pads = malloc(c->length - len);
3708 if (!pads) {
3709 cram_free_container(c);
3710 return NULL;
3711 }
3712
3713 if (c->length - len != hread(fd->fp, pads, c->length - len)) {
3714 cram_free_container(c);
3715 return NULL;
3716 }
3717 free(pads);
3718 }
3719
3720 cram_free_container(c);
3721 }
3722
3723 /* Parse */
3724 hdr = sam_hdr_parse_(header, header_len);
3725 free(header);
3726
3727 return hdr;
3728 }
3729
3730 /*
3731 * Converts 'in' to a full pathname to store in out.
3732 * Out must be at least PATH_MAX bytes long.
3733 */
full_path(char * out,char * in)3734 static void full_path(char *out, char *in) {
3735 size_t in_l = strlen(in);
3736 if (*in == '/' ||
3737 // Windows paths
3738 (in_l > 3 && toupper_c(*in) >= 'A' && toupper_c(*in) <= 'Z' &&
3739 in[1] == ':' && (in[2] == '/' || in[2] == '\\'))) {
3740 strncpy(out, in, PATH_MAX);
3741 out[PATH_MAX-1] = 0;
3742 } else {
3743 int len;
3744
3745 // unable to get dir or out+in is too long
3746 if (!getcwd(out, PATH_MAX) ||
3747 (len = strlen(out))+1+strlen(in) >= PATH_MAX) {
3748 strncpy(out, in, PATH_MAX);
3749 out[PATH_MAX-1] = 0;
3750 return;
3751 }
3752
3753 sprintf(out+len, "/%.*s", PATH_MAX - len, in);
3754
3755 // FIXME: cope with `pwd`/../../../foo.fa ?
3756 }
3757 }
3758
3759 /*
3760 * Writes a CRAM SAM header.
3761 * Returns 0 on success
3762 * -1 on failure
3763 */
cram_write_SAM_hdr(cram_fd * fd,SAM_hdr * hdr)3764 int cram_write_SAM_hdr(cram_fd *fd, SAM_hdr *hdr) {
3765 int header_len;
3766 int blank_block = (CRAM_MAJOR_VERS(fd->version) >= 3);
3767
3768 /* Write CRAM MAGIC if not yet written. */
3769 if (fd->file_def->major_version == 0) {
3770 fd->file_def->major_version = CRAM_MAJOR_VERS(fd->version);
3771 fd->file_def->minor_version = CRAM_MINOR_VERS(fd->version);
3772 if (0 != cram_write_file_def(fd, fd->file_def))
3773 return -1;
3774 }
3775
3776 /* 1.0 requires an UNKNOWN read-group */
3777 if (CRAM_MAJOR_VERS(fd->version) == 1) {
3778 if (!sam_hdr_find_rg(hdr, "UNKNOWN"))
3779 if (sam_hdr_add(hdr, "RG",
3780 "ID", "UNKNOWN", "SM", "UNKNOWN", NULL))
3781 return -1;
3782 }
3783
3784 /* Fix M5 strings */
3785 if (fd->refs && !fd->no_ref) {
3786 int i;
3787 for (i = 0; i < hdr->nref; i++) {
3788 SAM_hdr_type *ty;
3789 char *ref;
3790
3791 if (!(ty = sam_hdr_find(hdr, "SQ", "SN", hdr->ref[i].name)))
3792 return -1;
3793
3794 if (!sam_hdr_find_key(hdr, ty, "M5", NULL)) {
3795 char unsigned buf[16];
3796 char buf2[33];
3797 int rlen;
3798 hts_md5_context *md5;
3799
3800 if (!fd->refs ||
3801 !fd->refs->ref_id ||
3802 !fd->refs->ref_id[i]) {
3803 return -1;
3804 }
3805 rlen = fd->refs->ref_id[i]->length;
3806 if (!(md5 = hts_md5_init()))
3807 return -1;
3808 ref = cram_get_ref(fd, i, 1, rlen);
3809 if (NULL == ref) return -1;
3810 rlen = fd->refs->ref_id[i]->length; /* In case it just loaded */
3811 hts_md5_update(md5, ref, rlen);
3812 hts_md5_final(buf, md5);
3813 hts_md5_destroy(md5);
3814 cram_ref_decr(fd->refs, i);
3815
3816 hts_md5_hex(buf2, buf);
3817 if (sam_hdr_update(hdr, ty, "M5", buf2, NULL))
3818 return -1;
3819 }
3820
3821 if (fd->ref_fn) {
3822 char ref_fn[PATH_MAX];
3823 full_path(ref_fn, fd->ref_fn);
3824 if (sam_hdr_update(hdr, ty, "UR", ref_fn, NULL))
3825 return -1;
3826 }
3827 }
3828 }
3829
3830 if (sam_hdr_rebuild(hdr))
3831 return -1;
3832
3833 /* Length */
3834 header_len = sam_hdr_length(hdr);
3835 if (CRAM_MAJOR_VERS(fd->version) == 1) {
3836 if (-1 == int32_encode(fd, header_len))
3837 return -1;
3838
3839 /* Text data */
3840 if (header_len != hwrite(fd->fp, sam_hdr_str(hdr), header_len))
3841 return -1;
3842 } else {
3843 /* Create block(s) inside a container */
3844 cram_block *b = cram_new_block(FILE_HEADER, 0);
3845 cram_container *c = cram_new_container(0, 0);
3846 int padded_length;
3847 char *pads;
3848 int is_cram_3 = (CRAM_MAJOR_VERS(fd->version) >= 3);
3849
3850 if (!b || !c) {
3851 if (b) cram_free_block(b);
3852 if (c) cram_free_container(c);
3853 return -1;
3854 }
3855
3856 int32_put_blk(b, header_len);
3857 if (header_len)
3858 BLOCK_APPEND(b, sam_hdr_str(hdr), header_len);
3859 BLOCK_UPLEN(b);
3860
3861 // Compress header block if V3.0 and above
3862 if (CRAM_MAJOR_VERS(fd->version) >= 3)
3863 cram_compress_block(fd, b, NULL, -1, -1);
3864
3865 if (blank_block) {
3866 c->length = b->comp_size + 2 + 4*is_cram_3 +
3867 itf8_size(b->content_id) +
3868 itf8_size(b->uncomp_size) +
3869 itf8_size(b->comp_size);
3870
3871 c->num_blocks = 2;
3872 c->num_landmarks = 2;
3873 if (!(c->landmark = malloc(2*sizeof(*c->landmark)))) {
3874 cram_free_block(b);
3875 cram_free_container(c);
3876 return -1;
3877 }
3878 c->landmark[0] = 0;
3879 c->landmark[1] = c->length;
3880
3881 // Plus extra storage for uncompressed secondary blank block
3882 padded_length = MIN(c->length*.5, 10000);
3883 c->length += padded_length + 2 + 4*is_cram_3 +
3884 itf8_size(b->content_id) +
3885 itf8_size(padded_length)*2;
3886 } else {
3887 // Pad the block instead.
3888 c->num_blocks = 1;
3889 c->num_landmarks = 1;
3890 if (!(c->landmark = malloc(sizeof(*c->landmark))))
3891 return -1;
3892 c->landmark[0] = 0;
3893
3894 padded_length = MAX(c->length*1.5, 10000) - c->length;
3895
3896 c->length = b->comp_size + padded_length +
3897 2 + 4*is_cram_3 +
3898 itf8_size(b->content_id) +
3899 itf8_size(b->uncomp_size) +
3900 itf8_size(b->comp_size);
3901
3902 if (NULL == (pads = calloc(1, padded_length))) {
3903 cram_free_block(b);
3904 cram_free_container(c);
3905 return -1;
3906 }
3907 BLOCK_APPEND(b, pads, padded_length);
3908 BLOCK_UPLEN(b);
3909 free(pads);
3910 }
3911
3912 if (-1 == cram_write_container(fd, c)) {
3913 cram_free_block(b);
3914 cram_free_container(c);
3915 return -1;
3916 }
3917
3918 if (-1 == cram_write_block(fd, b)) {
3919 cram_free_block(b);
3920 cram_free_container(c);
3921 return -1;
3922 }
3923
3924 if (blank_block) {
3925 BLOCK_RESIZE(b, padded_length);
3926 memset(BLOCK_DATA(b), 0, padded_length);
3927 BLOCK_SIZE(b) = padded_length;
3928 BLOCK_UPLEN(b);
3929 b->method = RAW;
3930 if (-1 == cram_write_block(fd, b)) {
3931 cram_free_block(b);
3932 cram_free_container(c);
3933 return -1;
3934 }
3935 }
3936
3937 cram_free_block(b);
3938 cram_free_container(c);
3939 }
3940
3941 if (-1 == refs_from_header(fd->refs, fd, fd->header))
3942 return -1;
3943 if (-1 == refs2id(fd->refs, fd->header))
3944 return -1;
3945
3946 if (0 != hflush(fd->fp))
3947 return -1;
3948
3949 RP("=== Finishing saving header ===\n");
3950
3951 return 0;
3952 }
3953
3954 /* ----------------------------------------------------------------------
3955 * The top-level cram opening, closing and option handling
3956 */
3957
3958 /*
3959 * Initialises the lookup tables. These could be global statics, but they're
3960 * clumsy to setup in a multi-threaded environment unless we generate
3961 * verbatim code and include that.
3962 */
cram_init_tables(cram_fd * fd)3963 static void cram_init_tables(cram_fd *fd) {
3964 int i;
3965
3966 memset(fd->L1, 4, 256);
3967 fd->L1['A'] = 0; fd->L1['a'] = 0;
3968 fd->L1['C'] = 1; fd->L1['c'] = 1;
3969 fd->L1['G'] = 2; fd->L1['g'] = 2;
3970 fd->L1['T'] = 3; fd->L1['t'] = 3;
3971
3972 memset(fd->L2, 5, 256);
3973 fd->L2['A'] = 0; fd->L2['a'] = 0;
3974 fd->L2['C'] = 1; fd->L2['c'] = 1;
3975 fd->L2['G'] = 2; fd->L2['g'] = 2;
3976 fd->L2['T'] = 3; fd->L2['t'] = 3;
3977 fd->L2['N'] = 4; fd->L2['n'] = 4;
3978
3979 if (CRAM_MAJOR_VERS(fd->version) == 1) {
3980 for (i = 0; i < 0x200; i++) {
3981 int f = 0;
3982
3983 if (i & CRAM_FPAIRED) f |= BAM_FPAIRED;
3984 if (i & CRAM_FPROPER_PAIR) f |= BAM_FPROPER_PAIR;
3985 if (i & CRAM_FUNMAP) f |= BAM_FUNMAP;
3986 if (i & CRAM_FREVERSE) f |= BAM_FREVERSE;
3987 if (i & CRAM_FREAD1) f |= BAM_FREAD1;
3988 if (i & CRAM_FREAD2) f |= BAM_FREAD2;
3989 if (i & CRAM_FSECONDARY) f |= BAM_FSECONDARY;
3990 if (i & CRAM_FQCFAIL) f |= BAM_FQCFAIL;
3991 if (i & CRAM_FDUP) f |= BAM_FDUP;
3992
3993 fd->bam_flag_swap[i] = f;
3994 }
3995
3996 for (i = 0; i < 0x1000; i++) {
3997 int g = 0;
3998
3999 if (i & BAM_FPAIRED) g |= CRAM_FPAIRED;
4000 if (i & BAM_FPROPER_PAIR) g |= CRAM_FPROPER_PAIR;
4001 if (i & BAM_FUNMAP) g |= CRAM_FUNMAP;
4002 if (i & BAM_FREVERSE) g |= CRAM_FREVERSE;
4003 if (i & BAM_FREAD1) g |= CRAM_FREAD1;
4004 if (i & BAM_FREAD2) g |= CRAM_FREAD2;
4005 if (i & BAM_FSECONDARY) g |= CRAM_FSECONDARY;
4006 if (i & BAM_FQCFAIL) g |= CRAM_FQCFAIL;
4007 if (i & BAM_FDUP) g |= CRAM_FDUP;
4008
4009 fd->cram_flag_swap[i] = g;
4010 }
4011 } else {
4012 /* NOP */
4013 for (i = 0; i < 0x1000; i++)
4014 fd->bam_flag_swap[i] = i;
4015 for (i = 0; i < 0x1000; i++)
4016 fd->cram_flag_swap[i] = i;
4017 }
4018
4019 memset(fd->cram_sub_matrix, 4, 32*32);
4020 for (i = 0; i < 32; i++) {
4021 fd->cram_sub_matrix[i]['A'&0x1f]=0;
4022 fd->cram_sub_matrix[i]['C'&0x1f]=1;
4023 fd->cram_sub_matrix[i]['G'&0x1f]=2;
4024 fd->cram_sub_matrix[i]['T'&0x1f]=3;
4025 fd->cram_sub_matrix[i]['N'&0x1f]=4;
4026 }
4027 for (i = 0; i < 20; i+=4) {
4028 int j;
4029 for (j = 0; j < 20; j++) {
4030 fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
4031 fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
4032 fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
4033 fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
4034 }
4035 fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+0]&0x1f]=0;
4036 fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+1]&0x1f]=1;
4037 fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+2]&0x1f]=2;
4038 fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+3]&0x1f]=3;
4039 }
4040 }
4041
4042 // Default version numbers for CRAM
4043 static int major_version = 3;
4044 static int minor_version = 0;
4045
4046 /*
4047 * Opens a CRAM file for read (mode "rb") or write ("wb").
4048 * The filename may be "-" to indicate stdin or stdout.
4049 *
4050 * Returns file handle on success
4051 * NULL on failure.
4052 */
cram_open(const char * filename,const char * mode)4053 cram_fd *cram_open(const char *filename, const char *mode) {
4054 hFILE *fp;
4055 cram_fd *fd;
4056 char fmode[3]= { mode[0], '\0', '\0' };
4057
4058 if (strlen(mode) > 1 && (mode[1] == 'b' || mode[1] == 'c')) {
4059 fmode[1] = 'b';
4060 }
4061
4062 fp = hopen(filename, fmode);
4063 if (!fp)
4064 return NULL;
4065
4066 fd = cram_dopen(fp, filename, mode);
4067 if (!fd)
4068 hclose_abruptly(fp);
4069
4070 return fd;
4071 }
4072
4073 /* Opens an existing stream for reading or writing.
4074 *
4075 * Returns file handle on success;
4076 * NULL on failure.
4077 */
cram_dopen(hFILE * fp,const char * filename,const char * mode)4078 cram_fd *cram_dopen(hFILE *fp, const char *filename, const char *mode) {
4079 int i;
4080 char *cp;
4081 cram_fd *fd = calloc(1, sizeof(*fd));
4082 if (!fd)
4083 return NULL;
4084
4085 fd->level = 5;
4086 for (i = 0; mode[i]; i++) {
4087 if (mode[i] >= '0' && mode[i] <= '9') {
4088 fd->level = mode[i] - '0';
4089 break;
4090 }
4091 }
4092
4093 fd->fp = fp;
4094 fd->mode = *mode;
4095 fd->first_container = 0;
4096
4097 if (fd->mode == 'r') {
4098 /* Reader */
4099
4100 if (!(fd->file_def = cram_read_file_def(fd)))
4101 goto err;
4102
4103 fd->version = fd->file_def->major_version * 256 +
4104 fd->file_def->minor_version;
4105
4106 if (!(fd->header = cram_read_SAM_hdr(fd))) {
4107 cram_free_file_def(fd->file_def);
4108 goto err;
4109 }
4110
4111 } else {
4112 /* Writer */
4113 cram_file_def *def = calloc(1, sizeof(*def));
4114 if (!def)
4115 return NULL;
4116
4117 fd->file_def = def;
4118
4119 def->magic[0] = 'C';
4120 def->magic[1] = 'R';
4121 def->magic[2] = 'A';
4122 def->magic[3] = 'M';
4123 def->major_version = 0; // Indicator to write file def later.
4124 def->minor_version = 0;
4125 memset(def->file_id, 0, 20);
4126 strncpy(def->file_id, filename, 20);
4127
4128 fd->version = major_version * 256 + minor_version;
4129
4130 /* SAM header written later along with this file_def */
4131 }
4132
4133 cram_init_tables(fd);
4134
4135 fd->prefix = strdup((cp = strrchr(filename, '/')) ? cp+1 : filename);
4136 if (!fd->prefix)
4137 goto err;
4138 fd->first_base = fd->last_base = -1;
4139 fd->record_counter = 0;
4140
4141 fd->ctr = NULL;
4142 fd->ctr_mt = NULL;
4143 fd->refs = refs_create();
4144 if (!fd->refs)
4145 goto err;
4146 fd->ref_id = -2;
4147 fd->ref = NULL;
4148
4149 fd->decode_md = 0;
4150 fd->seqs_per_slice = SEQS_PER_SLICE;
4151 fd->bases_per_slice = BASES_PER_SLICE;
4152 fd->slices_per_container = SLICE_PER_CNT;
4153 fd->embed_ref = 0;
4154 fd->no_ref = 0;
4155 fd->ignore_md5 = 0;
4156 fd->lossy_read_names = 0;
4157 fd->use_bz2 = 0;
4158 fd->use_rans = (CRAM_MAJOR_VERS(fd->version) >= 3);
4159 fd->use_lzma = 0;
4160 fd->multi_seq = -1;
4161 fd->unsorted = 0;
4162 fd->shared_ref = 0;
4163 fd->store_md = 0;
4164 fd->store_nm = 0;
4165
4166 fd->index = NULL;
4167 fd->own_pool = 0;
4168 fd->pool = NULL;
4169 fd->rqueue = NULL;
4170 fd->job_pending = NULL;
4171 fd->ooc = 0;
4172 fd->required_fields = INT_MAX;
4173
4174 for (i = 0; i < DS_END; i++)
4175 fd->m[i] = cram_new_metrics();
4176
4177 if (!(fd->tags_used = kh_init(m_metrics)))
4178 goto err;
4179
4180 fd->range.refid = -2; // no ref.
4181 fd->eof = 1; // See samtools issue #150
4182 fd->ref_fn = NULL;
4183
4184 fd->bl = NULL;
4185
4186 /* Initialise dummy refs from the @SQ headers */
4187 if (-1 == refs_from_header(fd->refs, fd, fd->header))
4188 goto err;
4189
4190 return fd;
4191
4192 err:
4193 if (fd)
4194 free(fd);
4195
4196 return NULL;
4197 }
4198
4199 /*
4200 * Seek within a CRAM file.
4201 *
4202 * Returns 0 on success
4203 * -1 on failure
4204 */
cram_seek(cram_fd * fd,off_t offset,int whence)4205 int cram_seek(cram_fd *fd, off_t offset, int whence) {
4206 char buf[65536];
4207
4208 fd->ooc = 0;
4209
4210 cram_drain_rqueue(fd);
4211
4212 if (hseek(fd->fp, offset, whence) >= 0) {
4213 return 0;
4214 }
4215
4216 if (!(whence == SEEK_CUR && offset >= 0))
4217 return -1;
4218
4219 /* Couldn't fseek, but we're in SEEK_CUR mode so read instead */
4220 while (offset > 0) {
4221 int len = MIN(65536, offset);
4222 if (len != hread(fd->fp, buf, len))
4223 return -1;
4224 offset -= len;
4225 }
4226
4227 return 0;
4228 }
4229
4230 /*
4231 * Flushes a CRAM file.
4232 * Useful for when writing to stdout without wishing to close the stream.
4233 *
4234 * Returns 0 on success
4235 * -1 on failure
4236 */
cram_flush(cram_fd * fd)4237 int cram_flush(cram_fd *fd) {
4238 if (!fd)
4239 return -1;
4240
4241 if (fd->mode == 'w' && fd->ctr) {
4242 if(fd->ctr->slice)
4243 cram_update_curr_slice(fd->ctr);
4244
4245 if (-1 == cram_flush_container_mt(fd, fd->ctr))
4246 return -1;
4247 }
4248
4249 return 0;
4250 }
4251
4252 /*
4253 * Closes a CRAM file.
4254 * Returns 0 on success
4255 * -1 on failure
4256 */
cram_close(cram_fd * fd)4257 int cram_close(cram_fd *fd) {
4258 spare_bams *bl, *next;
4259 int i;
4260
4261 if (!fd)
4262 return -1;
4263
4264 if (fd->mode == 'w' && fd->ctr) {
4265 if(fd->ctr->slice)
4266 cram_update_curr_slice(fd->ctr);
4267
4268 if (-1 == cram_flush_container_mt(fd, fd->ctr))
4269 return -1;
4270 }
4271
4272 if (fd->mode != 'w')
4273 cram_drain_rqueue(fd);
4274
4275 if (fd->pool && fd->eof >= 0) {
4276 hts_tpool_process_flush(fd->rqueue);
4277
4278 if (0 != cram_flush_result(fd))
4279 return -1;
4280
4281 if (fd->mode == 'w')
4282 fd->ctr = NULL; // prevent double freeing
4283
4284 pthread_mutex_destroy(&fd->metrics_lock);
4285 pthread_mutex_destroy(&fd->ref_lock);
4286 pthread_mutex_destroy(&fd->bam_list_lock);
4287
4288 //fprintf(stderr, "CRAM: destroy queue %p\n", fd->rqueue);
4289
4290 hts_tpool_process_destroy(fd->rqueue);
4291 }
4292
4293 if (fd->mode == 'w') {
4294 /* Write EOF block */
4295 if (CRAM_MAJOR_VERS(fd->version) == 3) {
4296 if (38 != hwrite(fd->fp,
4297 "\x0f\x00\x00\x00\xff\xff\xff\xff" // Cont HDR
4298 "\x0f\xe0\x45\x4f\x46\x00\x00\x00" // Cont HDR
4299 "\x00\x01\x00" // Cont HDR
4300 "\x05\xbd\xd9\x4f" // CRC32
4301 "\x00\x01\x00\x06\x06" // Comp.HDR blk
4302 "\x01\x00\x01\x00\x01\x00" // Comp.HDR blk
4303 "\xee\x63\x01\x4b", // CRC32
4304 38))
4305 return -1;
4306 } else {
4307 if (30 != hwrite(fd->fp,
4308 "\x0b\x00\x00\x00\xff\xff\xff\xff"
4309 "\x0f\xe0\x45\x4f\x46\x00\x00\x00"
4310 "\x00\x01\x00\x00\x01\x00\x06\x06"
4311 "\x01\x00\x01\x00\x01\x00", 30))
4312 return -1;
4313 }
4314 }
4315
4316 for (bl = fd->bl; bl; bl = next) {
4317 int i, max_rec = fd->seqs_per_slice * fd->slices_per_container;
4318
4319 next = bl->next;
4320 for (i = 0; i < max_rec; i++) {
4321 if (bl->bams[i])
4322 bam_free(bl->bams[i]);
4323 }
4324 free(bl->bams);
4325 free(bl);
4326 }
4327
4328 if (hclose(fd->fp) != 0)
4329 return -1;
4330
4331 if (fd->file_def)
4332 cram_free_file_def(fd->file_def);
4333
4334 if (fd->header)
4335 sam_hdr_free(fd->header);
4336
4337 free(fd->prefix);
4338
4339 if (fd->ctr)
4340 cram_free_container(fd->ctr);
4341
4342 if (fd->ctr_mt && fd->ctr_mt != fd->ctr)
4343 cram_free_container(fd->ctr_mt);
4344
4345 if (fd->refs)
4346 refs_free(fd->refs);
4347 if (fd->ref_free)
4348 free(fd->ref_free);
4349
4350 for (i = 0; i < DS_END; i++)
4351 if (fd->m[i])
4352 free(fd->m[i]);
4353
4354 if (fd->tags_used) {
4355 khint_t k;
4356
4357 for (k = kh_begin(fd->tags_used); k != kh_end(fd->tags_used); k++) {
4358 if (kh_exist(fd->tags_used, k))
4359 free(kh_val(fd->tags_used, k));
4360 }
4361
4362 kh_destroy(m_metrics, fd->tags_used);
4363 }
4364
4365 if (fd->index)
4366 cram_index_free(fd);
4367
4368 if (fd->own_pool && fd->pool)
4369 hts_tpool_destroy(fd->pool);
4370
4371 free(fd);
4372 return 0;
4373 }
4374
4375 /*
4376 * Returns 1 if we hit an EOF while reading.
4377 */
cram_eof(cram_fd * fd)4378 int cram_eof(cram_fd *fd) {
4379 return fd->eof;
4380 }
4381
4382
4383 /*
4384 * Sets options on the cram_fd. See CRAM_OPT_* definitions in cram_structs.h.
4385 * Use this immediately after opening.
4386 *
4387 * Returns 0 on success
4388 * -1 on failure
4389 */
cram_set_option(cram_fd * fd,enum hts_fmt_option opt,...)4390 int cram_set_option(cram_fd *fd, enum hts_fmt_option opt, ...) {
4391 int r;
4392 va_list args;
4393
4394 va_start(args, opt);
4395 r = cram_set_voption(fd, opt, args);
4396 va_end(args);
4397
4398 return r;
4399 }
4400
4401 /*
4402 * Sets options on the cram_fd. See CRAM_OPT_* definitions in cram_structs.h.
4403 * Use this immediately after opening.
4404 *
4405 * Returns 0 on success
4406 * -1 on failure
4407 */
cram_set_voption(cram_fd * fd,enum hts_fmt_option opt,va_list args)4408 int cram_set_voption(cram_fd *fd, enum hts_fmt_option opt, va_list args) {
4409 refs_t *refs;
4410
4411 if (!fd) {
4412 errno = EBADF;
4413 return -1;
4414 }
4415
4416 switch (opt) {
4417 case CRAM_OPT_DECODE_MD:
4418 fd->decode_md = va_arg(args, int);
4419 break;
4420
4421 case CRAM_OPT_PREFIX:
4422 if (fd->prefix)
4423 free(fd->prefix);
4424 if (!(fd->prefix = strdup(va_arg(args, char *))))
4425 return -1;
4426 break;
4427
4428 case CRAM_OPT_VERBOSITY:
4429 break;
4430
4431 case CRAM_OPT_SEQS_PER_SLICE:
4432 fd->seqs_per_slice = va_arg(args, int);
4433 break;
4434
4435 case CRAM_OPT_BASES_PER_SLICE:
4436 fd->bases_per_slice = va_arg(args, int);
4437 break;
4438
4439 case CRAM_OPT_SLICES_PER_CONTAINER:
4440 fd->slices_per_container = va_arg(args, int);
4441 break;
4442
4443 case CRAM_OPT_EMBED_REF:
4444 fd->embed_ref = va_arg(args, int);
4445 break;
4446
4447 case CRAM_OPT_NO_REF:
4448 fd->no_ref = va_arg(args, int);
4449 break;
4450
4451 case CRAM_OPT_IGNORE_MD5:
4452 fd->ignore_md5 = va_arg(args, int);
4453 break;
4454
4455 case CRAM_OPT_LOSSY_NAMES:
4456 fd->lossy_read_names = va_arg(args, int);
4457 // Currently lossy read names required paired (attached) reads.
4458 // TLEN 0 or being 1 out causes read pairs to be detached, breaking
4459 // the lossy read name compression, so we have extra options to
4460 // slacken the exact TLEN round-trip checks.
4461 fd->tlen_approx = fd->lossy_read_names;
4462 fd->tlen_zero = fd->lossy_read_names;
4463 break;
4464
4465 case CRAM_OPT_USE_BZIP2:
4466 fd->use_bz2 = va_arg(args, int);
4467 break;
4468
4469 case CRAM_OPT_USE_RANS:
4470 fd->use_rans = va_arg(args, int);
4471 break;
4472
4473 case CRAM_OPT_USE_LZMA:
4474 fd->use_lzma = va_arg(args, int);
4475 break;
4476
4477 case CRAM_OPT_SHARED_REF:
4478 fd->shared_ref = 1;
4479 refs = va_arg(args, refs_t *);
4480 if (refs != fd->refs) {
4481 if (fd->refs)
4482 refs_free(fd->refs);
4483 fd->refs = refs;
4484 fd->refs->count++;
4485 }
4486 break;
4487
4488 case CRAM_OPT_RANGE: {
4489 int r = cram_seek_to_refpos(fd, va_arg(args, cram_range *));
4490 pthread_mutex_lock(&fd->range_lock);
4491 if (fd->range.refid != -2)
4492 fd->required_fields |= SAM_POS;
4493 pthread_mutex_unlock(&fd->range_lock);
4494 return r;
4495 }
4496
4497 case CRAM_OPT_REFERENCE:
4498 return cram_load_reference(fd, va_arg(args, char *));
4499
4500 case CRAM_OPT_VERSION: {
4501 int major, minor;
4502 char *s = va_arg(args, char *);
4503 if (2 != sscanf(s, "%d.%d", &major, &minor)) {
4504 hts_log_error("Malformed version string %s", s);
4505 return -1;
4506 }
4507 if (!((major == 1 && minor == 0) ||
4508 (major == 2 && (minor == 0 || minor == 1)) ||
4509 (major == 3 && minor == 0))) {
4510 hts_log_error("Unknown version string; use 1.0, 2.0, 2.1 or 3.0");
4511 errno = EINVAL;
4512 return -1;
4513 }
4514 fd->version = major*256 + minor;
4515
4516 if (CRAM_MAJOR_VERS(fd->version) >= 3)
4517 fd->use_rans = 1;
4518 break;
4519 }
4520
4521 case CRAM_OPT_MULTI_SEQ_PER_SLICE:
4522 fd->multi_seq = va_arg(args, int);
4523 break;
4524
4525 case CRAM_OPT_NTHREADS: {
4526 int nthreads = va_arg(args, int);
4527 if (nthreads >= 1) {
4528 if (!(fd->pool = hts_tpool_init(nthreads)))
4529 return -1;
4530
4531 fd->rqueue = hts_tpool_process_init(fd->pool, nthreads*2, 0);
4532 pthread_mutex_init(&fd->metrics_lock, NULL);
4533 pthread_mutex_init(&fd->ref_lock, NULL);
4534 pthread_mutex_init(&fd->range_lock, NULL);
4535 pthread_mutex_init(&fd->bam_list_lock, NULL);
4536 fd->shared_ref = 1;
4537 fd->own_pool = 1;
4538 }
4539 break;
4540 }
4541
4542 case CRAM_OPT_THREAD_POOL: {
4543 htsThreadPool *p = va_arg(args, htsThreadPool *);
4544 fd->pool = p ? p->pool : NULL;
4545 if (fd->pool) {
4546 fd->rqueue = hts_tpool_process_init(fd->pool,
4547 p->qsize ? p->qsize : hts_tpool_size(fd->pool)*2,
4548 0);
4549 pthread_mutex_init(&fd->metrics_lock, NULL);
4550 pthread_mutex_init(&fd->ref_lock, NULL);
4551 pthread_mutex_init(&fd->range_lock, NULL);
4552 pthread_mutex_init(&fd->bam_list_lock, NULL);
4553 }
4554 fd->shared_ref = 1; // Needed to avoid clobbering ref between threads
4555 fd->own_pool = 0;
4556
4557 //fd->qsize = 1;
4558 //fd->decoded = calloc(fd->qsize, sizeof(cram_container *));
4559 //hts_tpool_dispatch(fd->pool, cram_decoder_thread, fd);
4560 break;
4561 }
4562
4563 case CRAM_OPT_REQUIRED_FIELDS:
4564 fd->required_fields = va_arg(args, int);
4565 if (fd->range.refid != -2)
4566 fd->required_fields |= SAM_POS;
4567 break;
4568
4569 case CRAM_OPT_STORE_MD:
4570 fd->store_md = va_arg(args, int);
4571 break;
4572
4573 case CRAM_OPT_STORE_NM:
4574 fd->store_nm = va_arg(args, int);
4575 break;
4576
4577 case HTS_OPT_COMPRESSION_LEVEL:
4578 fd->level = va_arg(args, int);
4579 break;
4580
4581 default:
4582 hts_log_error("Unknown CRAM option code %d", opt);
4583 errno = EINVAL;
4584 return -1;
4585 }
4586
4587 return 0;
4588 }
4589
cram_check_EOF(cram_fd * fd)4590 int cram_check_EOF(cram_fd *fd)
4591 {
4592 // Byte 9 in these templates is & with 0x0f to resolve differences
4593 // between ITF-8 interpretations between early Java and C
4594 // implementations of CRAM
4595 static const unsigned char TEMPLATE_2_1[30] = {
4596 0x0b, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x0f, 0xe0,
4597 0x45, 0x4f, 0x46, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00,
4598 0x01, 0x00, 0x06, 0x06, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00
4599 };
4600 static const unsigned char TEMPLATE_3[38] = {
4601 0x0f, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x0f, 0xe0,
4602 0x45, 0x4f, 0x46, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x05,
4603 0xbd, 0xd9, 0x4f, 0x00, 0x01, 0x00, 0x06, 0x06, 0x01, 0x00,
4604 0x01, 0x00, 0x01, 0x00, 0xee, 0x63, 0x01, 0x4b
4605 };
4606
4607 unsigned char buf[38]; // max(sizeof TEMPLATE_*)
4608
4609 uint8_t major = CRAM_MAJOR_VERS(fd->version);
4610 uint8_t minor = CRAM_MINOR_VERS(fd->version);
4611
4612 const unsigned char *template;
4613 ssize_t template_len;
4614 if ((major < 2) ||
4615 (major == 2 && minor == 0)) {
4616 return 3; // No EOF support in cram versions less than 2.1
4617 } else if (major == 2 && minor == 1) {
4618 template = TEMPLATE_2_1;
4619 template_len = sizeof TEMPLATE_2_1;
4620 } else {
4621 template = TEMPLATE_3;
4622 template_len = sizeof TEMPLATE_3;
4623 }
4624
4625 off_t offset = htell(fd->fp);
4626 if (hseek(fd->fp, -template_len, SEEK_END) < 0) {
4627 if (errno == ESPIPE) {
4628 hclearerr(fd->fp);
4629 return 2;
4630 }
4631 else {
4632 return -1;
4633 }
4634 }
4635 if (hread(fd->fp, buf, template_len) != template_len) return -1;
4636 if (hseek(fd->fp, offset, SEEK_SET) < 0) return -1;
4637 buf[8] &= 0x0f;
4638 return (memcmp(template, buf, template_len) == 0)? 1 : 0;
4639 }
4640