1 /*
2 Copyright (c) 2012-2016 Genome Research Ltd.
3 Author: James Bonfield <jkb@sanger.ac.uk>
4
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
7
8 1. Redistributions of source code must retain the above copyright notice,
9 this list of conditions and the following disclaimer.
10
11 2. Redistributions in binary form must reproduce the above copyright notice,
12 this list of conditions and the following disclaimer in the documentation
13 and/or other materials provided with the distribution.
14
15 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16 Institute nor the names of its contributors may be used to endorse or promote
17 products derived from this software without specific prior written permission.
18
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31 /*
32 * CRAM I/O primitives.
33 *
34 * - ITF8 encoding and decoding.
35 * - Block based I/O
36 * - Zlib inflating and deflating (memory)
37 * - CRAM basic data structure reading and writing
38 * - File opening / closing
39 * - Reference sequence handling
40 */
41
42 /*
43 * TODO: BLOCK_GROW, BLOCK_RESIZE, BLOCK_APPEND and itf8_put_blk all need
44 * a way to return errors for when malloc fails.
45 */
46
47 #include <config.h>
48
49 #include <stdio.h>
50 #include <errno.h>
51 #include <assert.h>
52 #include <stdlib.h>
53 #include <string.h>
54 #include <unistd.h>
55 #include <zlib.h>
56 #ifdef HAVE_LIBBZ2
57 #include <bzlib.h>
58 #endif
59 #ifdef HAVE_LIBLZMA
60 #ifdef HAVE_LZMA_H
61 #include <lzma.h>
62 #else
63 #include "os/lzma_stub.h"
64 #endif
65 #endif
66 #include <sys/types.h>
67 #include <sys/stat.h>
68 #include <math.h>
69 #include <time.h>
70 #include <stdint.h>
71
72 #include "cram/cram.h"
73 #include "cram/os.h"
74 #include "htslib/hts.h"
75 #include "cram/open_trace_file.h"
76 #include "cram/rANS_static.h"
77
78 //#define REF_DEBUG
79
80 #ifdef REF_DEBUG
81 #include <sys/syscall.h>
82 #define gettid() (int)syscall(SYS_gettid)
83
84 #define RP(...) fprintf (stderr, __VA_ARGS__)
85 #else
86 #define RP(...)
87 #endif
88
89 #include "htslib/hfile.h"
90 #include "htslib/bgzf.h"
91 #include "htslib/faidx.h"
92 #include "hts_internal.h"
93
94 #ifndef PATH_MAX
95 #define PATH_MAX FILENAME_MAX
96 #endif
97
98 #define TRIAL_SPAN 50
99 #define NTRIALS 3
100
101
102 /* ----------------------------------------------------------------------
103 * ITF8 encoding and decoding.
104 *
105 * Also see the itf8_get and itf8_put macros in cram_io.h
106 */
107
108 /*
109 * LEGACY: consider using itf8_decode_crc.
110 *
111 * Reads an integer in ITF-8 encoding from 'cp' and stores it in
112 * *val.
113 *
114 * Returns the number of bytes read on success
115 * -1 on failure
116 */
itf8_decode(cram_fd * fd,int32_t * val_p)117 int itf8_decode(cram_fd *fd, int32_t *val_p) {
118 static int nbytes[16] = {
119 0,0,0,0, 0,0,0,0, // 0000xxxx - 0111xxxx
120 1,1,1,1, // 1000xxxx - 1011xxxx
121 2,2, // 1100xxxx - 1101xxxx
122 3, // 1110xxxx
123 4, // 1111xxxx
124 };
125
126 static int nbits[16] = {
127 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, // 0000xxxx - 0111xxxx
128 0x3f, 0x3f, 0x3f, 0x3f, // 1000xxxx - 1011xxxx
129 0x1f, 0x1f, // 1100xxxx - 1101xxxx
130 0x0f, // 1110xxxx
131 0x0f, // 1111xxxx
132 };
133
134 int32_t val = hgetc(fd->fp);
135 if (val == -1)
136 return -1;
137
138 int i = nbytes[val>>4];
139 val &= nbits[val>>4];
140
141 switch(i) {
142 case 0:
143 *val_p = val;
144 return 1;
145
146 case 1:
147 val = (val<<8) | (unsigned char)hgetc(fd->fp);
148 *val_p = val;
149 return 2;
150
151 case 2:
152 val = (val<<8) | (unsigned char)hgetc(fd->fp);
153 val = (val<<8) | (unsigned char)hgetc(fd->fp);
154 *val_p = val;
155 return 3;
156
157 case 3:
158 val = (val<<8) | (unsigned char)hgetc(fd->fp);
159 val = (val<<8) | (unsigned char)hgetc(fd->fp);
160 val = (val<<8) | (unsigned char)hgetc(fd->fp);
161 *val_p = val;
162 return 4;
163
164 case 4: // really 3.5 more, why make it different?
165 val = (val<<8) | (unsigned char)hgetc(fd->fp);
166 val = (val<<8) | (unsigned char)hgetc(fd->fp);
167 val = (val<<8) | (unsigned char)hgetc(fd->fp);
168 val = (val<<4) | (((unsigned char)hgetc(fd->fp)) & 0x0f);
169 *val_p = val;
170 }
171
172 return 5;
173 }
174
itf8_decode_crc(cram_fd * fd,int32_t * val_p,uint32_t * crc)175 int itf8_decode_crc(cram_fd *fd, int32_t *val_p, uint32_t *crc) {
176 static int nbytes[16] = {
177 0,0,0,0, 0,0,0,0, // 0000xxxx - 0111xxxx
178 1,1,1,1, // 1000xxxx - 1011xxxx
179 2,2, // 1100xxxx - 1101xxxx
180 3, // 1110xxxx
181 4, // 1111xxxx
182 };
183
184 static int nbits[16] = {
185 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, // 0000xxxx - 0111xxxx
186 0x3f, 0x3f, 0x3f, 0x3f, // 1000xxxx - 1011xxxx
187 0x1f, 0x1f, // 1100xxxx - 1101xxxx
188 0x0f, // 1110xxxx
189 0x0f, // 1111xxxx
190 };
191 unsigned char c[5];
192
193 int32_t val = hgetc(fd->fp);
194 if (val == -1)
195 return -1;
196
197 c[0]=val;
198
199 int i = nbytes[val>>4];
200 val &= nbits[val>>4];
201
202 switch(i) {
203 case 0:
204 *val_p = val;
205 *crc = crc32(*crc, c, 1);
206 return 1;
207
208 case 1:
209 val = (val<<8) | (c[1]=hgetc(fd->fp));
210 *val_p = val;
211 *crc = crc32(*crc, c, 2);
212 return 2;
213
214 case 2:
215 val = (val<<8) | (c[1]=hgetc(fd->fp));
216 val = (val<<8) | (c[2]=hgetc(fd->fp));
217 *val_p = val;
218 *crc = crc32(*crc, c, 3);
219 return 3;
220
221 case 3:
222 val = (val<<8) | (c[1]=hgetc(fd->fp));
223 val = (val<<8) | (c[2]=hgetc(fd->fp));
224 val = (val<<8) | (c[3]=hgetc(fd->fp));
225 *val_p = val;
226 *crc = crc32(*crc, c, 4);
227 return 4;
228
229 case 4: // really 3.5 more, why make it different?
230 {
231 uint32_t uv = val;
232 uv = (uv<<8) | (c[1]=hgetc(fd->fp));
233 uv = (uv<<8) | (c[2]=hgetc(fd->fp));
234 uv = (uv<<8) | (c[3]=hgetc(fd->fp));
235 uv = (uv<<4) | (((c[4]=hgetc(fd->fp))) & 0x0f);
236 *val_p = uv < 0x80000000UL ? uv : -((int32_t) (0xffffffffUL - uv)) - 1;
237 *crc = crc32(*crc, c, 5);
238 }
239 }
240
241 return 5;
242 }
243
244 /*
245 * Encodes and writes a single integer in ITF-8 format.
246 * Returns 0 on success
247 * -1 on failure
248 */
itf8_encode(cram_fd * fd,int32_t val)249 int itf8_encode(cram_fd *fd, int32_t val) {
250 char buf[5];
251 int len = itf8_put(buf, val);
252 return hwrite(fd->fp, buf, len) == len ? 0 : -1;
253 }
254
255 const int itf8_bytes[16] = {
256 1, 1, 1, 1, 1, 1, 1, 1,
257 2, 2, 2, 2, 3, 3, 4, 5
258 };
259
260 const int ltf8_bytes[256] = {
261 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
262 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
263 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
264 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
265
266 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
267 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
268 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
269 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
270
271 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
272 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
273 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
274 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
275
276 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
277 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
278
279 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
280
281 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 8, 9
282 };
283
284 /*
285 * LEGACY: consider using ltf8_decode_crc.
286 */
ltf8_decode(cram_fd * fd,int64_t * val_p)287 int ltf8_decode(cram_fd *fd, int64_t *val_p) {
288 int c = hgetc(fd->fp);
289 int64_t val = (unsigned char)c;
290 if (c == -1)
291 return -1;
292
293 if (val < 0x80) {
294 *val_p = val;
295 return 1;
296
297 } else if (val < 0xc0) {
298 val = (val<<8) | (unsigned char)hgetc(fd->fp);
299 *val_p = val & (((1LL<<(6+8)))-1);
300 return 2;
301
302 } else if (val < 0xe0) {
303 val = (val<<8) | (unsigned char)hgetc(fd->fp);
304 val = (val<<8) | (unsigned char)hgetc(fd->fp);
305 *val_p = val & ((1LL<<(5+2*8))-1);
306 return 3;
307
308 } else if (val < 0xf0) {
309 val = (val<<8) | (unsigned char)hgetc(fd->fp);
310 val = (val<<8) | (unsigned char)hgetc(fd->fp);
311 val = (val<<8) | (unsigned char)hgetc(fd->fp);
312 *val_p = val & ((1LL<<(4+3*8))-1);
313 return 4;
314
315 } else if (val < 0xf8) {
316 val = (val<<8) | (unsigned char)hgetc(fd->fp);
317 val = (val<<8) | (unsigned char)hgetc(fd->fp);
318 val = (val<<8) | (unsigned char)hgetc(fd->fp);
319 val = (val<<8) | (unsigned char)hgetc(fd->fp);
320 *val_p = val & ((1LL<<(3+4*8))-1);
321 return 5;
322
323 } else if (val < 0xfc) {
324 val = (val<<8) | (unsigned char)hgetc(fd->fp);
325 val = (val<<8) | (unsigned char)hgetc(fd->fp);
326 val = (val<<8) | (unsigned char)hgetc(fd->fp);
327 val = (val<<8) | (unsigned char)hgetc(fd->fp);
328 val = (val<<8) | (unsigned char)hgetc(fd->fp);
329 *val_p = val & ((1LL<<(2+5*8))-1);
330 return 6;
331
332 } else if (val < 0xfe) {
333 val = (val<<8) | (unsigned char)hgetc(fd->fp);
334 val = (val<<8) | (unsigned char)hgetc(fd->fp);
335 val = (val<<8) | (unsigned char)hgetc(fd->fp);
336 val = (val<<8) | (unsigned char)hgetc(fd->fp);
337 val = (val<<8) | (unsigned char)hgetc(fd->fp);
338 val = (val<<8) | (unsigned char)hgetc(fd->fp);
339 *val_p = val & ((1LL<<(1+6*8))-1);
340 return 7;
341
342 } else if (val < 0xff) {
343 val = (val<<8) | (unsigned char)hgetc(fd->fp);
344 val = (val<<8) | (unsigned char)hgetc(fd->fp);
345 val = (val<<8) | (unsigned char)hgetc(fd->fp);
346 val = (val<<8) | (unsigned char)hgetc(fd->fp);
347 val = (val<<8) | (unsigned char)hgetc(fd->fp);
348 val = (val<<8) | (unsigned char)hgetc(fd->fp);
349 val = (val<<8) | (unsigned char)hgetc(fd->fp);
350 *val_p = val & ((1LL<<(7*8))-1);
351 return 8;
352
353 } else {
354 val = (val<<8) | (unsigned char)hgetc(fd->fp);
355 val = (val<<8) | (unsigned char)hgetc(fd->fp);
356 val = (val<<8) | (unsigned char)hgetc(fd->fp);
357 val = (val<<8) | (unsigned char)hgetc(fd->fp);
358 val = (val<<8) | (unsigned char)hgetc(fd->fp);
359 val = (val<<8) | (unsigned char)hgetc(fd->fp);
360 val = (val<<8) | (unsigned char)hgetc(fd->fp);
361 val = (val<<8) | (unsigned char)hgetc(fd->fp);
362 *val_p = val;
363 }
364
365 return 9;
366 }
367
ltf8_decode_crc(cram_fd * fd,int64_t * val_p,uint32_t * crc)368 int ltf8_decode_crc(cram_fd *fd, int64_t *val_p, uint32_t *crc) {
369 unsigned char c[9];
370 int64_t val = (unsigned char)hgetc(fd->fp);
371 if (val == -1)
372 return -1;
373
374 c[0] = val;
375
376 if (val < 0x80) {
377 *val_p = val;
378 *crc = crc32(*crc, c, 1);
379 return 1;
380
381 } else if (val < 0xc0) {
382 val = (val<<8) | (c[1]=hgetc(fd->fp));;
383 *val_p = val & (((1LL<<(6+8)))-1);
384 *crc = crc32(*crc, c, 2);
385 return 2;
386
387 } else if (val < 0xe0) {
388 val = (val<<8) | (c[1]=hgetc(fd->fp));;
389 val = (val<<8) | (c[2]=hgetc(fd->fp));;
390 *val_p = val & ((1LL<<(5+2*8))-1);
391 *crc = crc32(*crc, c, 3);
392 return 3;
393
394 } else if (val < 0xf0) {
395 val = (val<<8) | (c[1]=hgetc(fd->fp));;
396 val = (val<<8) | (c[2]=hgetc(fd->fp));;
397 val = (val<<8) | (c[3]=hgetc(fd->fp));;
398 *val_p = val & ((1LL<<(4+3*8))-1);
399 *crc = crc32(*crc, c, 4);
400 return 4;
401
402 } else if (val < 0xf8) {
403 val = (val<<8) | (c[1]=hgetc(fd->fp));;
404 val = (val<<8) | (c[2]=hgetc(fd->fp));;
405 val = (val<<8) | (c[3]=hgetc(fd->fp));;
406 val = (val<<8) | (c[4]=hgetc(fd->fp));;
407 *val_p = val & ((1LL<<(3+4*8))-1);
408 *crc = crc32(*crc, c, 5);
409 return 5;
410
411 } else if (val < 0xfc) {
412 val = (val<<8) | (c[1]=hgetc(fd->fp));;
413 val = (val<<8) | (c[2]=hgetc(fd->fp));;
414 val = (val<<8) | (c[3]=hgetc(fd->fp));;
415 val = (val<<8) | (c[4]=hgetc(fd->fp));;
416 val = (val<<8) | (c[5]=hgetc(fd->fp));;
417 *val_p = val & ((1LL<<(2+5*8))-1);
418 *crc = crc32(*crc, c, 6);
419 return 6;
420
421 } else if (val < 0xfe) {
422 val = (val<<8) | (c[1]=hgetc(fd->fp));;
423 val = (val<<8) | (c[2]=hgetc(fd->fp));;
424 val = (val<<8) | (c[3]=hgetc(fd->fp));;
425 val = (val<<8) | (c[4]=hgetc(fd->fp));;
426 val = (val<<8) | (c[5]=hgetc(fd->fp));;
427 val = (val<<8) | (c[6]=hgetc(fd->fp));;
428 *val_p = val & ((1LL<<(1+6*8))-1);
429 *crc = crc32(*crc, c, 7);
430 return 7;
431
432 } else if (val < 0xff) {
433 val = (val<<8) | (c[1]=hgetc(fd->fp));;
434 val = (val<<8) | (c[2]=hgetc(fd->fp));;
435 val = (val<<8) | (c[3]=hgetc(fd->fp));;
436 val = (val<<8) | (c[4]=hgetc(fd->fp));;
437 val = (val<<8) | (c[5]=hgetc(fd->fp));;
438 val = (val<<8) | (c[6]=hgetc(fd->fp));;
439 val = (val<<8) | (c[7]=hgetc(fd->fp));;
440 *val_p = val & ((1LL<<(7*8))-1);
441 *crc = crc32(*crc, c, 8);
442 return 8;
443
444 } else {
445 val = (val<<8) | (c[1]=hgetc(fd->fp));;
446 val = (val<<8) | (c[2]=hgetc(fd->fp));;
447 val = (val<<8) | (c[3]=hgetc(fd->fp));;
448 val = (val<<8) | (c[4]=hgetc(fd->fp));;
449 val = (val<<8) | (c[5]=hgetc(fd->fp));;
450 val = (val<<8) | (c[6]=hgetc(fd->fp));;
451 val = (val<<8) | (c[7]=hgetc(fd->fp));;
452 val = (val<<8) | (c[8]=hgetc(fd->fp));;
453 *crc = crc32(*crc, c, 9);
454 *val_p = val;
455 }
456
457 return 9;
458 }
459
460 /*
461 * Pushes a value in ITF8 format onto the end of a block.
462 * This shouldn't be used for high-volume data as it is not the fastest
463 * method.
464 *
465 * Returns the number of bytes written
466 */
itf8_put_blk(cram_block * blk,int val)467 int itf8_put_blk(cram_block *blk, int val) {
468 char buf[5];
469 int sz;
470
471 sz = itf8_put(buf, val);
472 BLOCK_APPEND(blk, buf, sz);
473 return sz;
474 }
475
476 /*
477 * Decodes a 32-bit little endian value from fd and stores in val.
478 *
479 * Returns the number of bytes read on success
480 * -1 on failure
481 */
int32_decode(cram_fd * fd,int32_t * val)482 int int32_decode(cram_fd *fd, int32_t *val) {
483 int32_t i;
484 if (4 != hread(fd->fp, &i, 4))
485 return -1;
486
487 *val = le_int4(i);
488 return 4;
489 }
490
491 /*
492 * Encodes a 32-bit little endian value 'val' and writes to fd.
493 *
494 * Returns the number of bytes written on success
495 * -1 on failure
496 */
int32_encode(cram_fd * fd,int32_t val)497 int int32_encode(cram_fd *fd, int32_t val) {
498 val = le_int4(val);
499 if (4 != hwrite(fd->fp, &val, 4))
500 return -1;
501
502 return 4;
503 }
504
505 /* As int32_decoded/encode, but from/to blocks instead of cram_fd */
int32_get_blk(cram_block * b,int32_t * val)506 int int32_get_blk(cram_block *b, int32_t *val) {
507 if (b->uncomp_size - BLOCK_SIZE(b) < 4)
508 return -1;
509
510 *val =
511 b->data[b->byte ] |
512 (b->data[b->byte+1] << 8) |
513 (b->data[b->byte+2] << 16) |
514 (b->data[b->byte+3] << 24);
515 BLOCK_SIZE(b) += 4;
516 return 4;
517 }
518
519 /* As int32_decoded/encode, but from/to blocks instead of cram_fd */
int32_put_blk(cram_block * b,int32_t val)520 int int32_put_blk(cram_block *b, int32_t val) {
521 unsigned char cp[4];
522 cp[0] = ( val & 0xff);
523 cp[1] = ((val>>8) & 0xff);
524 cp[2] = ((val>>16) & 0xff);
525 cp[3] = ((val>>24) & 0xff);
526
527 BLOCK_APPEND(b, cp, 4);
528 return b->data ? 0 : -1;
529 }
530
531 /* ----------------------------------------------------------------------
532 * zlib compression code - from Gap5's tg_iface_g.c
533 * They're static here as they're only used within the cram_compress_block
534 * and cram_uncompress_block functions, which are the external interface.
535 */
zlib_mem_inflate(char * cdata,size_t csize,size_t * size)536 char *zlib_mem_inflate(char *cdata, size_t csize, size_t *size) {
537 z_stream s;
538 unsigned char *data = NULL; /* Uncompressed output */
539 int data_alloc = 0;
540 int err;
541
542 /* Starting point at uncompressed size, and scale after that */
543 data = malloc(data_alloc = csize*1.2+100);
544 if (!data)
545 return NULL;
546
547 /* Initialise zlib stream */
548 s.zalloc = Z_NULL; /* use default allocation functions */
549 s.zfree = Z_NULL;
550 s.opaque = Z_NULL;
551 s.next_in = (unsigned char *)cdata;
552 s.avail_in = csize;
553 s.total_in = 0;
554 s.next_out = data;
555 s.avail_out = data_alloc;
556 s.total_out = 0;
557
558 //err = inflateInit(&s);
559 err = inflateInit2(&s, 15 + 32);
560 if (err != Z_OK) {
561 hts_log_error("Call to zlib inflateInit failed: %s", s.msg);
562 free(data);
563 return NULL;
564 }
565
566 /* Decode to 'data' array */
567 for (;s.avail_in;) {
568 unsigned char *data_tmp;
569 int alloc_inc;
570
571 s.next_out = &data[s.total_out];
572 err = inflate(&s, Z_NO_FLUSH);
573 if (err == Z_STREAM_END)
574 break;
575
576 if (err != Z_OK) {
577 hts_log_error("Call to zlib inflate failed: %s", s.msg);
578 if (data)
579 free(data);
580 return NULL;
581 }
582
583 /* More to come, so realloc based on growth so far */
584 alloc_inc = (double)s.avail_in/s.total_in * s.total_out + 100;
585 data = realloc((data_tmp = data), data_alloc += alloc_inc);
586 if (!data) {
587 free(data_tmp);
588 return NULL;
589 }
590 s.avail_out += alloc_inc;
591 }
592 inflateEnd(&s);
593
594 *size = s.total_out;
595 return (char *)data;
596 }
597
zlib_mem_deflate(char * data,size_t size,size_t * cdata_size,int level,int strat)598 static char *zlib_mem_deflate(char *data, size_t size, size_t *cdata_size,
599 int level, int strat) {
600 z_stream s;
601 unsigned char *cdata = NULL; /* Compressed output */
602 int cdata_alloc = 0;
603 int cdata_pos = 0;
604 int err;
605
606 cdata = malloc(cdata_alloc = size*1.05+100);
607 if (!cdata)
608 return NULL;
609 cdata_pos = 0;
610
611 /* Initialise zlib stream */
612 s.zalloc = Z_NULL; /* use default allocation functions */
613 s.zfree = Z_NULL;
614 s.opaque = Z_NULL;
615 s.next_in = (unsigned char *)data;
616 s.avail_in = size;
617 s.total_in = 0;
618 s.next_out = cdata;
619 s.avail_out = cdata_alloc;
620 s.total_out = 0;
621 s.data_type = Z_BINARY;
622
623 err = deflateInit2(&s, level, Z_DEFLATED, 15|16, 9, strat);
624 if (err != Z_OK) {
625 hts_log_error("Call to zlib deflateInit2 failed: %s", s.msg);
626 return NULL;
627 }
628
629 /* Encode to 'cdata' array */
630 for (;s.avail_in;) {
631 s.next_out = &cdata[cdata_pos];
632 s.avail_out = cdata_alloc - cdata_pos;
633 if (cdata_alloc - cdata_pos <= 0) {
634 hts_log_error("Deflate produced larger output than expected");
635 return NULL;
636 }
637 err = deflate(&s, Z_NO_FLUSH);
638 cdata_pos = cdata_alloc - s.avail_out;
639 if (err != Z_OK) {
640 hts_log_error("Call to zlib deflate failed: %s", s.msg);
641 break;
642 }
643 }
644 if (deflate(&s, Z_FINISH) != Z_STREAM_END) {
645 hts_log_error("Call to zlib deflate failed: %s", s.msg);
646 }
647 *cdata_size = s.total_out;
648
649 if (deflateEnd(&s) != Z_OK) {
650 hts_log_error("Call to zlib deflate failed: %s", s.msg);
651 }
652 return (char *)cdata;
653 }
654
655 #ifdef HAVE_LIBLZMA
656 /* ------------------------------------------------------------------------ */
657 /*
658 * Data compression routines using liblzma (xz)
659 *
660 * On a test set this shrunk the main db from 136157104 bytes to 114796168, but
661 * caused tg_index to grow from 2m43.707s to 15m3.961s. Exporting as bfastq
662 * went from 18.3s to 36.3s. So decompression suffers too, but not as bad
663 * as compression times.
664 *
665 * For now we disable this functionality. If it's to be reenabled make sure you
666 * improve the mem_inflate implementation as it's just a test hack at the
667 * moment.
668 */
669
lzma_mem_deflate(char * data,size_t size,size_t * cdata_size,int level)670 static char *lzma_mem_deflate(char *data, size_t size, size_t *cdata_size,
671 int level) {
672 char *out;
673 size_t out_size = lzma_stream_buffer_bound(size);
674 *cdata_size = 0;
675
676 out = malloc(out_size);
677
678 /* Single call compression */
679 if (LZMA_OK != lzma_easy_buffer_encode(level, LZMA_CHECK_CRC32, NULL,
680 (uint8_t *)data, size,
681 (uint8_t *)out, cdata_size,
682 out_size))
683 return NULL;
684
685 return out;
686 }
687
lzma_mem_inflate(char * cdata,size_t csize,size_t * size)688 static char *lzma_mem_inflate(char *cdata, size_t csize, size_t *size) {
689 lzma_stream strm = LZMA_STREAM_INIT;
690 size_t out_size = 0, out_pos = 0;
691 char *out = NULL;
692 int r;
693
694 /* Initiate the decoder */
695 if (LZMA_OK != lzma_stream_decoder(&strm, lzma_easy_decoder_memusage(9), 0))
696 return NULL;
697
698 /* Decode loop */
699 strm.avail_in = csize;
700 strm.next_in = (uint8_t *)cdata;
701
702 for (;strm.avail_in;) {
703 if (strm.avail_in > out_size - out_pos) {
704 out_size += strm.avail_in * 4 + 32768;
705 out = realloc(out, out_size);
706 }
707 strm.avail_out = out_size - out_pos;
708 strm.next_out = (uint8_t *)&out[out_pos];
709
710 r = lzma_code(&strm, LZMA_RUN);
711 if (LZMA_OK != r && LZMA_STREAM_END != r) {
712 hts_log_error("LZMA decode failure (error %d)", r);
713 return NULL;
714 }
715
716 out_pos = strm.total_out;
717
718 if (r == LZMA_STREAM_END)
719 break;
720 }
721
722 /* finish up any unflushed data; necessary? */
723 r = lzma_code(&strm, LZMA_FINISH);
724 if (r != LZMA_OK && r != LZMA_STREAM_END) {
725 hts_log_error("Call to lzma_code failed with error %d", r);
726 return NULL;
727 }
728
729 out = realloc(out, strm.total_out);
730 *size = strm.total_out;
731
732 lzma_end(&strm);
733
734 return out;
735 }
736 #endif
737
738 /* ----------------------------------------------------------------------
739 * CRAM blocks - the dynamically growable data block. We have code to
740 * create, update, (un)compress and read/write.
741 *
742 * These are derived from the deflate_interlaced.c blocks, but with the
743 * CRAM extension of content types and IDs.
744 */
745
746 /*
747 * Allocates a new cram_block structure with a specified content_type and
748 * id.
749 *
750 * Returns block pointer on success
751 * NULL on failure
752 */
cram_new_block(enum cram_content_type content_type,int content_id)753 cram_block *cram_new_block(enum cram_content_type content_type,
754 int content_id) {
755 cram_block *b = malloc(sizeof(*b));
756 if (!b)
757 return NULL;
758 b->method = b->orig_method = RAW;
759 b->content_type = content_type;
760 b->content_id = content_id;
761 b->comp_size = 0;
762 b->uncomp_size = 0;
763 b->data = NULL;
764 b->alloc = 0;
765 b->byte = 0;
766 b->bit = 7; // MSB
767
768 return b;
769 }
770
771 /*
772 * Reads a block from a cram file.
773 * Returns cram_block pointer on success.
774 * NULL on failure
775 */
cram_read_block(cram_fd * fd)776 cram_block *cram_read_block(cram_fd *fd) {
777 cram_block *b = malloc(sizeof(*b));
778 unsigned char c;
779 uint32_t crc = 0;
780 if (!b)
781 return NULL;
782
783 //fprintf(stderr, "Block at %d\n", (int)ftell(fd->fp));
784
785 if (-1 == (b->method = hgetc(fd->fp))) { free(b); return NULL; }
786 c = b->method; crc = crc32(crc, &c, 1);
787 if (-1 == (b->content_type= hgetc(fd->fp))) { free(b); return NULL; }
788 c = b->content_type; crc = crc32(crc, &c, 1);
789 if (-1 == itf8_decode_crc(fd, &b->content_id, &crc)) { free(b); return NULL; }
790 if (-1 == itf8_decode_crc(fd, &b->comp_size, &crc)) { free(b); return NULL; }
791 if (-1 == itf8_decode_crc(fd, &b->uncomp_size, &crc)) { free(b); return NULL; }
792
793 //fprintf(stderr, " method %d, ctype %d, cid %d, csize %d, ucsize %d\n",
794 // b->method, b->content_type, b->content_id, b->comp_size, b->uncomp_size);
795
796 if (b->method == RAW) {
797 if (b->uncomp_size < 0 || b->comp_size != b->uncomp_size) {
798 free(b);
799 return NULL;
800 }
801 b->alloc = b->uncomp_size;
802 if (!(b->data = malloc(b->uncomp_size))){ free(b); return NULL; }
803 if (b->uncomp_size != hread(fd->fp, b->data, b->uncomp_size)) {
804 free(b->data);
805 free(b);
806 return NULL;
807 }
808 } else {
809 if (b->comp_size < 0) { free(b); return NULL; }
810 b->alloc = b->comp_size;
811 if (!(b->data = malloc(b->comp_size))) { free(b); return NULL; }
812 if (b->comp_size != hread(fd->fp, b->data, b->comp_size)) {
813 free(b->data);
814 free(b);
815 return NULL;
816 }
817 }
818
819 if (CRAM_MAJOR_VERS(fd->version) >= 3) {
820 if (-1 == int32_decode(fd, (int32_t *)&b->crc32)) {
821 free(b);
822 return NULL;
823 }
824
825 crc = crc32(crc, b->data ? b->data : (uc *)"", b->alloc);
826 if (crc != b->crc32) {
827 hts_log_error("Block CRC32 failure");
828 free(b->data);
829 free(b);
830 return NULL;
831 }
832 }
833
834 b->orig_method = b->method;
835 b->idx = 0;
836 b->byte = 0;
837 b->bit = 7; // MSB
838
839 return b;
840 }
841
842
843 /*
844 * Computes the size of a cram block, including the block
845 * header itself.
846 */
cram_block_size(cram_block * b)847 uint32_t cram_block_size(cram_block *b) {
848 unsigned char dat[100], *cp = dat;;
849 uint32_t sz;
850
851 *cp++ = b->method;
852 *cp++ = b->content_type;
853 cp += itf8_put((char*)cp, b->content_id);
854 cp += itf8_put((char*)cp, b->comp_size);
855 cp += itf8_put((char*)cp, b->uncomp_size);
856
857 sz = cp-dat + 4;
858 sz += b->method == RAW ? b->uncomp_size : b->comp_size;
859
860 return sz;
861 }
862
863 /*
864 * Writes a CRAM block.
865 * Returns 0 on success
866 * -1 on failure
867 */
cram_write_block(cram_fd * fd,cram_block * b)868 int cram_write_block(cram_fd *fd, cram_block *b) {
869 assert(b->method != RAW || (b->comp_size == b->uncomp_size));
870
871 if (hputc(b->method, fd->fp) == EOF) return -1;
872 if (hputc(b->content_type, fd->fp) == EOF) return -1;
873 if (itf8_encode(fd, b->content_id) == -1) return -1;
874 if (itf8_encode(fd, b->comp_size) == -1) return -1;
875 if (itf8_encode(fd, b->uncomp_size) == -1) return -1;
876
877 if (b->data) {
878 if (b->method == RAW) {
879 if (b->uncomp_size != hwrite(fd->fp, b->data, b->uncomp_size))
880 return -1;
881 } else {
882 if (b->comp_size != hwrite(fd->fp, b->data, b->comp_size))
883 return -1;
884 }
885 } else {
886 // Absent blocks should be size 0
887 assert(b->method == RAW && b->uncomp_size == 0);
888 }
889
890 if (CRAM_MAJOR_VERS(fd->version) >= 3) {
891 unsigned char dat[100], *cp = dat;;
892 uint32_t crc;
893
894 *cp++ = b->method;
895 *cp++ = b->content_type;
896 cp += itf8_put((char*)cp, b->content_id);
897 cp += itf8_put((char*)cp, b->comp_size);
898 cp += itf8_put((char*)cp, b->uncomp_size);
899 crc = crc32(0L, dat, cp-dat);
900
901 if (b->method == RAW) {
902 b->crc32 = crc32(crc, b->data ? b->data : (uc*)"", b->uncomp_size);
903 } else {
904 b->crc32 = crc32(crc, b->data ? b->data : (uc*)"", b->comp_size);
905 }
906
907 if (-1 == int32_encode(fd, b->crc32))
908 return -1;
909 }
910
911 return 0;
912 }
913
914 /*
915 * Frees a CRAM block, deallocating internal data too.
916 */
cram_free_block(cram_block * b)917 void cram_free_block(cram_block *b) {
918 if (!b)
919 return;
920 if (b->data)
921 free(b->data);
922 free(b);
923 }
924
925 /*
926 * Uncompresses a CRAM block, if compressed.
927 */
cram_uncompress_block(cram_block * b)928 int cram_uncompress_block(cram_block *b) {
929 char *uncomp;
930 size_t uncomp_size = 0;
931
932 if (b->uncomp_size == 0) {
933 // blank block
934 b->method = RAW;
935 return 0;
936 }
937
938 switch (b->method) {
939 case RAW:
940 return 0;
941
942 case GZIP:
943 uncomp = zlib_mem_inflate((char *)b->data, b->comp_size, &uncomp_size);
944 if (!uncomp)
945 return -1;
946 if ((int)uncomp_size != b->uncomp_size) {
947 free(uncomp);
948 return -1;
949 }
950 free(b->data);
951 b->data = (unsigned char *)uncomp;
952 b->alloc = uncomp_size;
953 b->method = RAW;
954 break;
955
956 #ifdef HAVE_LIBBZ2
957 case BZIP2: {
958 unsigned int usize = b->uncomp_size;
959 if (!(uncomp = malloc(usize)))
960 return -1;
961 if (BZ_OK != BZ2_bzBuffToBuffDecompress(uncomp, &usize,
962 (char *)b->data, b->comp_size,
963 0, 0)) {
964 free(uncomp);
965 return -1;
966 }
967 free(b->data);
968 b->data = (unsigned char *)uncomp;
969 b->alloc = usize;
970 b->method = RAW;
971 b->uncomp_size = usize; // Just incase it differs
972 break;
973 }
974 #else
975 case BZIP2:
976 hts_log_error("Bzip2 compression is not compiled into this version. Please rebuild and try again");
977 return -1;
978 #endif
979
980 #ifdef HAVE_LIBLZMA
981 case LZMA:
982 uncomp = lzma_mem_inflate((char *)b->data, b->comp_size, &uncomp_size);
983 if (!uncomp)
984 return -1;
985 if ((int)uncomp_size != b->uncomp_size)
986 return -1;
987 free(b->data);
988 b->data = (unsigned char *)uncomp;
989 b->alloc = uncomp_size;
990 b->method = RAW;
991 break;
992 #else
993 case LZMA:
994 hts_log_error("Lzma compression is not compiled into this version. Please rebuild and try again");
995 return -1;
996 break;
997 #endif
998
999 case RANS: {
1000 unsigned int usize = b->uncomp_size, usize2;
1001 uncomp = (char *)rans_uncompress(b->data, b->comp_size, &usize2);
1002 if (!uncomp || usize != usize2)
1003 return -1;
1004 free(b->data);
1005 b->data = (unsigned char *)uncomp;
1006 b->alloc = usize2;
1007 b->method = RAW;
1008 b->uncomp_size = usize2; // Just incase it differs
1009 //fprintf(stderr, "Expanded %d to %d\n", b->comp_size, b->uncomp_size);
1010 break;
1011 }
1012
1013 default:
1014 return -1;
1015 }
1016
1017 return 0;
1018 }
1019
cram_compress_by_method(char * in,size_t in_size,size_t * out_size,enum cram_block_method method,int level,int strat)1020 static char *cram_compress_by_method(char *in, size_t in_size,
1021 size_t *out_size,
1022 enum cram_block_method method,
1023 int level, int strat) {
1024 switch (method) {
1025 case GZIP:
1026 return zlib_mem_deflate(in, in_size, out_size, level, strat);
1027
1028 case BZIP2: {
1029 #ifdef HAVE_LIBBZ2
1030 unsigned int comp_size = in_size*1.01 + 600;
1031 char *comp = malloc(comp_size);
1032 if (!comp)
1033 return NULL;
1034
1035 if (BZ_OK != BZ2_bzBuffToBuffCompress(comp, &comp_size,
1036 in, in_size,
1037 level, 0, 30)) {
1038 free(comp);
1039 return NULL;
1040 }
1041 *out_size = comp_size;
1042 return comp;
1043 #else
1044 return NULL;
1045 #endif
1046 }
1047
1048 case LZMA:
1049 #ifdef HAVE_LIBLZMA
1050 return lzma_mem_deflate(in, in_size, out_size, level);
1051 #else
1052 return NULL;
1053 #endif
1054
1055 case RANS0: {
1056 unsigned int out_size_i;
1057 unsigned char *cp;
1058 cp = rans_compress((unsigned char *)in, in_size, &out_size_i, 0);
1059 *out_size = out_size_i;
1060 return (char *)cp;
1061 }
1062
1063 case RANS1: {
1064 unsigned int out_size_i;
1065 unsigned char *cp;
1066
1067 cp = rans_compress((unsigned char *)in, in_size, &out_size_i, 1);
1068 *out_size = out_size_i;
1069 return (char *)cp;
1070 }
1071
1072 case RAW:
1073 break;
1074
1075 default:
1076 return NULL;
1077 }
1078
1079 return NULL;
1080 }
1081
1082
1083 /*
1084 * Compresses a block using one of two different zlib strategies. If we only
1085 * want one choice set strat2 to be -1.
1086 *
1087 * The logic here is that sometimes Z_RLE does a better job than Z_FILTERED
1088 * or Z_DEFAULT_STRATEGY on quality data. If so, we'd rather use it as it is
1089 * significantly faster.
1090 *
1091 * Method and level -1 implies defaults, as specified in cram_fd.
1092 */
cram_compress_block(cram_fd * fd,cram_block * b,cram_metrics * metrics,int method,int level)1093 int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics,
1094 int method, int level) {
1095
1096 char *comp = NULL;
1097 size_t comp_size = 0;
1098 int strat;
1099
1100 if (b->method != RAW) {
1101 // Maybe already compressed if s->block[0] was compressed and
1102 // we have e.g. s->block[DS_BA] set to s->block[0] due to only
1103 // one base type present and hence using E_HUFFMAN on block 0.
1104 // A second explicit attempt to compress the same block then
1105 // occurs.
1106 return 0;
1107 }
1108
1109 if (method == -1) {
1110 method = 1<<GZIP;
1111 if (fd->use_bz2)
1112 method |= 1<<BZIP2;
1113 if (fd->use_lzma)
1114 method |= 1<<LZMA;
1115 }
1116
1117 if (level == -1)
1118 level = fd->level;
1119
1120 //fprintf(stderr, "IN: block %d, sz %d\n", b->content_id, b->uncomp_size);
1121
1122 if (method == RAW || level == 0 || b->uncomp_size == 0) {
1123 b->method = RAW;
1124 b->comp_size = b->uncomp_size;
1125 //fprintf(stderr, "Skip block id %d\n", b->content_id);
1126 return 0;
1127 }
1128
1129 if (metrics) {
1130 pthread_mutex_lock(&fd->metrics_lock);
1131 if (metrics->trial > 0 || --metrics->next_trial <= 0) {
1132 size_t sz_best = INT_MAX;
1133 size_t sz_gz_rle = 0;
1134 size_t sz_gz_def = 0;
1135 size_t sz_rans0 = 0;
1136 size_t sz_rans1 = 0;
1137 size_t sz_bzip2 = 0;
1138 size_t sz_lzma = 0;
1139 int method_best = 0;
1140 char *c_best = NULL, *c = NULL;
1141
1142 if (metrics->revised_method)
1143 method = metrics->revised_method;
1144 else
1145 metrics->revised_method = method;
1146
1147 if (metrics->next_trial <= 0) {
1148 metrics->next_trial = TRIAL_SPAN;
1149 metrics->trial = NTRIALS;
1150 metrics->sz_gz_rle /= 2;
1151 metrics->sz_gz_def /= 2;
1152 metrics->sz_rans0 /= 2;
1153 metrics->sz_rans1 /= 2;
1154 metrics->sz_bzip2 /= 2;
1155 metrics->sz_lzma /= 2;
1156 }
1157
1158 pthread_mutex_unlock(&fd->metrics_lock);
1159
1160 if (method & (1<<GZIP_RLE)) {
1161 c = cram_compress_by_method((char *)b->data, b->uncomp_size,
1162 &sz_gz_rle, GZIP, 1, Z_RLE);
1163 if (c && sz_best > sz_gz_rle) {
1164 sz_best = sz_gz_rle;
1165 method_best = GZIP_RLE;
1166 if (c_best)
1167 free(c_best);
1168 c_best = c;
1169 } else if (c) {
1170 free(c);
1171 } else {
1172 sz_gz_rle = b->uncomp_size*2+1000;
1173 }
1174
1175 //fprintf(stderr, "Block %d; %d->%d\n", b->content_id, b->uncomp_size, sz_gz_rle);
1176 }
1177
1178 if (method & (1<<GZIP)) {
1179 c = cram_compress_by_method((char *)b->data, b->uncomp_size,
1180 &sz_gz_def, GZIP, level,
1181 Z_FILTERED);
1182 if (c && sz_best > sz_gz_def) {
1183 sz_best = sz_gz_def;
1184 method_best = GZIP;
1185 if (c_best)
1186 free(c_best);
1187 c_best = c;
1188 } else if (c) {
1189 free(c);
1190 } else {
1191 sz_gz_def = b->uncomp_size*2+1000;
1192 }
1193
1194 //fprintf(stderr, "Block %d; %d->%d\n", b->content_id, b->uncomp_size, sz_gz_def);
1195 }
1196
1197 if (method & (1<<RANS0)) {
1198 c = cram_compress_by_method((char *)b->data, b->uncomp_size,
1199 &sz_rans0, RANS0, 0, 0);
1200 if (c && sz_best > sz_rans0) {
1201 sz_best = sz_rans0;
1202 method_best = RANS0;
1203 if (c_best)
1204 free(c_best);
1205 c_best = c;
1206 } else if (c) {
1207 free(c);
1208 } else {
1209 sz_rans0 = b->uncomp_size*2+1000;
1210 }
1211 }
1212
1213 if (method & (1<<RANS1)) {
1214 c = cram_compress_by_method((char *)b->data, b->uncomp_size,
1215 &sz_rans1, RANS1, 0, 0);
1216 if (c && sz_best > sz_rans1) {
1217 sz_best = sz_rans1;
1218 method_best = RANS1;
1219 if (c_best)
1220 free(c_best);
1221 c_best = c;
1222 } else if (c) {
1223 free(c);
1224 } else {
1225 sz_rans1 = b->uncomp_size*2+1000;
1226 }
1227 }
1228
1229 if (method & (1<<BZIP2)) {
1230 c = cram_compress_by_method((char *)b->data, b->uncomp_size,
1231 &sz_bzip2, BZIP2, level, 0);
1232 if (c && sz_best > sz_bzip2) {
1233 sz_best = sz_bzip2;
1234 method_best = BZIP2;
1235 if (c_best)
1236 free(c_best);
1237 c_best = c;
1238 } else if (c) {
1239 free(c);
1240 } else {
1241 sz_bzip2 = b->uncomp_size*2+1000;
1242 }
1243 }
1244
1245 if (method & (1<<LZMA)) {
1246 c = cram_compress_by_method((char *)b->data, b->uncomp_size,
1247 &sz_lzma, LZMA, level, 0);
1248 if (c && sz_best > sz_lzma) {
1249 sz_best = sz_lzma;
1250 method_best = LZMA;
1251 if (c_best)
1252 free(c_best);
1253 c_best = c;
1254 } else if (c) {
1255 free(c);
1256 } else {
1257 sz_lzma = b->uncomp_size*2+1000;
1258 }
1259 }
1260
1261 //fprintf(stderr, "sz_best = %d\n", sz_best);
1262
1263 free(b->data);
1264 b->data = (unsigned char *)c_best;
1265 //printf("method_best = %s\n", cram_block_method2str(method_best));
1266 b->method = method_best == GZIP_RLE ? GZIP : method_best;
1267 b->comp_size = sz_best;
1268
1269 pthread_mutex_lock(&fd->metrics_lock);
1270 metrics->sz_gz_rle += sz_gz_rle;
1271 metrics->sz_gz_def += sz_gz_def;
1272 metrics->sz_rans0 += sz_rans0;
1273 metrics->sz_rans1 += sz_rans1;
1274 metrics->sz_bzip2 += sz_bzip2;
1275 metrics->sz_lzma += sz_lzma;
1276 if (--metrics->trial == 0) {
1277 int best_method = RAW;
1278 int best_sz = INT_MAX;
1279
1280 // Scale methods by cost
1281 if (fd->level <= 3) {
1282 metrics->sz_rans1 *= 1.02;
1283 metrics->sz_gz_def *= 1.04;
1284 metrics->sz_bzip2 *= 1.08;
1285 metrics->sz_lzma *= 1.10;
1286 } else if (fd->level <= 6) {
1287 metrics->sz_rans1 *= 1.01;
1288 metrics->sz_gz_def *= 1.02;
1289 metrics->sz_bzip2 *= 1.03;
1290 metrics->sz_lzma *= 1.05;
1291 }
1292
1293 if (method & (1<<GZIP_RLE) && best_sz > metrics->sz_gz_rle)
1294 best_sz = metrics->sz_gz_rle, best_method = GZIP_RLE;
1295
1296 if (method & (1<<GZIP) && best_sz > metrics->sz_gz_def)
1297 best_sz = metrics->sz_gz_def, best_method = GZIP;
1298
1299 if (method & (1<<RANS0) && best_sz > metrics->sz_rans0)
1300 best_sz = metrics->sz_rans0, best_method = RANS0;
1301
1302 if (method & (1<<RANS1) && best_sz > metrics->sz_rans1)
1303 best_sz = metrics->sz_rans1, best_method = RANS1;
1304
1305 if (method & (1<<BZIP2) && best_sz > metrics->sz_bzip2)
1306 best_sz = metrics->sz_bzip2, best_method = BZIP2;
1307
1308 if (method & (1<<LZMA) && best_sz > metrics->sz_lzma)
1309 best_sz = metrics->sz_lzma, best_method = LZMA;
1310
1311 if (best_method == GZIP_RLE) {
1312 metrics->method = GZIP;
1313 metrics->strat = Z_RLE;
1314 } else {
1315 metrics->method = best_method;
1316 metrics->strat = Z_FILTERED;
1317 }
1318
1319 // If we see at least MAXFAIL trials in a row for a specific
1320 // compression method with more than MAXDELTA aggregate
1321 // size then we drop this from the list of methods used
1322 // for this block type.
1323 #define MAXDELTA 0.20
1324 #define MAXFAILS 4
1325 if (best_method == GZIP_RLE) {
1326 metrics->gz_rle_cnt = 0;
1327 metrics->gz_rle_extra = 0;
1328 } else if (best_sz < metrics->sz_gz_rle) {
1329 double r = (double)metrics->sz_gz_rle / best_sz - 1;
1330 if (++metrics->gz_rle_cnt >= MAXFAILS &&
1331 (metrics->gz_rle_extra += r) >= MAXDELTA)
1332 method &= ~(1<<GZIP_RLE);
1333 }
1334
1335 if (best_method == GZIP) {
1336 metrics->gz_def_cnt = 0;
1337 metrics->gz_def_extra = 0;
1338 } else if (best_sz < metrics->sz_gz_def) {
1339 double r = (double)metrics->sz_gz_def / best_sz - 1;
1340 if (++metrics->gz_def_cnt >= MAXFAILS &&
1341 (metrics->gz_def_extra += r) >= MAXDELTA)
1342 method &= ~(1<<GZIP);
1343 }
1344
1345 if (best_method == RANS0) {
1346 metrics->rans0_cnt = 0;
1347 metrics->rans0_extra = 0;
1348 } else if (best_sz < metrics->sz_rans0) {
1349 double r = (double)metrics->sz_rans0 / best_sz - 1;
1350 if (++metrics->rans0_cnt >= MAXFAILS &&
1351 (metrics->rans0_extra += r) >= MAXDELTA)
1352 method &= ~(1<<RANS0);
1353 }
1354
1355 if (best_method == RANS1) {
1356 metrics->rans1_cnt = 0;
1357 metrics->rans1_extra = 0;
1358 } else if (best_sz < metrics->sz_rans1) {
1359 double r = (double)metrics->sz_rans1 / best_sz - 1;
1360 if (++metrics->rans1_cnt >= MAXFAILS &&
1361 (metrics->rans1_extra += r) >= MAXDELTA)
1362 method &= ~(1<<RANS1);
1363 }
1364
1365 if (best_method == BZIP2) {
1366 metrics->bzip2_cnt = 0;
1367 metrics->bzip2_extra = 0;
1368 } else if (best_sz < metrics->sz_bzip2) {
1369 double r = (double)metrics->sz_bzip2 / best_sz - 1;
1370 if (++metrics->bzip2_cnt >= MAXFAILS &&
1371 (metrics->bzip2_extra += r) >= MAXDELTA)
1372 method &= ~(1<<BZIP2);
1373 }
1374
1375 if (best_method == LZMA) {
1376 metrics->lzma_cnt = 0;
1377 metrics->lzma_extra = 0;
1378 } else if (best_sz < metrics->sz_lzma) {
1379 double r = (double)metrics->sz_lzma / best_sz - 1;
1380 if (++metrics->lzma_cnt >= MAXFAILS &&
1381 (metrics->lzma_extra += r) >= MAXDELTA)
1382 method &= ~(1<<LZMA);
1383 }
1384
1385 //if (method != metrics->revised_method)
1386 // fprintf(stderr, "%d: method from %x to %x\n",
1387 // b->content_id, metrics->revised_method, method);
1388 metrics->revised_method = method;
1389 }
1390 pthread_mutex_unlock(&fd->metrics_lock);
1391 } else {
1392 strat = metrics->strat;
1393 method = metrics->method;
1394
1395 pthread_mutex_unlock(&fd->metrics_lock);
1396 comp = cram_compress_by_method((char *)b->data, b->uncomp_size,
1397 &comp_size, method,
1398 level, strat);
1399 if (!comp)
1400 return -1;
1401 free(b->data);
1402 b->data = (unsigned char *)comp;
1403 b->comp_size = comp_size;
1404 b->method = method;
1405 }
1406
1407 } else {
1408 // no cached metrics, so just do zlib?
1409 comp = cram_compress_by_method((char *)b->data, b->uncomp_size,
1410 &comp_size, GZIP, level, Z_FILTERED);
1411 if (!comp) {
1412 hts_log_error("Compression failed");
1413 return -1;
1414 }
1415 free(b->data);
1416 b->data = (unsigned char *)comp;
1417 b->comp_size = comp_size;
1418 b->method = GZIP;
1419 }
1420
1421 hts_log_info("Compressed block ID %d from %d to %d by method %s",
1422 b->content_id, b->uncomp_size, b->comp_size,
1423 cram_block_method2str(b->method));
1424
1425 if (b->method == RANS1)
1426 b->method = RANS0; // Spec just has RANS (not 0/1) with auto-sensing
1427
1428 return 0;
1429 }
1430
cram_new_metrics(void)1431 cram_metrics *cram_new_metrics(void) {
1432 cram_metrics *m = calloc(1, sizeof(*m));
1433 if (!m)
1434 return NULL;
1435 m->trial = NTRIALS-1;
1436 m->next_trial = TRIAL_SPAN;
1437 m->method = RAW;
1438 m->strat = 0;
1439 m->revised_method = 0;
1440
1441 return m;
1442 }
1443
cram_block_method2str(enum cram_block_method m)1444 char *cram_block_method2str(enum cram_block_method m) {
1445 switch(m) {
1446 case RAW: return "RAW";
1447 case GZIP: return "GZIP";
1448 case BZIP2: return "BZIP2";
1449 case LZMA: return "LZMA";
1450 case RANS0: return "RANS0";
1451 case RANS1: return "RANS1";
1452 case GZIP_RLE: return "GZIP_RLE";
1453 case BM_ERROR: break;
1454 }
1455 return "?";
1456 }
1457
cram_content_type2str(enum cram_content_type t)1458 char *cram_content_type2str(enum cram_content_type t) {
1459 switch (t) {
1460 case FILE_HEADER: return "FILE_HEADER";
1461 case COMPRESSION_HEADER: return "COMPRESSION_HEADER";
1462 case MAPPED_SLICE: return "MAPPED_SLICE";
1463 case UNMAPPED_SLICE: return "UNMAPPED_SLICE";
1464 case EXTERNAL: return "EXTERNAL";
1465 case CORE: return "CORE";
1466 case CT_ERROR: break;
1467 }
1468 return "?";
1469 }
1470
1471 /* ----------------------------------------------------------------------
1472 * Reference sequence handling
1473 *
1474 * These revolve around the refs_t structure, which may potentially be
1475 * shared between multiple cram_fd.
1476 *
1477 * We start with refs_create() to allocate an empty refs_t and then
1478 * populate it with @SQ line data using refs_from_header(). This is done on
1479 * cram_open(). Also at start up we can call cram_load_reference() which
1480 * is used with "scramble -r foo.fa". This replaces the fd->refs with the
1481 * new one specified. In either case refs2id() is then called which
1482 * maps ref_entry names to @SQ ids (refs_t->ref_id[]).
1483 *
1484 * Later, possibly within a thread, we will want to know the actual ref
1485 * seq itself, obtained by calling cram_get_ref(). This may use the
1486 * UR: or M5: fields or the filename specified in the original
1487 * cram_load_reference() call.
1488 *
1489 * Given the potential for multi-threaded reference usage, we have
1490 * reference counting (sorry for the confusing double use of "ref") to
1491 * track the number of callers interested in any specific reference.
1492 */
1493
1494 /*
1495 * Frees/unmaps a reference sequence and associated file handles.
1496 */
ref_entry_free_seq(ref_entry * e)1497 static void ref_entry_free_seq(ref_entry *e) {
1498 if (e->mf)
1499 mfclose(e->mf);
1500 if (e->seq && !e->mf)
1501 free(e->seq);
1502
1503 e->seq = NULL;
1504 e->mf = NULL;
1505 }
1506
refs_free(refs_t * r)1507 void refs_free(refs_t *r) {
1508 RP("refs_free()\n");
1509
1510 if (--r->count > 0)
1511 return;
1512
1513 if (!r)
1514 return;
1515
1516 if (r->pool)
1517 string_pool_destroy(r->pool);
1518
1519 if (r->h_meta) {
1520 khint_t k;
1521
1522 for (k = kh_begin(r->h_meta); k != kh_end(r->h_meta); k++) {
1523 ref_entry *e;
1524
1525 if (!kh_exist(r->h_meta, k))
1526 continue;
1527 if (!(e = kh_val(r->h_meta, k)))
1528 continue;
1529 ref_entry_free_seq(e);
1530 free(e);
1531 }
1532
1533 kh_destroy(refs, r->h_meta);
1534 }
1535
1536 if (r->ref_id)
1537 free(r->ref_id);
1538
1539 if (r->fp)
1540 bgzf_close(r->fp);
1541
1542 pthread_mutex_destroy(&r->lock);
1543
1544 free(r);
1545 }
1546
refs_create(void)1547 static refs_t *refs_create(void) {
1548 refs_t *r = calloc(1, sizeof(*r));
1549
1550 RP("refs_create()\n");
1551
1552 if (!r)
1553 return NULL;
1554
1555 if (!(r->pool = string_pool_create(8192)))
1556 goto err;
1557
1558 r->ref_id = NULL; // see refs2id() to populate.
1559 r->count = 1;
1560 r->last = NULL;
1561 r->last_id = -1;
1562
1563 if (!(r->h_meta = kh_init(refs)))
1564 goto err;
1565
1566 pthread_mutex_init(&r->lock, NULL);
1567
1568 return r;
1569
1570 err:
1571 refs_free(r);
1572 return NULL;
1573 }
1574
1575 /*
1576 * Opens a reference fasta file as a BGZF stream, allowing for
1577 * compressed files. It automatically builds a .fai file if
1578 * required and if compressed a .gzi bgzf index too.
1579 *
1580 * Returns a BGZF handle on success;
1581 * NULL on failure.
1582 */
bgzf_open_ref(char * fn,char * mode,int is_md5)1583 static BGZF *bgzf_open_ref(char *fn, char *mode, int is_md5) {
1584 BGZF *fp;
1585
1586 if (!is_md5) {
1587 char fai_file[PATH_MAX];
1588
1589 snprintf(fai_file, PATH_MAX, "%s.fai", fn);
1590 if (access(fai_file, R_OK) != 0)
1591 if (fai_build(fn) != 0)
1592 return NULL;
1593 }
1594
1595 if (!(fp = bgzf_open(fn, mode))) {
1596 perror(fn);
1597 return NULL;
1598 }
1599
1600 if (fp->is_compressed == 1 && bgzf_index_load(fp, fn, ".gzi") < 0) {
1601 hts_log_error("Unable to load .gzi index '%s.gzi'", fn);
1602 bgzf_close(fp);
1603 return NULL;
1604 }
1605
1606 return fp;
1607 }
1608
1609 /*
1610 * Loads a FAI file for a reference.fasta.
1611 * "is_err" indicates whether failure to load is worthy of emitting an
1612 * error message. In some cases (eg with embedded references) we
1613 * speculatively load, just incase, and silently ignore errors.
1614 *
1615 * Returns the refs_t struct on success (maybe newly allocated);
1616 * NULL on failure
1617 */
refs_load_fai(refs_t * r_orig,char * fn,int is_err)1618 static refs_t *refs_load_fai(refs_t *r_orig, char *fn, int is_err) {
1619 struct stat sb;
1620 FILE *fp = NULL;
1621 char fai_fn[PATH_MAX];
1622 char line[8192];
1623 refs_t *r = r_orig;
1624 size_t fn_l = strlen(fn);
1625 int id = 0, id_alloc = 0;
1626
1627 RP("refs_load_fai %s\n", fn);
1628
1629 if (!r)
1630 if (!(r = refs_create()))
1631 goto err;
1632
1633 /* Open reference, for later use */
1634 if (stat(fn, &sb) != 0) {
1635 if (is_err)
1636 perror(fn);
1637 goto err;
1638 }
1639
1640 if (r->fp)
1641 if (bgzf_close(r->fp) != 0)
1642 goto err;
1643 r->fp = NULL;
1644
1645 if (!(r->fn = string_dup(r->pool, fn)))
1646 goto err;
1647
1648 if (fn_l > 4 && strcmp(&fn[fn_l-4], ".fai") == 0)
1649 r->fn[fn_l-4] = 0;
1650
1651 if (!(r->fp = bgzf_open_ref(r->fn, "r", 0)))
1652 goto err;
1653
1654 /* Parse .fai file and load meta-data */
1655 sprintf(fai_fn, "%.*s.fai", PATH_MAX-5, r->fn);
1656
1657 if (stat(fai_fn, &sb) != 0) {
1658 if (is_err)
1659 perror(fai_fn);
1660 goto err;
1661 }
1662 if (!(fp = fopen(fai_fn, "r"))) {
1663 if (is_err)
1664 perror(fai_fn);
1665 goto err;
1666 }
1667 while (fgets(line, 8192, fp) != NULL) {
1668 ref_entry *e = malloc(sizeof(*e));
1669 char *cp;
1670 int n;
1671 khint_t k;
1672
1673 if (!e)
1674 return NULL;
1675
1676 // id
1677 for (cp = line; *cp && !isspace_c(*cp); cp++)
1678 ;
1679 *cp++ = 0;
1680 e->name = string_dup(r->pool, line);
1681
1682 // length
1683 while (*cp && isspace_c(*cp))
1684 cp++;
1685 e->length = strtoll(cp, &cp, 10);
1686
1687 // offset
1688 while (*cp && isspace_c(*cp))
1689 cp++;
1690 e->offset = strtoll(cp, &cp, 10);
1691
1692 // bases per line
1693 while (*cp && isspace_c(*cp))
1694 cp++;
1695 e->bases_per_line = strtol(cp, &cp, 10);
1696
1697 // line length
1698 while (*cp && isspace_c(*cp))
1699 cp++;
1700 e->line_length = strtol(cp, &cp, 10);
1701
1702 // filename
1703 e->fn = r->fn;
1704
1705 e->count = 0;
1706 e->seq = NULL;
1707 e->mf = NULL;
1708 e->is_md5 = 0;
1709
1710 k = kh_put(refs, r->h_meta, e->name, &n);
1711 if (-1 == n) {
1712 free(e);
1713 return NULL;
1714 }
1715
1716 if (n) {
1717 kh_val(r->h_meta, k) = e;
1718 } else {
1719 ref_entry *re = kh_val(r->h_meta, k);
1720 if (re && (re->count != 0 || re->length != 0)) {
1721 /* Keep old */
1722 free(e);
1723 } else {
1724 /* Replace old */
1725 if (re)
1726 free(re);
1727 kh_val(r->h_meta, k) = e;
1728 }
1729 }
1730
1731 if (id >= id_alloc) {
1732 int x;
1733
1734 id_alloc = id_alloc ?id_alloc*2 : 16;
1735 r->ref_id = realloc(r->ref_id, id_alloc * sizeof(*r->ref_id));
1736
1737 for (x = id; x < id_alloc; x++)
1738 r->ref_id[x] = NULL;
1739 }
1740 r->ref_id[id] = e;
1741 r->nref = ++id;
1742 }
1743
1744 fclose(fp);
1745 return r;
1746
1747 err:
1748 if (fp)
1749 fclose(fp);
1750
1751 if (!r_orig)
1752 refs_free(r);
1753
1754 return NULL;
1755 }
1756
1757 /*
1758 * Verifies that the CRAM @SQ lines and .fai files match.
1759 */
sanitise_SQ_lines(cram_fd * fd)1760 static void sanitise_SQ_lines(cram_fd *fd) {
1761 int i;
1762
1763 if (!fd->header)
1764 return;
1765
1766 if (!fd->refs || !fd->refs->h_meta)
1767 return;
1768
1769 for (i = 0; i < fd->header->nref; i++) {
1770 char *name = fd->header->ref[i].name;
1771 khint_t k = kh_get(refs, fd->refs->h_meta, name);
1772 ref_entry *r;
1773
1774 // We may have @SQ lines which have no known .fai, but do not
1775 // in themselves pose a problem because they are unused in the file.
1776 if (k == kh_end(fd->refs->h_meta))
1777 continue;
1778
1779 if (!(r = (ref_entry *)kh_val(fd->refs->h_meta, k)))
1780 continue;
1781
1782 if (r->length && r->length != fd->header->ref[i].len) {
1783 assert(strcmp(r->name, fd->header->ref[i].name) == 0);
1784
1785 // Should we also check MD5sums here to ensure the correct
1786 // reference was given?
1787 hts_log_warning("Header @SQ length mismatch for ref %s, %d vs %d",
1788 r->name, fd->header->ref[i].len, (int)r->length);
1789
1790 // Fixing the parsed @SQ header will make MD:Z: strings work
1791 // and also stop it producing N for the sequence.
1792 fd->header->ref[i].len = r->length;
1793 }
1794 }
1795 }
1796
1797 /*
1798 * Indexes references by the order they appear in a BAM file. This may not
1799 * necessarily be the same order they appear in the fasta reference file.
1800 *
1801 * Returns 0 on success
1802 * -1 on failure
1803 */
refs2id(refs_t * r,SAM_hdr * h)1804 int refs2id(refs_t *r, SAM_hdr *h) {
1805 int i;
1806
1807 if (r->ref_id)
1808 free(r->ref_id);
1809 if (r->last)
1810 r->last = NULL;
1811
1812 r->ref_id = calloc(h->nref, sizeof(*r->ref_id));
1813 if (!r->ref_id)
1814 return -1;
1815
1816 r->nref = h->nref;
1817 for (i = 0; i < h->nref; i++) {
1818 khint_t k = kh_get(refs, r->h_meta, h->ref[i].name);
1819 if (k != kh_end(r->h_meta)) {
1820 r->ref_id[i] = kh_val(r->h_meta, k);
1821 } else {
1822 hts_log_warning("Unable to find ref name '%s'", h->ref[i].name);
1823 }
1824 }
1825
1826 return 0;
1827 }
1828
1829 /*
1830 * Generates refs_t entries based on @SQ lines in the header.
1831 * Returns 0 on success
1832 * -1 on failure
1833 */
refs_from_header(refs_t * r,cram_fd * fd,SAM_hdr * h)1834 static int refs_from_header(refs_t *r, cram_fd *fd, SAM_hdr *h) {
1835 int i, j;
1836
1837 if (!r)
1838 return -1;
1839
1840 if (!h || h->nref == 0)
1841 return 0;
1842
1843 //fprintf(stderr, "refs_from_header for %p mode %c\n", fd, fd->mode);
1844
1845 /* Existing refs are fine, as long as they're compatible with the hdr. */
1846 if (!(r->ref_id = realloc(r->ref_id, (r->nref + h->nref) * sizeof(*r->ref_id))))
1847 return -1;
1848
1849 /* Copy info from h->ref[i] over to r */
1850 for (i = 0, j = r->nref; i < h->nref; i++) {
1851 SAM_hdr_type *ty;
1852 SAM_hdr_tag *tag;
1853 khint_t k;
1854 int n;
1855
1856 k = kh_get(refs, r->h_meta, h->ref[i].name);
1857 if (k != kh_end(r->h_meta))
1858 // Ref already known about
1859 continue;
1860
1861 if (!(r->ref_id[j] = calloc(1, sizeof(ref_entry))))
1862 return -1;
1863
1864 if (!h->ref[i].name)
1865 return -1;
1866
1867 r->ref_id[j]->name = string_dup(r->pool, h->ref[i].name);
1868 r->ref_id[j]->length = 0; // marker for not yet loaded
1869
1870 /* Initialise likely filename if known */
1871 if ((ty = sam_hdr_find(h, "SQ", "SN", h->ref[i].name))) {
1872 if ((tag = sam_hdr_find_key(h, ty, "M5", NULL))) {
1873 r->ref_id[j]->fn = string_dup(r->pool, tag->str+3);
1874 //fprintf(stderr, "Tagging @SQ %s / %s\n", r->ref_id[h]->name, r->ref_id[h]->fn);
1875 }
1876 }
1877
1878 k = kh_put(refs, r->h_meta, r->ref_id[j]->name, &n);
1879 if (n <= 0) // already exists or error
1880 return -1;
1881 kh_val(r->h_meta, k) = r->ref_id[j];
1882
1883 j++;
1884 }
1885 r->nref = j;
1886
1887 return 0;
1888 }
1889
1890 /*
1891 * Attaches a header to a cram_fd.
1892 *
1893 * This should be used when creating a new cram_fd for writing where
1894 * we have an SAM_hdr already constructed (eg from a file we've read
1895 * in).
1896 */
cram_set_header(cram_fd * fd,SAM_hdr * hdr)1897 int cram_set_header(cram_fd *fd, SAM_hdr *hdr) {
1898 if (fd->header)
1899 sam_hdr_free(fd->header);
1900 fd->header = hdr;
1901 return refs_from_header(fd->refs, fd, hdr);
1902 }
1903
1904 /*
1905 * Converts a directory and a filename into an expanded path, replacing %s
1906 * in directory with the filename and %[0-9]+s with portions of the filename
1907 * Any remaining parts of filename are added to the end with /%s.
1908 */
expand_cache_path(char * path,char * dir,char * fn)1909 void expand_cache_path(char *path, char *dir, char *fn) {
1910 char *cp;
1911
1912 while ((cp = strchr(dir, '%'))) {
1913 strncpy(path, dir, cp-dir);
1914 path += cp-dir;
1915
1916 if (*++cp == 's') {
1917 strcpy(path, fn);
1918 path += strlen(fn);
1919 fn += strlen(fn);
1920 cp++;
1921 } else if (*cp >= '0' && *cp <= '9') {
1922 char *endp;
1923 long l;
1924
1925 l = strtol(cp, &endp, 10);
1926 l = MIN(l, strlen(fn));
1927 if (*endp == 's') {
1928 strncpy(path, fn, l);
1929 path += l;
1930 fn += l;
1931 *path = 0;
1932 cp = endp+1;
1933 } else {
1934 *path++ = '%';
1935 *path++ = *cp++;
1936 }
1937 } else {
1938 *path++ = '%';
1939 *path++ = *cp++;
1940 }
1941 dir = cp;
1942 }
1943 strcpy(path, dir);
1944 path += strlen(dir);
1945 if (*fn && path[-1] != '/')
1946 *path++ = '/';
1947 strcpy(path, fn);
1948 }
1949
1950 /*
1951 * Make the directory containing path and any prefix directories.
1952 */
mkdir_prefix(char * path,int mode)1953 void mkdir_prefix(char *path, int mode) {
1954 char *cp = strrchr(path, '/');
1955 if (!cp)
1956 return;
1957
1958 *cp = 0;
1959 if (is_directory(path)) {
1960 *cp = '/';
1961 return;
1962 }
1963
1964 if (mkdir(path, mode) == 0) {
1965 chmod(path, mode);
1966 *cp = '/';
1967 return;
1968 }
1969
1970 mkdir_prefix(path, mode);
1971 mkdir(path, mode);
1972 chmod(path, mode);
1973 *cp = '/';
1974 }
1975
1976 /*
1977 * Return the cache directory to use, based on the first of these
1978 * environment variables to be set to a non-empty value.
1979 */
get_cache_basedir(const char ** extra)1980 static const char *get_cache_basedir(const char **extra) {
1981 char *base;
1982
1983 *extra = "";
1984
1985 base = getenv("XDG_CACHE_HOME");
1986 if (base && *base) return base;
1987
1988 base = getenv("HOME");
1989 if (base && *base) { *extra = "/.cache"; return base; }
1990
1991 base = getenv("TMPDIR");
1992 if (base && *base) return base;
1993
1994 base = getenv("TEMP");
1995 if (base && *base) return base;
1996
1997 return "/tmp";
1998 }
1999
2000 /*
2001 * Return an integer representation of pthread_self().
2002 */
get_int_threadid()2003 static unsigned get_int_threadid() {
2004 pthread_t pt = pthread_self();
2005 unsigned char *s = (unsigned char *) &pt;
2006 size_t i;
2007 unsigned h = 0;
2008 for (i = 0; i < sizeof(pthread_t); i++)
2009 h = (h << 5) - h + s[i];
2010 return h;
2011 }
2012
2013 /*
2014 * Queries the M5 string from the header and attempts to populate the
2015 * reference from this using the REF_PATH environment.
2016 *
2017 * Returns 0 on sucess
2018 * -1 on failure
2019 */
cram_populate_ref(cram_fd * fd,int id,ref_entry * r)2020 static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) {
2021 char *ref_path = getenv("REF_PATH");
2022 SAM_hdr_type *ty;
2023 SAM_hdr_tag *tag;
2024 char path[PATH_MAX], path_tmp[PATH_MAX];
2025 char cache[PATH_MAX], cache_root[PATH_MAX];
2026 char *local_cache = getenv("REF_CACHE");
2027 mFILE *mf;
2028 int local_path = 0;
2029
2030 hts_log_info("Running cram_populate_ref on fd %p, id %d", (void *)fd, id);
2031
2032 cache_root[0] = '\0';
2033
2034 if (!ref_path || *ref_path == '\0') {
2035 /*
2036 * If we have no ref path, we use the EBI server.
2037 * However to avoid spamming it we require a local ref cache too.
2038 */
2039 ref_path = "http://www.ebi.ac.uk:80/ena/cram/md5/%s";
2040 if (!local_cache || *local_cache == '\0') {
2041 const char *extra;
2042 const char *base = get_cache_basedir(&extra);
2043 snprintf(cache_root, PATH_MAX, "%s%s/hts-ref", base, extra);
2044 snprintf(cache,PATH_MAX, "%s%s/hts-ref/%%2s/%%2s/%%s", base, extra);
2045 local_cache = cache;
2046 hts_log_info("Populating local cache: %s", local_cache);
2047 }
2048 }
2049
2050 if (!r->name)
2051 return -1;
2052
2053 if (!(ty = sam_hdr_find(fd->header, "SQ", "SN", r->name)))
2054 return -1;
2055
2056 if (!(tag = sam_hdr_find_key(fd->header, ty, "M5", NULL)))
2057 goto no_M5;
2058
2059 hts_log_info("Querying ref %s", tag->str+3);
2060
2061 /* Use cache if available */
2062 if (local_cache && *local_cache) {
2063 expand_cache_path(path, local_cache, tag->str+3);
2064 local_path = 1;
2065 }
2066
2067 #ifndef HAVE_MMAP
2068 char *path2;
2069 /* Search local files in REF_PATH; we can open them and return as above */
2070 if (!local_path && (path2 = find_path(tag->str+3, ref_path))) {
2071 strncpy(path, path2, PATH_MAX);
2072 free(path2);
2073 if (is_file(path)) // incase it's too long
2074 local_path = 1;
2075 }
2076 #endif
2077
2078 /* Found via REF_CACHE or local REF_PATH file */
2079 if (local_path) {
2080 struct stat sb;
2081 BGZF *fp;
2082
2083 if (0 == stat(path, &sb) && (fp = bgzf_open(path, "r"))) {
2084 r->length = sb.st_size;
2085 r->offset = r->line_length = r->bases_per_line = 0;
2086
2087 r->fn = string_dup(fd->refs->pool, path);
2088
2089 if (fd->refs->fp)
2090 if (bgzf_close(fd->refs->fp) != 0)
2091 return -1;
2092 fd->refs->fp = fp;
2093 fd->refs->fn = r->fn;
2094 r->is_md5 = 1;
2095
2096 // Fall back to cram_get_ref() where it'll do the actual
2097 // reading of the file.
2098 return 0;
2099 }
2100 }
2101
2102
2103 /* Otherwise search full REF_PATH; slower as loads entire file */
2104 if ((mf = open_path_mfile(tag->str+3, ref_path, NULL))) {
2105 size_t sz;
2106 r->seq = mfsteal(mf, &sz);
2107 if (r->seq) {
2108 r->mf = NULL;
2109 } else {
2110 // keep mf around as we couldn't detach
2111 r->seq = mf->data;
2112 r->mf = mf;
2113 }
2114 r->length = sz;
2115 r->is_md5 = 1;
2116 } else {
2117 refs_t *refs;
2118 char *fn;
2119
2120 no_M5:
2121 /* Failed to find in search path or M5 cache, see if @SQ UR: tag? */
2122 if (!(tag = sam_hdr_find_key(fd->header, ty, "UR", NULL)))
2123 return -1;
2124
2125 fn = (strncmp(tag->str+3, "file:", 5) == 0)
2126 ? tag->str+8
2127 : tag->str+3;
2128
2129 if (fd->refs->fp) {
2130 if (bgzf_close(fd->refs->fp) != 0)
2131 return -1;
2132 fd->refs->fp = NULL;
2133 }
2134 if (!(refs = refs_load_fai(fd->refs, fn, 0)))
2135 return -1;
2136 sanitise_SQ_lines(fd);
2137
2138 fd->refs = refs;
2139 if (fd->refs->fp) {
2140 if (bgzf_close(fd->refs->fp) != 0)
2141 return -1;
2142 fd->refs->fp = NULL;
2143 }
2144
2145 if (!fd->refs->fn)
2146 return -1;
2147
2148 if (-1 == refs2id(fd->refs, fd->header))
2149 return -1;
2150 if (!fd->refs->ref_id || !fd->refs->ref_id[id])
2151 return -1;
2152
2153 // Local copy already, so fall back to cram_get_ref().
2154 return 0;
2155 }
2156
2157 /* Populate the local disk cache if required */
2158 if (local_cache && *local_cache) {
2159 int pid = (int) getpid();
2160 unsigned thrid = get_int_threadid();
2161 hFILE *fp;
2162
2163 if (*cache_root && !is_directory(cache_root)) {
2164 hts_log_warning("Creating reference cache directory %s\n"
2165 "This may become large; see the samtools(1) manual page REF_CACHE discussion",
2166 cache_root);
2167 }
2168
2169 expand_cache_path(path, local_cache, tag->str+3);
2170 hts_log_info("Writing cache file '%s'", path);
2171 mkdir_prefix(path, 01777);
2172
2173 do {
2174 // Attempt to further uniquify the temporary filename
2175 unsigned t = ((unsigned) time(NULL)) ^ ((unsigned) clock());
2176 thrid++; // Ensure filename changes even if time/clock haven't
2177
2178 sprintf(path_tmp, "%s.tmp_%d_%u_%u", path, pid, thrid, t);
2179 fp = hopen(path_tmp, "wx");
2180 } while (fp == NULL && errno == EEXIST);
2181 if (!fp) {
2182 perror(path_tmp);
2183
2184 // Not fatal - we have the data already so keep going.
2185 return 0;
2186 }
2187
2188 // Check md5sum
2189 hts_md5_context *md5;
2190 char unsigned md5_buf1[16];
2191 char md5_buf2[33];
2192
2193 if (!(md5 = hts_md5_init())) {
2194 hclose_abruptly(fp);
2195 unlink(path_tmp);
2196 return -1;
2197 }
2198 hts_md5_update(md5, r->seq, r->length);
2199 hts_md5_final(md5_buf1, md5);
2200 hts_md5_destroy(md5);
2201 hts_md5_hex(md5_buf2, md5_buf1);
2202
2203 if (strncmp(tag->str+3, md5_buf2, 32) != 0) {
2204 hts_log_error("Mismatching md5sum for downloaded reference");
2205 hclose_abruptly(fp);
2206 unlink(path_tmp);
2207 return -1;
2208 }
2209
2210 if (hwrite(fp, r->seq, r->length) != r->length) {
2211 perror(path);
2212 }
2213 if (hclose(fp) < 0) {
2214 unlink(path_tmp);
2215 } else {
2216 if (0 == chmod(path_tmp, 0444))
2217 rename(path_tmp, path);
2218 else
2219 unlink(path_tmp);
2220 }
2221 }
2222
2223 return 0;
2224 }
2225
cram_ref_incr_locked(refs_t * r,int id)2226 static void cram_ref_incr_locked(refs_t *r, int id) {
2227 RP("%d INC REF %d, %d %p\n", gettid(), id, (int)(id>=0?r->ref_id[id]->count+1:-999), id>=0?r->ref_id[id]->seq:(char *)1);
2228
2229 if (id < 0 || !r->ref_id[id] || !r->ref_id[id]->seq)
2230 return;
2231
2232 if (r->last_id == id)
2233 r->last_id = -1;
2234
2235 ++r->ref_id[id]->count;
2236 }
2237
cram_ref_incr(refs_t * r,int id)2238 void cram_ref_incr(refs_t *r, int id) {
2239 pthread_mutex_lock(&r->lock);
2240 cram_ref_incr_locked(r, id);
2241 pthread_mutex_unlock(&r->lock);
2242 }
2243
cram_ref_decr_locked(refs_t * r,int id)2244 static void cram_ref_decr_locked(refs_t *r, int id) {
2245 RP("%d DEC REF %d, %d %p\n", gettid(), id, (int)(id>=0?r->ref_id[id]->count-1:-999), id>=0?r->ref_id[id]->seq:(char *)1);
2246
2247 if (id < 0 || !r->ref_id[id] || !r->ref_id[id]->seq) {
2248 assert(r->ref_id[id]->count >= 0);
2249 return;
2250 }
2251
2252 if (--r->ref_id[id]->count <= 0) {
2253 assert(r->ref_id[id]->count == 0);
2254 if (r->last_id >= 0) {
2255 if (r->ref_id[r->last_id]->count <= 0 &&
2256 r->ref_id[r->last_id]->seq) {
2257 RP("%d FREE REF %d (%p)\n", gettid(),
2258 r->last_id, r->ref_id[r->last_id]->seq);
2259 ref_entry_free_seq(r->ref_id[r->last_id]);
2260 r->ref_id[r->last_id]->length = 0;
2261 }
2262 }
2263 r->last_id = id;
2264 }
2265 }
2266
cram_ref_decr(refs_t * r,int id)2267 void cram_ref_decr(refs_t *r, int id) {
2268 pthread_mutex_lock(&r->lock);
2269 cram_ref_decr_locked(r, id);
2270 pthread_mutex_unlock(&r->lock);
2271 }
2272
2273 /*
2274 * Used by cram_ref_load and cram_ref_get. The file handle will have
2275 * already been opened, so we can catch it. The ref_entry *e informs us
2276 * of whether this is a multi-line fasta file or a raw MD5 style file.
2277 * Either way we create a single contiguous sequence.
2278 *
2279 * Returns all or part of a reference sequence on success (malloced);
2280 * NULL on failure.
2281 */
load_ref_portion(BGZF * fp,ref_entry * e,int start,int end)2282 static char *load_ref_portion(BGZF *fp, ref_entry *e, int start, int end) {
2283 off_t offset, len;
2284 char *seq;
2285
2286 if (end < start)
2287 end = start;
2288
2289 /*
2290 * Compute locations in file. This is trivial for the MD5 files, but
2291 * is still necessary for the fasta variants.
2292 */
2293 offset = e->line_length
2294 ? e->offset + (start-1)/e->bases_per_line * e->line_length +
2295 (start-1) % e->bases_per_line
2296 : start-1;
2297
2298 len = (e->line_length
2299 ? e->offset + (end-1)/e->bases_per_line * e->line_length +
2300 (end-1) % e->bases_per_line
2301 : end-1) - offset + 1;
2302
2303 if (bgzf_useek(fp, offset, SEEK_SET) < 0) {
2304 perror("bgzf_useek() on reference file");
2305 return NULL;
2306 }
2307
2308 if (len == 0 || !(seq = malloc(len))) {
2309 return NULL;
2310 }
2311
2312 if (len != bgzf_read(fp, seq, len)) {
2313 perror("bgzf_read() on reference file");
2314 free(seq);
2315 return NULL;
2316 }
2317
2318 /* Strip white-space if required. */
2319 if (len != end-start+1) {
2320 int i, j;
2321 char *cp = seq;
2322 char *cp_to;
2323
2324 for (i = j = 0; i < len; i++) {
2325 if (cp[i] >= '!' && cp[i] <= '~')
2326 cp[j++] = toupper_c(cp[i]);
2327 }
2328 cp_to = cp+j;
2329
2330 if (cp_to - seq != end-start+1) {
2331 hts_log_error("Malformed reference file");
2332 free(seq);
2333 return NULL;
2334 }
2335 } else {
2336 int i;
2337 for (i = 0; i < len; i++) {
2338 seq[i] = toupper_c(seq[i]);
2339 }
2340 }
2341
2342 return seq;
2343 }
2344
2345 /*
2346 * Load the entire reference 'id'.
2347 * This also increments the reference count by 1.
2348 *
2349 * Returns ref_entry on success;
2350 * NULL on failure
2351 */
cram_ref_load(refs_t * r,int id,int is_md5)2352 ref_entry *cram_ref_load(refs_t *r, int id, int is_md5) {
2353 ref_entry *e = r->ref_id[id];
2354 int start = 1, end = e->length;
2355 char *seq;
2356
2357 if (e->seq) {
2358 return e;
2359 }
2360
2361 assert(e->count == 0);
2362
2363 if (r->last) {
2364 #ifdef REF_DEBUG
2365 int idx = 0;
2366 for (idx = 0; idx < r->nref; idx++)
2367 if (r->last == r->ref_id[idx])
2368 break;
2369 RP("%d cram_ref_load DECR %d\n", gettid(), idx);
2370 #endif
2371 assert(r->last->count > 0);
2372 if (--r->last->count <= 0) {
2373 RP("%d FREE REF %d (%p)\n", gettid(), id, r->ref_id[id]->seq);
2374 if (r->last->seq)
2375 ref_entry_free_seq(r->last);
2376 }
2377 }
2378
2379 /* Open file if it's not already the current open reference */
2380 if (strcmp(r->fn, e->fn) || r->fp == NULL) {
2381 if (r->fp)
2382 if (bgzf_close(r->fp) != 0)
2383 return NULL;
2384 r->fn = e->fn;
2385 if (!(r->fp = bgzf_open_ref(r->fn, "r", is_md5)))
2386 return NULL;
2387 }
2388
2389 RP("%d Loading ref %d (%d..%d)\n", gettid(), id, start, end);
2390
2391 if (!(seq = load_ref_portion(r->fp, e, start, end))) {
2392 return NULL;
2393 }
2394
2395 RP("%d Loaded ref %d (%d..%d) = %p\n", gettid(), id, start, end, seq);
2396
2397 RP("%d INC REF %d, %d\n", gettid(), id, (int)(e->count+1));
2398 e->seq = seq;
2399 e->mf = NULL;
2400 e->count++;
2401
2402 /*
2403 * Also keep track of last used ref so incr/decr loops on the same
2404 * sequence don't cause load/free loops.
2405 */
2406 RP("%d cram_ref_load INCR %d => %d\n", gettid(), id, e->count+1);
2407 r->last = e;
2408 e->count++;
2409
2410 return e;
2411 }
2412
2413 /*
2414 * Returns a portion of a reference sequence from start to end inclusive.
2415 * The returned pointer is owned by either the cram_file fd or by the
2416 * internal refs_t structure and should not be freed by the caller.
2417 *
2418 * The difference is whether or not this refs_t is in use by just the one
2419 * cram_fd or by multiples, or whether we have multiple threads accessing
2420 * references. In either case fd->shared will be true and we start using
2421 * reference counting to track the number of users of a specific reference
2422 * sequence.
2423 *
2424 * Otherwise the ref seq returned is allocated as part of cram_fd itself
2425 * and will be freed up on the next call to cram_get_ref or cram_close.
2426 *
2427 * To return the entire reference sequence, specify start as 1 and end
2428 * as 0.
2429 *
2430 * To cease using a reference, call cram_ref_decr().
2431 *
2432 * Returns reference on success,
2433 * NULL on failure
2434 */
cram_get_ref(cram_fd * fd,int id,int start,int end)2435 char *cram_get_ref(cram_fd *fd, int id, int start, int end) {
2436 ref_entry *r;
2437 char *seq;
2438 int ostart = start;
2439
2440 if (id == -1)
2441 return NULL;
2442
2443 /* FIXME: axiomatic query of r->seq being true?
2444 * Or shortcut for unsorted data where we load once and never free?
2445 */
2446
2447 //fd->shared_ref = 1; // hard code for now to simplify things
2448
2449 pthread_mutex_lock(&fd->ref_lock);
2450
2451 RP("%d cram_get_ref on fd %p, id %d, range %d..%d\n", gettid(), fd, id, start, end);
2452
2453 /*
2454 * Unsorted data implies we want to fetch an entire reference at a time.
2455 * We just deal with this at the moment by claiming we're sharing
2456 * references instead, which has the same requirement.
2457 */
2458 if (fd->unsorted)
2459 fd->shared_ref = 1;
2460
2461
2462 /* Sanity checking: does this ID exist? */
2463 if (id >= fd->refs->nref) {
2464 hts_log_error("No reference found for id %d", id);
2465 pthread_mutex_unlock(&fd->ref_lock);
2466 return NULL;
2467 }
2468
2469 if (!fd->refs || !fd->refs->ref_id[id]) {
2470 hts_log_error("No reference found for id %d", id);
2471 pthread_mutex_unlock(&fd->ref_lock);
2472 return NULL;
2473 }
2474
2475 if (!(r = fd->refs->ref_id[id])) {
2476 hts_log_error("No reference found for id %d", id);
2477 pthread_mutex_unlock(&fd->ref_lock);
2478 return NULL;
2479 }
2480
2481
2482 /*
2483 * It has an entry, but may not have been populated yet.
2484 * Any manually loaded .fai files have their lengths known.
2485 * A ref entry computed from @SQ lines (M5 or UR field) will have
2486 * r->length == 0 unless it's been loaded once and verified that we have
2487 * an on-disk filename for it.
2488 *
2489 * 19 Sep 2013: Moved the lock here as the cram_populate_ref code calls
2490 * open_path_mfile and libcurl, which isn't multi-thread safe unless I
2491 * rewrite my code to have one curl handle per thread.
2492 */
2493 pthread_mutex_lock(&fd->refs->lock);
2494 if (r->length == 0) {
2495 if (cram_populate_ref(fd, id, r) == -1) {
2496 hts_log_error("Failed to populate reference for id %d", id);
2497 pthread_mutex_unlock(&fd->refs->lock);
2498 pthread_mutex_unlock(&fd->ref_lock);
2499 return NULL;
2500 }
2501 r = fd->refs->ref_id[id];
2502 if (fd->unsorted)
2503 cram_ref_incr_locked(fd->refs, id);
2504 }
2505
2506
2507 /*
2508 * We now know that we the filename containing the reference, so check
2509 * for limits. If it's over half the reference we'll load all of it in
2510 * memory as this will speed up subsequent calls.
2511 */
2512 if (end < 1)
2513 end = r->length;
2514 if (end >= r->length)
2515 end = r->length;
2516 if (start < 1)
2517 return NULL;
2518
2519 if (end - start >= 0.5*r->length || fd->shared_ref) {
2520 start = 1;
2521 end = r->length;
2522 }
2523
2524 /*
2525 * Maybe we have it cached already? If so use it.
2526 *
2527 * Alternatively if we don't have the sequence but we're sharing
2528 * references and/or are asking for the entire length of it, then
2529 * load the full reference into the refs structure and return
2530 * a pointer to that one instead.
2531 */
2532 if (fd->shared_ref || r->seq || (start == 1 && end == r->length)) {
2533 char *cp;
2534
2535 if (id >= 0) {
2536 if (r->seq) {
2537 cram_ref_incr_locked(fd->refs, id);
2538 } else {
2539 ref_entry *e;
2540 if (!(e = cram_ref_load(fd->refs, id, r->is_md5))) {
2541 pthread_mutex_unlock(&fd->refs->lock);
2542 pthread_mutex_unlock(&fd->ref_lock);
2543 return NULL;
2544 }
2545
2546 /* unsorted data implies cache ref indefinitely, to avoid
2547 * continually loading and unloading.
2548 */
2549 if (fd->unsorted)
2550 cram_ref_incr_locked(fd->refs, id);
2551 }
2552
2553 fd->ref = NULL; /* We never access it directly */
2554 fd->ref_start = 1;
2555 fd->ref_end = r->length;
2556 fd->ref_id = id;
2557
2558 cp = fd->refs->ref_id[id]->seq + ostart-1;
2559 } else {
2560 fd->ref = NULL;
2561 cp = NULL;
2562 }
2563
2564 RP("%d cram_get_ref returning for id %d, count %d\n", gettid(), id, (int)r->count);
2565
2566 pthread_mutex_unlock(&fd->refs->lock);
2567 pthread_mutex_unlock(&fd->ref_lock);
2568 return cp;
2569 }
2570
2571 /*
2572 * Otherwise we're not sharing, we don't have a copy of it already and
2573 * we're only asking for a small portion of it.
2574 *
2575 * In this case load up just that segment ourselves, freeing any old
2576 * small segments in the process.
2577 */
2578
2579 /* Unmapped ref ID */
2580 if (id < 0) {
2581 if (fd->ref_free) {
2582 free(fd->ref_free);
2583 fd->ref_free = NULL;
2584 }
2585 fd->ref = NULL;
2586 fd->ref_id = id;
2587 pthread_mutex_unlock(&fd->refs->lock);
2588 pthread_mutex_unlock(&fd->ref_lock);
2589 return NULL;
2590 }
2591
2592 /* Open file if it's not already the current open reference */
2593 if (strcmp(fd->refs->fn, r->fn) || fd->refs->fp == NULL) {
2594 if (fd->refs->fp)
2595 if (bgzf_close(fd->refs->fp) != 0)
2596 return NULL;
2597 fd->refs->fn = r->fn;
2598 if (!(fd->refs->fp = bgzf_open_ref(fd->refs->fn, "r", r->is_md5))) {
2599 pthread_mutex_unlock(&fd->refs->lock);
2600 pthread_mutex_unlock(&fd->ref_lock);
2601 return NULL;
2602 }
2603 }
2604
2605 if (!(fd->ref = load_ref_portion(fd->refs->fp, r, start, end))) {
2606 pthread_mutex_unlock(&fd->refs->lock);
2607 pthread_mutex_unlock(&fd->ref_lock);
2608 return NULL;
2609 }
2610
2611 if (fd->ref_free)
2612 free(fd->ref_free);
2613
2614 fd->ref_id = id;
2615 fd->ref_start = start;
2616 fd->ref_end = end;
2617 fd->ref_free = fd->ref;
2618 seq = fd->ref;
2619
2620 pthread_mutex_unlock(&fd->refs->lock);
2621 pthread_mutex_unlock(&fd->ref_lock);
2622
2623 return seq + ostart - start;
2624 }
2625
2626 /*
2627 * If fd has been opened for reading, it may be permitted to specify 'fn'
2628 * as NULL and let the code auto-detect the reference by parsing the
2629 * SAM header @SQ lines.
2630 */
cram_load_reference(cram_fd * fd,char * fn)2631 int cram_load_reference(cram_fd *fd, char *fn) {
2632 int ret = 0;
2633
2634 if (fn) {
2635 fd->refs = refs_load_fai(fd->refs, fn,
2636 !(fd->embed_ref && fd->mode == 'r'));
2637 fn = fd->refs ? fd->refs->fn : NULL;
2638 if (!fn)
2639 ret = -1;
2640 sanitise_SQ_lines(fd);
2641 }
2642 fd->ref_fn = fn;
2643
2644 if ((!fd->refs || (fd->refs->nref == 0 && !fn)) && fd->header) {
2645 if (fd->refs)
2646 refs_free(fd->refs);
2647 if (!(fd->refs = refs_create()))
2648 return -1;
2649 if (-1 == refs_from_header(fd->refs, fd, fd->header))
2650 return -1;
2651 }
2652
2653 if (fd->header)
2654 if (-1 == refs2id(fd->refs, fd->header))
2655 return -1;
2656
2657 return ret;
2658 }
2659
2660 /* ----------------------------------------------------------------------
2661 * Containers
2662 */
2663
2664 /*
2665 * Creates a new container, specifying the maximum number of slices
2666 * and records permitted.
2667 *
2668 * Returns cram_container ptr on success
2669 * NULL on failure
2670 */
cram_new_container(int nrec,int nslice)2671 cram_container *cram_new_container(int nrec, int nslice) {
2672 cram_container *c = calloc(1, sizeof(*c));
2673 enum cram_DS_ID id;
2674
2675 if (!c)
2676 return NULL;
2677
2678 c->curr_ref = -2;
2679
2680 c->max_c_rec = nrec * nslice;
2681 c->curr_c_rec = 0;
2682
2683 c->max_rec = nrec;
2684 c->record_counter = 0;
2685 c->num_bases = 0;
2686 c->s_num_bases = 0;
2687
2688 c->max_slice = nslice;
2689 c->curr_slice = 0;
2690
2691 c->pos_sorted = 1;
2692 c->max_apos = 0;
2693 c->multi_seq = 0;
2694
2695 c->bams = NULL;
2696
2697 if (!(c->slices = (cram_slice **)calloc(nslice, sizeof(cram_slice *))))
2698 goto err;
2699 c->slice = NULL;
2700
2701 if (!(c->comp_hdr = cram_new_compression_header()))
2702 goto err;
2703 c->comp_hdr_block = NULL;
2704
2705 for (id = DS_RN; id < DS_TN; id++)
2706 if (!(c->stats[id] = cram_stats_create())) goto err;
2707
2708 //c->aux_B_stats = cram_stats_create();
2709
2710 if (!(c->tags_used = kh_init(m_tagmap)))
2711 goto err;
2712 c->refs_used = 0;
2713
2714 return c;
2715
2716 err:
2717 if (c) {
2718 if (c->slices)
2719 free(c->slices);
2720 free(c);
2721 }
2722 return NULL;
2723 }
2724
cram_free_container(cram_container * c)2725 void cram_free_container(cram_container *c) {
2726 enum cram_DS_ID id;
2727 int i;
2728
2729 if (!c)
2730 return;
2731
2732 if (c->refs_used)
2733 free(c->refs_used);
2734
2735 if (c->landmark)
2736 free(c->landmark);
2737
2738 if (c->comp_hdr)
2739 cram_free_compression_header(c->comp_hdr);
2740
2741 if (c->comp_hdr_block)
2742 cram_free_block(c->comp_hdr_block);
2743
2744 if (c->slices) {
2745 for (i = 0; i < c->max_slice; i++)
2746 if (c->slices[i])
2747 cram_free_slice(c->slices[i]);
2748 free(c->slices);
2749 }
2750
2751 for (id = DS_RN; id < DS_TN; id++)
2752 if (c->stats[id]) cram_stats_free(c->stats[id]);
2753
2754 //if (c->aux_B_stats) cram_stats_free(c->aux_B_stats);
2755
2756 if (c->tags_used) {
2757 khint_t k;
2758
2759 for (k = kh_begin(c->tags_used); k != kh_end(c->tags_used); k++) {
2760 if (!kh_exist(c->tags_used, k))
2761 continue;
2762
2763 cram_tag_map *tm = (cram_tag_map *)kh_val(c->tags_used, k);
2764 cram_codec *c = tm->codec;
2765
2766 if (c) c->free(c);
2767 free(tm);
2768 }
2769
2770 kh_destroy(m_tagmap, c->tags_used);
2771 }
2772
2773 free(c);
2774 }
2775
2776 /*
2777 * Reads a container header.
2778 *
2779 * Returns cram_container on success
2780 * NULL on failure or no container left (fd->err == 0).
2781 */
cram_read_container(cram_fd * fd)2782 cram_container *cram_read_container(cram_fd *fd) {
2783 cram_container c2, *c;
2784 int i, s;
2785 size_t rd = 0;
2786 uint32_t crc = 0;
2787
2788 fd->err = 0;
2789 fd->eof = 0;
2790
2791 memset(&c2, 0, sizeof(c2));
2792 if (CRAM_MAJOR_VERS(fd->version) == 1) {
2793 if ((s = itf8_decode_crc(fd, &c2.length, &crc)) == -1) {
2794 fd->eof = fd->empty_container ? 1 : 2;
2795 return NULL;
2796 } else {
2797 rd+=s;
2798 }
2799 } else {
2800 uint32_t len;
2801 if ((s = int32_decode(fd, &c2.length)) == -1) {
2802 if (CRAM_MAJOR_VERS(fd->version) == 2 &&
2803 CRAM_MINOR_VERS(fd->version) == 0)
2804 fd->eof = 1; // EOF blocks arrived in v2.1
2805 else
2806 fd->eof = fd->empty_container ? 1 : 2;
2807 return NULL;
2808 } else {
2809 rd+=s;
2810 }
2811 len = le_int4(c2.length);
2812 crc = crc32(0L, (unsigned char *)&len, 4);
2813 }
2814 if ((s = itf8_decode_crc(fd, &c2.ref_seq_id, &crc)) == -1) return NULL; else rd+=s;
2815 if ((s = itf8_decode_crc(fd, &c2.ref_seq_start, &crc))== -1) return NULL; else rd+=s;
2816 if ((s = itf8_decode_crc(fd, &c2.ref_seq_span, &crc)) == -1) return NULL; else rd+=s;
2817 if ((s = itf8_decode_crc(fd, &c2.num_records, &crc)) == -1) return NULL; else rd+=s;
2818
2819 if (CRAM_MAJOR_VERS(fd->version) == 1) {
2820 c2.record_counter = 0;
2821 c2.num_bases = 0;
2822 } else {
2823 if (CRAM_MAJOR_VERS(fd->version) >= 3) {
2824 if ((s = ltf8_decode_crc(fd, &c2.record_counter, &crc)) == -1)
2825 return NULL;
2826 else
2827 rd += s;
2828 } else {
2829 int32_t i32;
2830 if ((s = itf8_decode_crc(fd, &i32, &crc)) == -1)
2831 return NULL;
2832 else
2833 rd += s;
2834 c2.record_counter = i32;
2835 }
2836
2837 if ((s = ltf8_decode_crc(fd, &c2.num_bases, &crc))== -1)
2838 return NULL;
2839 else
2840 rd += s;
2841 }
2842 if ((s = itf8_decode_crc(fd, &c2.num_blocks, &crc)) == -1) return NULL; else rd+=s;
2843 if ((s = itf8_decode_crc(fd, &c2.num_landmarks, &crc))== -1) return NULL; else rd+=s;
2844
2845 if (c2.num_landmarks < 0 || c2.num_landmarks >= SIZE_MAX / sizeof(int32_t))
2846 return NULL;
2847
2848 if (!(c = calloc(1, sizeof(*c))))
2849 return NULL;
2850
2851 *c = c2;
2852
2853 if (!(c->landmark = malloc(c->num_landmarks * sizeof(int32_t))) &&
2854 c->num_landmarks) {
2855 fd->err = errno;
2856 cram_free_container(c);
2857 return NULL;
2858 }
2859 for (i = 0; i < c->num_landmarks; i++) {
2860 if ((s = itf8_decode_crc(fd, &c->landmark[i], &crc)) == -1) {
2861 cram_free_container(c);
2862 return NULL;
2863 } else {
2864 rd += s;
2865 }
2866 }
2867
2868 if (CRAM_MAJOR_VERS(fd->version) >= 3) {
2869 if (-1 == int32_decode(fd, (int32_t *)&c->crc32))
2870 return NULL;
2871 else
2872 rd+=4;
2873
2874 if (crc != c->crc32) {
2875 hts_log_error("Container header CRC32 failure");
2876 cram_free_container(c);
2877 return NULL;
2878 }
2879 }
2880
2881 c->offset = rd;
2882 c->slices = NULL;
2883 c->curr_slice = 0;
2884 c->max_slice = c->num_landmarks;
2885 c->slice_rec = 0;
2886 c->curr_rec = 0;
2887 c->max_rec = 0;
2888
2889 if (c->ref_seq_id == -2) {
2890 c->multi_seq = 1;
2891 fd->multi_seq = 1;
2892 }
2893
2894 fd->empty_container =
2895 (c->num_records == 0 &&
2896 c->ref_seq_id == -1 &&
2897 c->ref_seq_start == 0x454f46 /* EOF */) ? 1 : 0;
2898
2899 return c;
2900 }
2901
2902
2903 /* MAXIMUM storage size needed for the container. */
cram_container_size(cram_container * c)2904 int cram_container_size(cram_container *c) {
2905 return 55 + 5*c->num_landmarks;
2906 }
2907
2908
2909 /*
2910 * Stores the container structure in dat and returns *size as the
2911 * number of bytes written to dat[]. The input size of dat is also
2912 * held in *size and should be initialised to cram_container_size(c).
2913 *
2914 * Returns 0 on success;
2915 * -1 on failure
2916 */
cram_store_container(cram_fd * fd,cram_container * c,char * dat,int * size)2917 int cram_store_container(cram_fd *fd, cram_container *c, char *dat, int *size)
2918 {
2919 unsigned char *cp = (unsigned char *)dat;
2920 int i;
2921
2922 // Check the input buffer is large enough according to our stated
2923 // requirements. (NOTE: it may actually take less.)
2924 if (cram_container_size(c) > *size)
2925 return -1;
2926
2927 if (CRAM_MAJOR_VERS(fd->version) == 1) {
2928 cp += itf8_put((char*)cp, c->length);
2929 } else {
2930 *(int32_t *)cp = le_int4(c->length);
2931 cp += 4;
2932 }
2933 if (c->multi_seq) {
2934 cp += itf8_put((char*)cp, -2);
2935 cp += itf8_put((char*)cp, 0);
2936 cp += itf8_put((char*)cp, 0);
2937 } else {
2938 cp += itf8_put((char*)cp, c->ref_seq_id);
2939 cp += itf8_put((char*)cp, c->ref_seq_start);
2940 cp += itf8_put((char*)cp, c->ref_seq_span);
2941 }
2942 cp += itf8_put((char*)cp, c->num_records);
2943 if (CRAM_MAJOR_VERS(fd->version) == 2) {
2944 cp += itf8_put((char*)cp, c->record_counter);
2945 cp += ltf8_put((char*)cp, c->num_bases);
2946 } else if (CRAM_MAJOR_VERS(fd->version) >= 3) {
2947 cp += ltf8_put((char*)cp, c->record_counter);
2948 cp += ltf8_put((char*)cp, c->num_bases);
2949 }
2950
2951 cp += itf8_put((char*)cp, c->num_blocks);
2952 cp += itf8_put((char*)cp, c->num_landmarks);
2953 for (i = 0; i < c->num_landmarks; i++)
2954 cp += itf8_put((char*)cp, c->landmark[i]);
2955
2956 if (CRAM_MAJOR_VERS(fd->version) >= 3) {
2957 c->crc32 = crc32(0L, (uc *)dat, (char*)cp-dat);
2958 cp[0] = c->crc32 & 0xff;
2959 cp[1] = (c->crc32 >> 8) & 0xff;
2960 cp[2] = (c->crc32 >> 16) & 0xff;
2961 cp[3] = (c->crc32 >> 24) & 0xff;
2962 cp += 4;
2963 }
2964
2965 *size = (char *)cp-dat; // actual used size
2966
2967 return 0;
2968 }
2969
2970
2971 /*
2972 * Writes a container structure.
2973 *
2974 * Returns 0 on success
2975 * -1 on failure
2976 */
cram_write_container(cram_fd * fd,cram_container * c)2977 int cram_write_container(cram_fd *fd, cram_container *c) {
2978 char buf_a[1024], *buf = buf_a;
2979 unsigned char *cp;
2980 int i;
2981
2982 if (55 + c->num_landmarks * 5 >= 1024)
2983 buf = malloc(55 + c->num_landmarks * 5);
2984 cp = (unsigned char *)buf;
2985
2986 if (CRAM_MAJOR_VERS(fd->version) == 1) {
2987 cp += itf8_put((char*)cp, c->length);
2988 } else {
2989 *(int32_t *)cp = le_int4(c->length);
2990 cp += 4;
2991 }
2992 if (c->multi_seq) {
2993 cp += itf8_put((char*)cp, -2);
2994 cp += itf8_put((char*)cp, 0);
2995 cp += itf8_put((char*)cp, 0);
2996 } else {
2997 cp += itf8_put((char*)cp, c->ref_seq_id);
2998 cp += itf8_put((char*)cp, c->ref_seq_start);
2999 cp += itf8_put((char*)cp, c->ref_seq_span);
3000 }
3001 cp += itf8_put((char*)cp, c->num_records);
3002 if (CRAM_MAJOR_VERS(fd->version) == 2) {
3003 cp += itf8_put((char*)cp, c->record_counter);
3004 cp += ltf8_put((char*)cp, c->num_bases);
3005 } else if (CRAM_MAJOR_VERS(fd->version) >= 3) {
3006 cp += ltf8_put((char*)cp, c->record_counter);
3007 cp += ltf8_put((char*)cp, c->num_bases);
3008 }
3009
3010 cp += itf8_put((char*)cp, c->num_blocks);
3011 cp += itf8_put((char*)cp, c->num_landmarks);
3012 for (i = 0; i < c->num_landmarks; i++)
3013 cp += itf8_put((char*)cp, c->landmark[i]);
3014
3015 if (CRAM_MAJOR_VERS(fd->version) >= 3) {
3016 c->crc32 = crc32(0L, (uc *)buf, (char*)cp-buf);
3017 cp[0] = c->crc32 & 0xff;
3018 cp[1] = (c->crc32 >> 8) & 0xff;
3019 cp[2] = (c->crc32 >> 16) & 0xff;
3020 cp[3] = (c->crc32 >> 24) & 0xff;
3021 cp += 4;
3022 }
3023
3024 if ((char*)cp-buf != hwrite(fd->fp, buf, (char*)cp-buf)) {
3025 if (buf != buf_a)
3026 free(buf);
3027 return -1;
3028 }
3029
3030 if (buf != buf_a)
3031 free(buf);
3032
3033 return 0;
3034 }
3035
3036 // common component shared by cram_flush_container{,_mt}
cram_flush_container2(cram_fd * fd,cram_container * c)3037 static int cram_flush_container2(cram_fd *fd, cram_container *c) {
3038 int i, j;
3039
3040 if (c->curr_slice > 0 && !c->slices)
3041 return -1;
3042
3043 //fprintf(stderr, "Writing container %d, sum %u\n", c->record_counter, sum);
3044
3045 /* Write the container struct itself */
3046 if (0 != cram_write_container(fd, c))
3047 return -1;
3048
3049 /* And the compression header */
3050 if (0 != cram_write_block(fd, c->comp_hdr_block))
3051 return -1;
3052
3053 /* Followed by the slice blocks */
3054 for (i = 0; i < c->curr_slice; i++) {
3055 cram_slice *s = c->slices[i];
3056
3057 if (0 != cram_write_block(fd, s->hdr_block))
3058 return -1;
3059
3060 for (j = 0; j < s->hdr->num_blocks; j++) {
3061 if (0 != cram_write_block(fd, s->block[j]))
3062 return -1;
3063 }
3064 }
3065
3066 return hflush(fd->fp) == 0 ? 0 : -1;
3067 }
3068
3069 /*
3070 * Flushes a completely or partially full container to disk, writing
3071 * container structure, header and blocks. This also calls the encoder
3072 * functions.
3073 *
3074 * Returns 0 on success
3075 * -1 on failure
3076 */
cram_flush_container(cram_fd * fd,cram_container * c)3077 int cram_flush_container(cram_fd *fd, cram_container *c) {
3078 /* Encode the container blocks and generate compression header */
3079 if (0 != cram_encode_container(fd, c))
3080 return -1;
3081
3082 return cram_flush_container2(fd, c);
3083 }
3084
3085 typedef struct {
3086 cram_fd *fd;
3087 cram_container *c;
3088 } cram_job;
3089
cram_flush_thread(void * arg)3090 void *cram_flush_thread(void *arg) {
3091 cram_job *j = (cram_job *)arg;
3092
3093 /* Encode the container blocks and generate compression header */
3094 if (0 != cram_encode_container(j->fd, j->c)) {
3095 hts_log_error("Call to cram_encode_container failed");
3096 return NULL;
3097 }
3098
3099 return arg;
3100 }
3101
cram_flush_result(cram_fd * fd)3102 static int cram_flush_result(cram_fd *fd) {
3103 int i, ret = 0;
3104 hts_tpool_result *r;
3105
3106 while ((r = hts_tpool_next_result(fd->rqueue))) {
3107 cram_job *j = (cram_job *)hts_tpool_result_data(r);
3108 cram_container *c;
3109
3110 if (!j) {
3111 hts_tpool_delete_result(r, 0);
3112 return -1;
3113 }
3114
3115 fd = j->fd;
3116 c = j->c;
3117
3118 if (fd->mode == 'w')
3119 if (0 != cram_flush_container2(fd, c))
3120 return -1;
3121
3122 /* Free the container */
3123 for (i = 0; i < c->max_slice; i++) {
3124 if (c->slices && c->slices[i]) {
3125 cram_free_slice(c->slices[i]);
3126 c->slices[i] = NULL;
3127 }
3128 }
3129
3130 c->slice = NULL;
3131 c->curr_slice = 0;
3132
3133 cram_free_container(c);
3134
3135 ret |= hflush(fd->fp) == 0 ? 0 : -1;
3136
3137 hts_tpool_delete_result(r, 1);
3138 }
3139
3140 return ret;
3141 }
3142
cram_flush_container_mt(cram_fd * fd,cram_container * c)3143 int cram_flush_container_mt(cram_fd *fd, cram_container *c) {
3144 cram_job *j;
3145
3146 if (!fd->pool)
3147 return cram_flush_container(fd, c);
3148
3149 if (!(j = malloc(sizeof(*j))))
3150 return -1;
3151 j->fd = fd;
3152 j->c = c;
3153
3154 // Flush the job. Note our encoder queue may be full, so we
3155 // either have to keep trying in non-blocking mode (what we do) or
3156 // use a dedicated separate thread for draining the queue.
3157 for (;;) {
3158 errno = 0;
3159 hts_tpool_dispatch2(fd->pool, fd->rqueue, cram_flush_thread, j, 1);
3160 int pending = (errno == EAGAIN);
3161 if (cram_flush_result(fd) != 0)
3162 return -1;
3163 if (!pending)
3164 break;
3165
3166 usleep(1000);
3167 }
3168
3169 return 0;
3170 }
3171
3172 /* ----------------------------------------------------------------------
3173 * Compression headers; the first part of the container
3174 */
3175
3176 /*
3177 * Creates a new blank container compression header
3178 *
3179 * Returns header ptr on success
3180 * NULL on failure
3181 */
cram_new_compression_header(void)3182 cram_block_compression_hdr *cram_new_compression_header(void) {
3183 cram_block_compression_hdr *hdr = calloc(1, sizeof(*hdr));
3184 if (!hdr)
3185 return NULL;
3186
3187 if (!(hdr->TD_blk = cram_new_block(CORE, 0))) {
3188 free(hdr);
3189 return NULL;
3190 }
3191
3192 if (!(hdr->TD_hash = kh_init(m_s2i))) {
3193 cram_free_block(hdr->TD_blk);
3194 free(hdr);
3195 return NULL;
3196 }
3197
3198 if (!(hdr->TD_keys = string_pool_create(8192))) {
3199 kh_destroy(m_s2i, hdr->TD_hash);
3200 cram_free_block(hdr->TD_blk);
3201 free(hdr);
3202 return NULL;
3203 }
3204
3205 return hdr;
3206 }
3207
cram_free_compression_header(cram_block_compression_hdr * hdr)3208 void cram_free_compression_header(cram_block_compression_hdr *hdr) {
3209 int i;
3210
3211 if (hdr->landmark)
3212 free(hdr->landmark);
3213
3214 if (hdr->preservation_map)
3215 kh_destroy(map, hdr->preservation_map);
3216
3217 for (i = 0; i < CRAM_MAP_HASH; i++) {
3218 cram_map *m, *m2;
3219 for (m = hdr->rec_encoding_map[i]; m; m = m2) {
3220 m2 = m->next;
3221 if (m->codec)
3222 m->codec->free(m->codec);
3223 free(m);
3224 }
3225 }
3226
3227 for (i = 0; i < CRAM_MAP_HASH; i++) {
3228 cram_map *m, *m2;
3229 for (m = hdr->tag_encoding_map[i]; m; m = m2) {
3230 m2 = m->next;
3231 if (m->codec)
3232 m->codec->free(m->codec);
3233 free(m);
3234 }
3235 }
3236
3237 for (i = 0; i < DS_END; i++) {
3238 if (hdr->codecs[i])
3239 hdr->codecs[i]->free(hdr->codecs[i]);
3240 }
3241
3242 if (hdr->TL)
3243 free(hdr->TL);
3244 if (hdr->TD_blk)
3245 cram_free_block(hdr->TD_blk);
3246 if (hdr->TD_hash)
3247 kh_destroy(m_s2i, hdr->TD_hash);
3248 if (hdr->TD_keys)
3249 string_pool_destroy(hdr->TD_keys);
3250
3251 free(hdr);
3252 }
3253
3254
3255 /* ----------------------------------------------------------------------
3256 * Slices and slice headers
3257 */
3258
cram_free_slice_header(cram_block_slice_hdr * hdr)3259 void cram_free_slice_header(cram_block_slice_hdr *hdr) {
3260 if (!hdr)
3261 return;
3262
3263 if (hdr->block_content_ids)
3264 free(hdr->block_content_ids);
3265
3266 free(hdr);
3267
3268 return;
3269 }
3270
cram_free_slice(cram_slice * s)3271 void cram_free_slice(cram_slice *s) {
3272 if (!s)
3273 return;
3274
3275 if (s->hdr_block)
3276 cram_free_block(s->hdr_block);
3277
3278 if (s->block) {
3279 int i;
3280
3281 if (s->hdr) {
3282 for (i = 0; i < s->hdr->num_blocks; i++) {
3283 cram_free_block(s->block[i]);
3284 }
3285 }
3286 free(s->block);
3287 }
3288
3289 if (s->block_by_id)
3290 free(s->block_by_id);
3291
3292 if (s->hdr)
3293 cram_free_slice_header(s->hdr);
3294
3295 if (s->seqs_blk)
3296 cram_free_block(s->seqs_blk);
3297
3298 if (s->qual_blk)
3299 cram_free_block(s->qual_blk);
3300
3301 if (s->name_blk)
3302 cram_free_block(s->name_blk);
3303
3304 if (s->aux_blk)
3305 cram_free_block(s->aux_blk);
3306
3307 if (s->base_blk)
3308 cram_free_block(s->base_blk);
3309
3310 if (s->soft_blk)
3311 cram_free_block(s->soft_blk);
3312
3313 if (s->cigar)
3314 free(s->cigar);
3315
3316 if (s->crecs)
3317 free(s->crecs);
3318
3319 if (s->features)
3320 free(s->features);
3321
3322 if (s->TN)
3323 free(s->TN);
3324
3325 if (s->pair_keys)
3326 string_pool_destroy(s->pair_keys);
3327
3328 if (s->pair[0])
3329 kh_destroy(m_s2i, s->pair[0]);
3330 if (s->pair[1])
3331 kh_destroy(m_s2i, s->pair[1]);
3332
3333 if (s->aux_block)
3334 free(s->aux_block);
3335
3336 free(s);
3337 }
3338
3339 /*
3340 * Creates a new empty slice in memory, for subsequent writing to
3341 * disk.
3342 *
3343 * Returns cram_slice ptr on success
3344 * NULL on failure
3345 */
cram_new_slice(enum cram_content_type type,int nrecs)3346 cram_slice *cram_new_slice(enum cram_content_type type, int nrecs) {
3347 cram_slice *s = calloc(1, sizeof(*s));
3348 if (!s)
3349 return NULL;
3350
3351 if (!(s->hdr = (cram_block_slice_hdr *)calloc(1, sizeof(*s->hdr))))
3352 goto err;
3353 s->hdr->content_type = type;
3354
3355 s->hdr_block = NULL;
3356 s->block = NULL;
3357 s->block_by_id = NULL;
3358 s->last_apos = 0;
3359 if (!(s->crecs = malloc(nrecs * sizeof(cram_record)))) goto err;
3360 s->cigar = NULL;
3361 s->cigar_alloc = 0;
3362 s->ncigar = 0;
3363
3364 if (!(s->seqs_blk = cram_new_block(EXTERNAL, 0))) goto err;
3365 if (!(s->qual_blk = cram_new_block(EXTERNAL, DS_QS))) goto err;
3366 if (!(s->name_blk = cram_new_block(EXTERNAL, DS_RN))) goto err;
3367 if (!(s->aux_blk = cram_new_block(EXTERNAL, DS_aux))) goto err;
3368 if (!(s->base_blk = cram_new_block(EXTERNAL, DS_IN))) goto err;
3369 if (!(s->soft_blk = cram_new_block(EXTERNAL, DS_SC))) goto err;
3370
3371 s->features = NULL;
3372 s->nfeatures = s->afeatures = 0;
3373
3374 #ifndef TN_external
3375 s->TN = NULL;
3376 s->nTN = s->aTN = 0;
3377 #endif
3378
3379 // Volatile keys as we do realloc in dstring
3380 if (!(s->pair_keys = string_pool_create(8192))) goto err;
3381 if (!(s->pair[0] = kh_init(m_s2i))) goto err;
3382 if (!(s->pair[1] = kh_init(m_s2i))) goto err;
3383
3384 #ifdef BA_external
3385 s->BA_len = 0;
3386 #endif
3387
3388 return s;
3389
3390 err:
3391 if (s)
3392 cram_free_slice(s);
3393
3394 return NULL;
3395 }
3396
3397 /*
3398 * Loads an entire slice.
3399 * FIXME: In 1.0 the native unit of slices within CRAM is broken
3400 * as slices contain references to objects in other slices.
3401 * To work around this while keeping the slice oriented outer loop
3402 * we read all slices and stitch them together into a fake large
3403 * slice instead.
3404 *
3405 * Returns cram_slice ptr on success
3406 * NULL on failure
3407 */
cram_read_slice(cram_fd * fd)3408 cram_slice *cram_read_slice(cram_fd *fd) {
3409 cram_block *b = cram_read_block(fd);
3410 cram_slice *s = calloc(1, sizeof(*s));
3411 int i, n, max_id, min_id;
3412
3413 if (!b || !s)
3414 goto err;
3415
3416 s->hdr_block = b;
3417 switch (b->content_type) {
3418 case MAPPED_SLICE:
3419 case UNMAPPED_SLICE:
3420 if (!(s->hdr = cram_decode_slice_header(fd, b)))
3421 goto err;
3422 break;
3423
3424 default:
3425 hts_log_error("Unexpected block of type %s",
3426 cram_content_type2str(b->content_type));
3427 goto err;
3428 }
3429
3430 if (s->hdr->num_blocks < 1) {
3431 hts_log_error("Slice does not include any data blocks");
3432 goto err;
3433 }
3434
3435 s->block = calloc(n = s->hdr->num_blocks, sizeof(*s->block));
3436 if (!s->block)
3437 goto err;
3438
3439 for (max_id = i = 0, min_id = INT_MAX; i < n; i++) {
3440 if (!(s->block[i] = cram_read_block(fd)))
3441 goto err;
3442
3443 if (s->block[i]->content_type == EXTERNAL) {
3444 if (max_id < s->block[i]->content_id)
3445 max_id = s->block[i]->content_id;
3446 if (min_id > s->block[i]->content_id)
3447 min_id = s->block[i]->content_id;
3448 }
3449 }
3450 if (min_id >= 0 && max_id < 1024) {
3451 if (!(s->block_by_id = calloc(1024, sizeof(s->block[0]))))
3452 goto err;
3453
3454 for (i = 0; i < n; i++) {
3455 if (s->block[i]->content_type != EXTERNAL)
3456 continue;
3457 s->block_by_id[s->block[i]->content_id] = s->block[i];
3458 }
3459 }
3460
3461 /* Initialise encoding/decoding tables */
3462 s->cigar = NULL;
3463 s->cigar_alloc = 0;
3464 s->ncigar = 0;
3465
3466 if (!(s->seqs_blk = cram_new_block(EXTERNAL, 0))) goto err;
3467 if (!(s->qual_blk = cram_new_block(EXTERNAL, DS_QS))) goto err;
3468 if (!(s->name_blk = cram_new_block(EXTERNAL, DS_RN))) goto err;
3469 if (!(s->aux_blk = cram_new_block(EXTERNAL, DS_aux))) goto err;
3470 if (!(s->base_blk = cram_new_block(EXTERNAL, DS_IN))) goto err;
3471 if (!(s->soft_blk = cram_new_block(EXTERNAL, DS_SC))) goto err;
3472
3473 s->crecs = NULL;
3474
3475 s->last_apos = s->hdr->ref_seq_start;
3476
3477 return s;
3478
3479 err:
3480 if (b)
3481 cram_free_block(b);
3482 if (s) {
3483 s->hdr_block = NULL;
3484 cram_free_slice(s);
3485 }
3486 return NULL;
3487 }
3488
3489
3490 /* ----------------------------------------------------------------------
3491 * CRAM file definition (header)
3492 */
3493
3494 /*
3495 * Reads a CRAM file definition structure.
3496 * Returns file_def ptr on success
3497 * NULL on failure
3498 */
cram_read_file_def(cram_fd * fd)3499 cram_file_def *cram_read_file_def(cram_fd *fd) {
3500 cram_file_def *def = malloc(sizeof(*def));
3501 if (!def)
3502 return NULL;
3503
3504 if (26 != hread(fd->fp, &def->magic[0], 26)) {
3505 free(def);
3506 return NULL;
3507 }
3508
3509 if (memcmp(def->magic, "CRAM", 4) != 0) {
3510 free(def);
3511 return NULL;
3512 }
3513
3514 if (def->major_version > 3) {
3515 hts_log_error("CRAM version number mismatch. Expected 1.x, 2.x or 3.x, got %d.%d",
3516 def->major_version, def->minor_version);
3517 free(def);
3518 return NULL;
3519 }
3520
3521 fd->first_container += 26;
3522 fd->last_slice = 0;
3523
3524 return def;
3525 }
3526
3527 /*
3528 * Writes a cram_file_def structure to cram_fd.
3529 * Returns 0 on success
3530 * -1 on failure
3531 */
cram_write_file_def(cram_fd * fd,cram_file_def * def)3532 int cram_write_file_def(cram_fd *fd, cram_file_def *def) {
3533 return (hwrite(fd->fp, &def->magic[0], 26) == 26) ? 0 : -1;
3534 }
3535
cram_free_file_def(cram_file_def * def)3536 void cram_free_file_def(cram_file_def *def) {
3537 if (def) free(def);
3538 }
3539
3540 /* ----------------------------------------------------------------------
3541 * SAM header I/O
3542 */
3543
3544
3545 /*
3546 * Reads the SAM header from the first CRAM data block.
3547 * Also performs minimal parsing to extract read-group
3548 * and sample information.
3549
3550 * Returns SAM hdr ptr on success
3551 * NULL on failure
3552 */
cram_read_SAM_hdr(cram_fd * fd)3553 SAM_hdr *cram_read_SAM_hdr(cram_fd *fd) {
3554 int32_t header_len;
3555 char *header;
3556 SAM_hdr *hdr;
3557
3558 /* 1.1 onwards stores the header in the first block of a container */
3559 if (CRAM_MAJOR_VERS(fd->version) == 1) {
3560 /* Length */
3561 if (-1 == int32_decode(fd, &header_len))
3562 return NULL;
3563
3564 /* Alloc and read */
3565 if (header_len < 0 || NULL == (header = malloc((size_t) header_len+1)))
3566 return NULL;
3567
3568 if (header_len != hread(fd->fp, header, header_len))
3569 return NULL;
3570 header[header_len] = '\0';
3571
3572 fd->first_container += 4 + header_len;
3573 } else {
3574 cram_container *c = cram_read_container(fd);
3575 cram_block *b;
3576 int i;
3577 int64_t len;
3578
3579 if (!c)
3580 return NULL;
3581
3582 fd->first_container += c->length + c->offset;
3583
3584 if (c->num_blocks < 1) {
3585 cram_free_container(c);
3586 return NULL;
3587 }
3588
3589 if (!(b = cram_read_block(fd))) {
3590 cram_free_container(c);
3591 return NULL;
3592 }
3593 if (cram_uncompress_block(b) != 0) {
3594 cram_free_container(c);
3595 cram_free_block(b);
3596 return NULL;
3597 }
3598
3599 len = b->comp_size + 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
3600 itf8_size(b->content_id) +
3601 itf8_size(b->uncomp_size) +
3602 itf8_size(b->comp_size);
3603
3604 /* Extract header from 1st block */
3605 if (-1 == int32_get_blk(b, &header_len) ||
3606 header_len < 0 || /* Spec. says signed... why? */
3607 b->uncomp_size - 4 < header_len) {
3608 cram_free_container(c);
3609 cram_free_block(b);
3610 return NULL;
3611 }
3612 if (NULL == (header = malloc((size_t) header_len+1))) {
3613 cram_free_container(c);
3614 cram_free_block(b);
3615 return NULL;
3616 }
3617 memcpy(header, BLOCK_END(b), header_len);
3618 header[header_len] = '\0';
3619 cram_free_block(b);
3620
3621 /* Consume any remaining blocks */
3622 for (i = 1; i < c->num_blocks; i++) {
3623 if (!(b = cram_read_block(fd))) {
3624 cram_free_container(c);
3625 return NULL;
3626 }
3627 len += b->comp_size + 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
3628 itf8_size(b->content_id) +
3629 itf8_size(b->uncomp_size) +
3630 itf8_size(b->comp_size);
3631 cram_free_block(b);
3632 }
3633
3634 if (c->length > 0 && len > 0 && c->length > len) {
3635 // Consume padding
3636 char *pads = malloc(c->length - len);
3637 if (!pads) {
3638 cram_free_container(c);
3639 return NULL;
3640 }
3641
3642 if (c->length - len != hread(fd->fp, pads, c->length - len)) {
3643 cram_free_container(c);
3644 return NULL;
3645 }
3646 free(pads);
3647 }
3648
3649 cram_free_container(c);
3650 }
3651
3652 /* Parse */
3653 hdr = sam_hdr_parse_(header, header_len);
3654 free(header);
3655
3656 return hdr;
3657 }
3658
3659 /*
3660 * Converts 'in' to a full pathname to store in out.
3661 * Out must be at least PATH_MAX bytes long.
3662 */
full_path(char * out,char * in)3663 static void full_path(char *out, char *in) {
3664 size_t in_l = strlen(in);
3665 if (*in == '/' ||
3666 // Windows paths
3667 (in_l > 3 && toupper(*in) >= 'A' && toupper(*in) <= 'Z' &&
3668 in[1] == ':' && (in[2] == '/' || in[2] == '\\'))) {
3669 strncpy(out, in, PATH_MAX);
3670 out[PATH_MAX-1] = 0;
3671 } else {
3672 int len;
3673
3674 // unable to get dir or out+in is too long
3675 if (!getcwd(out, PATH_MAX) ||
3676 (len = strlen(out))+1+strlen(in) >= PATH_MAX) {
3677 strncpy(out, in, PATH_MAX);
3678 out[PATH_MAX-1] = 0;
3679 return;
3680 }
3681
3682 sprintf(out+len, "/%.*s", PATH_MAX - len, in);
3683
3684 // FIXME: cope with `pwd`/../../../foo.fa ?
3685 }
3686 }
3687
3688 /*
3689 * Writes a CRAM SAM header.
3690 * Returns 0 on success
3691 * -1 on failure
3692 */
cram_write_SAM_hdr(cram_fd * fd,SAM_hdr * hdr)3693 int cram_write_SAM_hdr(cram_fd *fd, SAM_hdr *hdr) {
3694 int header_len;
3695 int blank_block = (CRAM_MAJOR_VERS(fd->version) >= 3);
3696
3697 /* Write CRAM MAGIC if not yet written. */
3698 if (fd->file_def->major_version == 0) {
3699 fd->file_def->major_version = CRAM_MAJOR_VERS(fd->version);
3700 fd->file_def->minor_version = CRAM_MINOR_VERS(fd->version);
3701 if (0 != cram_write_file_def(fd, fd->file_def))
3702 return -1;
3703 }
3704
3705 /* 1.0 requires an UNKNOWN read-group */
3706 if (CRAM_MAJOR_VERS(fd->version) == 1) {
3707 if (!sam_hdr_find_rg(hdr, "UNKNOWN"))
3708 if (sam_hdr_add(hdr, "RG",
3709 "ID", "UNKNOWN", "SM", "UNKNOWN", NULL))
3710 return -1;
3711 }
3712
3713 /* Fix M5 strings */
3714 if (fd->refs && !fd->no_ref) {
3715 int i;
3716 for (i = 0; i < hdr->nref; i++) {
3717 SAM_hdr_type *ty;
3718 char *ref;
3719
3720 if (!(ty = sam_hdr_find(hdr, "SQ", "SN", hdr->ref[i].name)))
3721 return -1;
3722
3723 if (!sam_hdr_find_key(hdr, ty, "M5", NULL)) {
3724 char unsigned buf[16];
3725 char buf2[33];
3726 int rlen;
3727 hts_md5_context *md5;
3728
3729 if (!fd->refs ||
3730 !fd->refs->ref_id ||
3731 !fd->refs->ref_id[i]) {
3732 return -1;
3733 }
3734 rlen = fd->refs->ref_id[i]->length;
3735 if (!(md5 = hts_md5_init()))
3736 return -1;
3737 ref = cram_get_ref(fd, i, 1, rlen);
3738 if (NULL == ref) return -1;
3739 rlen = fd->refs->ref_id[i]->length; /* In case it just loaded */
3740 hts_md5_update(md5, ref, rlen);
3741 hts_md5_final(buf, md5);
3742 hts_md5_destroy(md5);
3743 cram_ref_decr(fd->refs, i);
3744
3745 hts_md5_hex(buf2, buf);
3746 if (sam_hdr_update(hdr, ty, "M5", buf2, NULL))
3747 return -1;
3748 }
3749
3750 if (fd->ref_fn) {
3751 char ref_fn[PATH_MAX];
3752 full_path(ref_fn, fd->ref_fn);
3753 if (sam_hdr_update(hdr, ty, "UR", ref_fn, NULL))
3754 return -1;
3755 }
3756 }
3757 }
3758
3759 if (sam_hdr_rebuild(hdr))
3760 return -1;
3761
3762 /* Length */
3763 header_len = sam_hdr_length(hdr);
3764 if (CRAM_MAJOR_VERS(fd->version) == 1) {
3765 if (-1 == int32_encode(fd, header_len))
3766 return -1;
3767
3768 /* Text data */
3769 if (header_len != hwrite(fd->fp, sam_hdr_str(hdr), header_len))
3770 return -1;
3771 } else {
3772 /* Create block(s) inside a container */
3773 cram_block *b = cram_new_block(FILE_HEADER, 0);
3774 cram_container *c = cram_new_container(0, 0);
3775 int padded_length;
3776 char *pads;
3777 int is_cram_3 = (CRAM_MAJOR_VERS(fd->version) >= 3);
3778
3779 if (!b || !c) {
3780 if (b) cram_free_block(b);
3781 if (c) cram_free_container(c);
3782 return -1;
3783 }
3784
3785 int32_put_blk(b, header_len);
3786 if (header_len)
3787 BLOCK_APPEND(b, sam_hdr_str(hdr), header_len);
3788 BLOCK_UPLEN(b);
3789
3790 // Compress header block if V3.0 and above
3791 if (CRAM_MAJOR_VERS(fd->version) >= 3)
3792 cram_compress_block(fd, b, NULL, -1, -1);
3793
3794 if (blank_block) {
3795 c->length = b->comp_size + 2 + 4*is_cram_3 +
3796 itf8_size(b->content_id) +
3797 itf8_size(b->uncomp_size) +
3798 itf8_size(b->comp_size);
3799
3800 c->num_blocks = 2;
3801 c->num_landmarks = 2;
3802 if (!(c->landmark = malloc(2*sizeof(*c->landmark)))) {
3803 cram_free_block(b);
3804 cram_free_container(c);
3805 return -1;
3806 }
3807 c->landmark[0] = 0;
3808 c->landmark[1] = c->length;
3809
3810 // Plus extra storage for uncompressed secondary blank block
3811 padded_length = MIN(c->length*.5, 10000);
3812 c->length += padded_length + 2 + 4*is_cram_3 +
3813 itf8_size(b->content_id) +
3814 itf8_size(padded_length)*2;
3815 } else {
3816 // Pad the block instead.
3817 c->num_blocks = 1;
3818 c->num_landmarks = 1;
3819 if (!(c->landmark = malloc(sizeof(*c->landmark))))
3820 return -1;
3821 c->landmark[0] = 0;
3822
3823 padded_length = MAX(c->length*1.5, 10000) - c->length;
3824
3825 c->length = b->comp_size + padded_length +
3826 2 + 4*is_cram_3 +
3827 itf8_size(b->content_id) +
3828 itf8_size(b->uncomp_size) +
3829 itf8_size(b->comp_size);
3830
3831 if (NULL == (pads = calloc(1, padded_length))) {
3832 cram_free_block(b);
3833 cram_free_container(c);
3834 return -1;
3835 }
3836 BLOCK_APPEND(b, pads, padded_length);
3837 BLOCK_UPLEN(b);
3838 free(pads);
3839 }
3840
3841 if (-1 == cram_write_container(fd, c)) {
3842 cram_free_block(b);
3843 cram_free_container(c);
3844 return -1;
3845 }
3846
3847 if (-1 == cram_write_block(fd, b)) {
3848 cram_free_block(b);
3849 cram_free_container(c);
3850 return -1;
3851 }
3852
3853 if (blank_block) {
3854 BLOCK_RESIZE(b, padded_length);
3855 memset(BLOCK_DATA(b), 0, padded_length);
3856 BLOCK_SIZE(b) = padded_length;
3857 BLOCK_UPLEN(b);
3858 b->method = RAW;
3859 if (-1 == cram_write_block(fd, b)) {
3860 cram_free_block(b);
3861 cram_free_container(c);
3862 return -1;
3863 }
3864 }
3865
3866 cram_free_block(b);
3867 cram_free_container(c);
3868 }
3869
3870 if (-1 == refs_from_header(fd->refs, fd, fd->header))
3871 return -1;
3872 if (-1 == refs2id(fd->refs, fd->header))
3873 return -1;
3874
3875 if (0 != hflush(fd->fp))
3876 return -1;
3877
3878 RP("=== Finishing saving header ===\n");
3879
3880 return 0;
3881 }
3882
3883 /* ----------------------------------------------------------------------
3884 * The top-level cram opening, closing and option handling
3885 */
3886
3887 /*
3888 * Initialises the lookup tables. These could be global statics, but they're
3889 * clumsy to setup in a multi-threaded environment unless we generate
3890 * verbatim code and include that.
3891 */
cram_init_tables(cram_fd * fd)3892 static void cram_init_tables(cram_fd *fd) {
3893 int i;
3894
3895 memset(fd->L1, 4, 256);
3896 fd->L1['A'] = 0; fd->L1['a'] = 0;
3897 fd->L1['C'] = 1; fd->L1['c'] = 1;
3898 fd->L1['G'] = 2; fd->L1['g'] = 2;
3899 fd->L1['T'] = 3; fd->L1['t'] = 3;
3900
3901 memset(fd->L2, 5, 256);
3902 fd->L2['A'] = 0; fd->L2['a'] = 0;
3903 fd->L2['C'] = 1; fd->L2['c'] = 1;
3904 fd->L2['G'] = 2; fd->L2['g'] = 2;
3905 fd->L2['T'] = 3; fd->L2['t'] = 3;
3906 fd->L2['N'] = 4; fd->L2['n'] = 4;
3907
3908 if (CRAM_MAJOR_VERS(fd->version) == 1) {
3909 for (i = 0; i < 0x200; i++) {
3910 int f = 0;
3911
3912 if (i & CRAM_FPAIRED) f |= BAM_FPAIRED;
3913 if (i & CRAM_FPROPER_PAIR) f |= BAM_FPROPER_PAIR;
3914 if (i & CRAM_FUNMAP) f |= BAM_FUNMAP;
3915 if (i & CRAM_FREVERSE) f |= BAM_FREVERSE;
3916 if (i & CRAM_FREAD1) f |= BAM_FREAD1;
3917 if (i & CRAM_FREAD2) f |= BAM_FREAD2;
3918 if (i & CRAM_FSECONDARY) f |= BAM_FSECONDARY;
3919 if (i & CRAM_FQCFAIL) f |= BAM_FQCFAIL;
3920 if (i & CRAM_FDUP) f |= BAM_FDUP;
3921
3922 fd->bam_flag_swap[i] = f;
3923 }
3924
3925 for (i = 0; i < 0x1000; i++) {
3926 int g = 0;
3927
3928 if (i & BAM_FPAIRED) g |= CRAM_FPAIRED;
3929 if (i & BAM_FPROPER_PAIR) g |= CRAM_FPROPER_PAIR;
3930 if (i & BAM_FUNMAP) g |= CRAM_FUNMAP;
3931 if (i & BAM_FREVERSE) g |= CRAM_FREVERSE;
3932 if (i & BAM_FREAD1) g |= CRAM_FREAD1;
3933 if (i & BAM_FREAD2) g |= CRAM_FREAD2;
3934 if (i & BAM_FSECONDARY) g |= CRAM_FSECONDARY;
3935 if (i & BAM_FQCFAIL) g |= CRAM_FQCFAIL;
3936 if (i & BAM_FDUP) g |= CRAM_FDUP;
3937
3938 fd->cram_flag_swap[i] = g;
3939 }
3940 } else {
3941 /* NOP */
3942 for (i = 0; i < 0x1000; i++)
3943 fd->bam_flag_swap[i] = i;
3944 for (i = 0; i < 0x1000; i++)
3945 fd->cram_flag_swap[i] = i;
3946 }
3947
3948 memset(fd->cram_sub_matrix, 4, 32*32);
3949 for (i = 0; i < 32; i++) {
3950 fd->cram_sub_matrix[i]['A'&0x1f]=0;
3951 fd->cram_sub_matrix[i]['C'&0x1f]=1;
3952 fd->cram_sub_matrix[i]['G'&0x1f]=2;
3953 fd->cram_sub_matrix[i]['T'&0x1f]=3;
3954 fd->cram_sub_matrix[i]['N'&0x1f]=4;
3955 }
3956 for (i = 0; i < 20; i+=4) {
3957 int j;
3958 for (j = 0; j < 20; j++) {
3959 fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
3960 fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
3961 fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
3962 fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
3963 }
3964 fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+0]&0x1f]=0;
3965 fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+1]&0x1f]=1;
3966 fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+2]&0x1f]=2;
3967 fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+3]&0x1f]=3;
3968 }
3969 }
3970
3971 // Default version numbers for CRAM
3972 static int major_version = 3;
3973 static int minor_version = 0;
3974
3975 /*
3976 * Opens a CRAM file for read (mode "rb") or write ("wb").
3977 * The filename may be "-" to indicate stdin or stdout.
3978 *
3979 * Returns file handle on success
3980 * NULL on failure.
3981 */
cram_open(const char * filename,const char * mode)3982 cram_fd *cram_open(const char *filename, const char *mode) {
3983 hFILE *fp;
3984 cram_fd *fd;
3985 char fmode[3]= { mode[0], '\0', '\0' };
3986
3987 if (strlen(mode) > 1 && (mode[1] == 'b' || mode[1] == 'c')) {
3988 fmode[1] = 'b';
3989 }
3990
3991 fp = hopen(filename, fmode);
3992 if (!fp)
3993 return NULL;
3994
3995 fd = cram_dopen(fp, filename, mode);
3996 if (!fd)
3997 hclose_abruptly(fp);
3998
3999 return fd;
4000 }
4001
4002 /* Opens an existing stream for reading or writing.
4003 *
4004 * Returns file handle on success;
4005 * NULL on failure.
4006 */
cram_dopen(hFILE * fp,const char * filename,const char * mode)4007 cram_fd *cram_dopen(hFILE *fp, const char *filename, const char *mode) {
4008 int i;
4009 char *cp;
4010 cram_fd *fd = calloc(1, sizeof(*fd));
4011 if (!fd)
4012 return NULL;
4013
4014 fd->level = 5;
4015 for (i = 0; mode[i]; i++) {
4016 if (mode[i] >= '0' && mode[i] <= '9') {
4017 fd->level = mode[i] - '0';
4018 break;
4019 }
4020 }
4021
4022 fd->fp = fp;
4023 fd->mode = *mode;
4024 fd->first_container = 0;
4025
4026 if (fd->mode == 'r') {
4027 /* Reader */
4028
4029 if (!(fd->file_def = cram_read_file_def(fd)))
4030 goto err;
4031
4032 fd->version = fd->file_def->major_version * 256 +
4033 fd->file_def->minor_version;
4034
4035 if (!(fd->header = cram_read_SAM_hdr(fd))) {
4036 cram_free_file_def(fd->file_def);
4037 goto err;
4038 }
4039
4040 } else {
4041 /* Writer */
4042 cram_file_def *def = calloc(1, sizeof(*def));
4043 if (!def)
4044 return NULL;
4045
4046 fd->file_def = def;
4047
4048 def->magic[0] = 'C';
4049 def->magic[1] = 'R';
4050 def->magic[2] = 'A';
4051 def->magic[3] = 'M';
4052 def->major_version = 0; // Indicator to write file def later.
4053 def->minor_version = 0;
4054 memset(def->file_id, 0, 20);
4055 strncpy(def->file_id, filename, 20);
4056
4057 fd->version = major_version * 256 + minor_version;
4058
4059 /* SAM header written later along with this file_def */
4060 }
4061
4062 cram_init_tables(fd);
4063
4064 fd->prefix = strdup((cp = strrchr(filename, '/')) ? cp+1 : filename);
4065 if (!fd->prefix)
4066 goto err;
4067 fd->first_base = fd->last_base = -1;
4068 fd->record_counter = 0;
4069
4070 fd->ctr = NULL;
4071 fd->refs = refs_create();
4072 if (!fd->refs)
4073 goto err;
4074 fd->ref_id = -2;
4075 fd->ref = NULL;
4076
4077 fd->decode_md = 0;
4078 fd->seqs_per_slice = SEQS_PER_SLICE;
4079 fd->bases_per_slice = BASES_PER_SLICE;
4080 fd->slices_per_container = SLICE_PER_CNT;
4081 fd->embed_ref = 0;
4082 fd->no_ref = 0;
4083 fd->ignore_md5 = 0;
4084 fd->lossy_read_names = 0;
4085 fd->use_bz2 = 0;
4086 fd->use_rans = (CRAM_MAJOR_VERS(fd->version) >= 3);
4087 fd->use_lzma = 0;
4088 fd->multi_seq = -1;
4089 fd->unsorted = 0;
4090 fd->shared_ref = 0;
4091
4092 fd->index = NULL;
4093 fd->own_pool = 0;
4094 fd->pool = NULL;
4095 fd->rqueue = NULL;
4096 fd->job_pending = NULL;
4097 fd->ooc = 0;
4098 fd->required_fields = INT_MAX;
4099
4100 for (i = 0; i < DS_END; i++)
4101 fd->m[i] = cram_new_metrics();
4102
4103 if (!(fd->tags_used = kh_init(m_metrics)))
4104 goto err;
4105
4106 fd->range.refid = -2; // no ref.
4107 fd->eof = 1; // See samtools issue #150
4108 fd->ref_fn = NULL;
4109
4110 fd->bl = NULL;
4111
4112 /* Initialise dummy refs from the @SQ headers */
4113 if (-1 == refs_from_header(fd->refs, fd, fd->header))
4114 goto err;
4115
4116 return fd;
4117
4118 err:
4119 if (fd)
4120 free(fd);
4121
4122 return NULL;
4123 }
4124
4125 /*
4126 * Seek within a CRAM file.
4127 *
4128 * Returns 0 on success
4129 * -1 on failure
4130 */
cram_seek(cram_fd * fd,off_t offset,int whence)4131 int cram_seek(cram_fd *fd, off_t offset, int whence) {
4132 char buf[65536];
4133
4134 fd->ooc = 0;
4135
4136 if (hseek(fd->fp, offset, whence) >= 0) {
4137 return 0;
4138 }
4139
4140 if (!(whence == SEEK_CUR && offset >= 0))
4141 return -1;
4142
4143 /* Couldn't fseek, but we're in SEEK_CUR mode so read instead */
4144 while (offset > 0) {
4145 int len = MIN(65536, offset);
4146 if (len != hread(fd->fp, buf, len))
4147 return -1;
4148 offset -= len;
4149 }
4150
4151 return 0;
4152 }
4153
4154 /*
4155 * Flushes a CRAM file.
4156 * Useful for when writing to stdout without wishing to close the stream.
4157 *
4158 * Returns 0 on success
4159 * -1 on failure
4160 */
cram_flush(cram_fd * fd)4161 int cram_flush(cram_fd *fd) {
4162 if (!fd)
4163 return -1;
4164
4165 if (fd->mode == 'w' && fd->ctr) {
4166 if(fd->ctr->slice)
4167 cram_update_curr_slice(fd->ctr);
4168
4169 if (-1 == cram_flush_container_mt(fd, fd->ctr))
4170 return -1;
4171 }
4172
4173 return 0;
4174 }
4175
4176 /*
4177 * Closes a CRAM file.
4178 * Returns 0 on success
4179 * -1 on failure
4180 */
cram_close(cram_fd * fd)4181 int cram_close(cram_fd *fd) {
4182 spare_bams *bl, *next;
4183 int i;
4184
4185 if (!fd)
4186 return -1;
4187
4188 if (fd->mode == 'w' && fd->ctr) {
4189 if(fd->ctr->slice)
4190 cram_update_curr_slice(fd->ctr);
4191
4192 if (-1 == cram_flush_container_mt(fd, fd->ctr))
4193 return -1;
4194 }
4195
4196 if (fd->pool && fd->eof >= 0) {
4197 hts_tpool_process_flush(fd->rqueue);
4198
4199 if (0 != cram_flush_result(fd))
4200 return -1;
4201
4202 pthread_mutex_destroy(&fd->metrics_lock);
4203 pthread_mutex_destroy(&fd->ref_lock);
4204 pthread_mutex_destroy(&fd->bam_list_lock);
4205
4206 fd->ctr = NULL; // prevent double freeing
4207
4208 //fprintf(stderr, "CRAM: destroy queue %p\n", fd->rqueue);
4209
4210 hts_tpool_process_destroy(fd->rqueue);
4211 }
4212
4213 if (fd->mode == 'w') {
4214 /* Write EOF block */
4215 if (CRAM_MAJOR_VERS(fd->version) == 3) {
4216 if (38 != hwrite(fd->fp,
4217 "\x0f\x00\x00\x00\xff\xff\xff\xff" // Cont HDR
4218 "\x0f\xe0\x45\x4f\x46\x00\x00\x00" // Cont HDR
4219 "\x00\x01\x00" // Cont HDR
4220 "\x05\xbd\xd9\x4f" // CRC32
4221 "\x00\x01\x00\x06\x06" // Comp.HDR blk
4222 "\x01\x00\x01\x00\x01\x00" // Comp.HDR blk
4223 "\xee\x63\x01\x4b", // CRC32
4224 38))
4225 return -1;
4226 } else {
4227 if (30 != hwrite(fd->fp,
4228 "\x0b\x00\x00\x00\xff\xff\xff\xff"
4229 "\x0f\xe0\x45\x4f\x46\x00\x00\x00"
4230 "\x00\x01\x00\x00\x01\x00\x06\x06"
4231 "\x01\x00\x01\x00\x01\x00", 30))
4232 return -1;
4233 }
4234 }
4235
4236 for (bl = fd->bl; bl; bl = next) {
4237 int i, max_rec = fd->seqs_per_slice * fd->slices_per_container;
4238
4239 next = bl->next;
4240 for (i = 0; i < max_rec; i++) {
4241 if (bl->bams[i])
4242 bam_free(bl->bams[i]);
4243 }
4244 free(bl->bams);
4245 free(bl);
4246 }
4247
4248 if (hclose(fd->fp) != 0)
4249 return -1;
4250
4251 if (fd->file_def)
4252 cram_free_file_def(fd->file_def);
4253
4254 if (fd->header)
4255 sam_hdr_free(fd->header);
4256
4257 free(fd->prefix);
4258
4259 if (fd->ctr)
4260 cram_free_container(fd->ctr);
4261
4262 if (fd->refs)
4263 refs_free(fd->refs);
4264 if (fd->ref_free)
4265 free(fd->ref_free);
4266
4267 for (i = 0; i < DS_END; i++)
4268 if (fd->m[i])
4269 free(fd->m[i]);
4270
4271 if (fd->tags_used) {
4272 khint_t k;
4273
4274 for (k = kh_begin(fd->tags_used); k != kh_end(fd->tags_used); k++) {
4275 if (kh_exist(fd->tags_used, k))
4276 free(kh_val(fd->tags_used, k));
4277 }
4278
4279 kh_destroy(m_metrics, fd->tags_used);
4280 }
4281
4282 if (fd->index)
4283 cram_index_free(fd);
4284
4285 if (fd->own_pool && fd->pool)
4286 hts_tpool_destroy(fd->pool);
4287
4288 free(fd);
4289 return 0;
4290 }
4291
4292 /*
4293 * Returns 1 if we hit an EOF while reading.
4294 */
cram_eof(cram_fd * fd)4295 int cram_eof(cram_fd *fd) {
4296 return fd->eof;
4297 }
4298
4299
4300 /*
4301 * Sets options on the cram_fd. See CRAM_OPT_* definitions in cram_structs.h.
4302 * Use this immediately after opening.
4303 *
4304 * Returns 0 on success
4305 * -1 on failure
4306 */
cram_set_option(cram_fd * fd,enum hts_fmt_option opt,...)4307 int cram_set_option(cram_fd *fd, enum hts_fmt_option opt, ...) {
4308 int r;
4309 va_list args;
4310
4311 va_start(args, opt);
4312 r = cram_set_voption(fd, opt, args);
4313 va_end(args);
4314
4315 return r;
4316 }
4317
4318 /*
4319 * Sets options on the cram_fd. See CRAM_OPT_* definitions in cram_structs.h.
4320 * Use this immediately after opening.
4321 *
4322 * Returns 0 on success
4323 * -1 on failure
4324 */
cram_set_voption(cram_fd * fd,enum hts_fmt_option opt,va_list args)4325 int cram_set_voption(cram_fd *fd, enum hts_fmt_option opt, va_list args) {
4326 refs_t *refs;
4327
4328 if (!fd) {
4329 errno = EBADF;
4330 return -1;
4331 }
4332
4333 switch (opt) {
4334 case CRAM_OPT_DECODE_MD:
4335 fd->decode_md = va_arg(args, int);
4336 break;
4337
4338 case CRAM_OPT_PREFIX:
4339 if (fd->prefix)
4340 free(fd->prefix);
4341 if (!(fd->prefix = strdup(va_arg(args, char *))))
4342 return -1;
4343 break;
4344
4345 case CRAM_OPT_VERBOSITY:
4346 break;
4347
4348 case CRAM_OPT_SEQS_PER_SLICE:
4349 fd->seqs_per_slice = va_arg(args, int);
4350 break;
4351
4352 case CRAM_OPT_BASES_PER_SLICE:
4353 fd->bases_per_slice = va_arg(args, int);
4354 break;
4355
4356 case CRAM_OPT_SLICES_PER_CONTAINER:
4357 fd->slices_per_container = va_arg(args, int);
4358 break;
4359
4360 case CRAM_OPT_EMBED_REF:
4361 fd->embed_ref = va_arg(args, int);
4362 break;
4363
4364 case CRAM_OPT_NO_REF:
4365 fd->no_ref = va_arg(args, int);
4366 break;
4367
4368 case CRAM_OPT_IGNORE_MD5:
4369 fd->ignore_md5 = va_arg(args, int);
4370 break;
4371
4372 case CRAM_OPT_LOSSY_NAMES:
4373 fd->lossy_read_names = va_arg(args, int);
4374 // Currently lossy read names required paired (attached) reads.
4375 // TLEN 0 or being 1 out causes read pairs to be detached, breaking
4376 // the lossy read name compression, so we have extra options to
4377 // slacken the exact TLEN round-trip checks.
4378 fd->tlen_approx = fd->lossy_read_names;
4379 fd->tlen_zero = fd->lossy_read_names;
4380 break;
4381
4382 case CRAM_OPT_USE_BZIP2:
4383 fd->use_bz2 = va_arg(args, int);
4384 break;
4385
4386 case CRAM_OPT_USE_RANS:
4387 fd->use_rans = va_arg(args, int);
4388 break;
4389
4390 case CRAM_OPT_USE_LZMA:
4391 fd->use_lzma = va_arg(args, int);
4392 break;
4393
4394 case CRAM_OPT_SHARED_REF:
4395 fd->shared_ref = 1;
4396 refs = va_arg(args, refs_t *);
4397 if (refs != fd->refs) {
4398 if (fd->refs)
4399 refs_free(fd->refs);
4400 fd->refs = refs;
4401 fd->refs->count++;
4402 }
4403 break;
4404
4405 case CRAM_OPT_RANGE:
4406 fd->range = *va_arg(args, cram_range *);
4407 return cram_seek_to_refpos(fd, &fd->range);
4408
4409 case CRAM_OPT_REFERENCE:
4410 return cram_load_reference(fd, va_arg(args, char *));
4411
4412 case CRAM_OPT_VERSION: {
4413 int major, minor;
4414 char *s = va_arg(args, char *);
4415 if (2 != sscanf(s, "%d.%d", &major, &minor)) {
4416 hts_log_error("Malformed version string %s", s);
4417 return -1;
4418 }
4419 if (!((major == 1 && minor == 0) ||
4420 (major == 2 && (minor == 0 || minor == 1)) ||
4421 (major == 3 && minor == 0))) {
4422 hts_log_error("Unknown version string; use 1.0, 2.0, 2.1 or 3.0");
4423 errno = EINVAL;
4424 return -1;
4425 }
4426 fd->version = major*256 + minor;
4427
4428 if (CRAM_MAJOR_VERS(fd->version) >= 3)
4429 fd->use_rans = 1;
4430 break;
4431 }
4432
4433 case CRAM_OPT_MULTI_SEQ_PER_SLICE:
4434 fd->multi_seq = va_arg(args, int);
4435 break;
4436
4437 case CRAM_OPT_NTHREADS: {
4438 int nthreads = va_arg(args, int);
4439 if (nthreads >= 1) {
4440 if (!(fd->pool = hts_tpool_init(nthreads)))
4441 return -1;
4442
4443 fd->rqueue = hts_tpool_process_init(fd->pool, nthreads*2, 0);
4444 pthread_mutex_init(&fd->metrics_lock, NULL);
4445 pthread_mutex_init(&fd->ref_lock, NULL);
4446 pthread_mutex_init(&fd->bam_list_lock, NULL);
4447 fd->shared_ref = 1;
4448 fd->own_pool = 1;
4449 }
4450 break;
4451 }
4452
4453 case CRAM_OPT_THREAD_POOL: {
4454 htsThreadPool *p = va_arg(args, htsThreadPool *);
4455 fd->pool = p ? p->pool : NULL;
4456 if (fd->pool) {
4457 fd->rqueue = hts_tpool_process_init(fd->pool,
4458 p->qsize ? p->qsize : hts_tpool_size(fd->pool)*2,
4459 0);
4460 pthread_mutex_init(&fd->metrics_lock, NULL);
4461 pthread_mutex_init(&fd->ref_lock, NULL);
4462 pthread_mutex_init(&fd->bam_list_lock, NULL);
4463 }
4464 fd->shared_ref = 1; // Needed to avoid clobbering ref between threads
4465 fd->own_pool = 0;
4466
4467 //fd->qsize = 1;
4468 //fd->decoded = calloc(fd->qsize, sizeof(cram_container *));
4469 //hts_tpool_dispatch(fd->pool, cram_decoder_thread, fd);
4470 break;
4471 }
4472
4473 case CRAM_OPT_REQUIRED_FIELDS:
4474 fd->required_fields = va_arg(args, int);
4475 break;
4476
4477 case HTS_OPT_COMPRESSION_LEVEL:
4478 fd->level = va_arg(args, int);
4479 break;
4480
4481 default:
4482 hts_log_error("Unknown CRAM option code %d", opt);
4483 errno = EINVAL;
4484 return -1;
4485 }
4486
4487 return 0;
4488 }
4489
cram_check_EOF(cram_fd * fd)4490 int cram_check_EOF(cram_fd *fd)
4491 {
4492 // Byte 9 in these templates is & with 0x0f to resolve differences
4493 // between ITF-8 interpretations between early Java and C
4494 // implementations of CRAM
4495 static const unsigned char TEMPLATE_2_1[30] = {
4496 0x0b, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x0f, 0xe0,
4497 0x45, 0x4f, 0x46, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00,
4498 0x01, 0x00, 0x06, 0x06, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00
4499 };
4500 static const unsigned char TEMPLATE_3[38] = {
4501 0x0f, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x0f, 0xe0,
4502 0x45, 0x4f, 0x46, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x05,
4503 0xbd, 0xd9, 0x4f, 0x00, 0x01, 0x00, 0x06, 0x06, 0x01, 0x00,
4504 0x01, 0x00, 0x01, 0x00, 0xee, 0x63, 0x01, 0x4b
4505 };
4506
4507 unsigned char buf[38]; // max(sizeof TEMPLATE_*)
4508
4509 uint8_t major = CRAM_MAJOR_VERS(fd->version);
4510 uint8_t minor = CRAM_MINOR_VERS(fd->version);
4511
4512 const unsigned char *template;
4513 ssize_t template_len;
4514 if ((major < 2) ||
4515 (major == 2 && minor == 0)) {
4516 return 3; // No EOF support in cram versions less than 2.1
4517 } else if (major == 2 && minor == 1) {
4518 template = TEMPLATE_2_1;
4519 template_len = sizeof TEMPLATE_2_1;
4520 } else {
4521 template = TEMPLATE_3;
4522 template_len = sizeof TEMPLATE_3;
4523 }
4524
4525 off_t offset = htell(fd->fp);
4526 if (hseek(fd->fp, -template_len, SEEK_END) < 0) {
4527 if (errno == ESPIPE) {
4528 hclearerr(fd->fp);
4529 return 2;
4530 }
4531 else {
4532 return -1;
4533 }
4534 }
4535 if (hread(fd->fp, buf, template_len) != template_len) return -1;
4536 if (hseek(fd->fp, offset, SEEK_SET) < 0) return -1;
4537 buf[8] &= 0x0f;
4538 return (memcmp(template, buf, template_len) == 0)? 1 : 0;
4539 }
4540