1 /*
2 * Copyright (c) 2013 Genome Research Ltd.
3 * Author(s): James Bonfield
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above
12 * copyright notice, this list of conditions and the following
13 * disclaimer in the documentation and/or other materials provided
14 * with the distribution.
15 *
16 * 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
17 * Institute nor the names of its contributors may be used to endorse
18 * or promote products derived from this software without specific
19 * prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS
22 * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
24 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH
25 * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 */
33
34 /*
35 * Author: James Bonfield, Wellcome Trust Sanger Institute. 2013
36 *
37 * CRAM I/O primitives.
38 *
39 * - ITF8 encoding and decoding.
40 * - Block based I/O
41 * - Zlib inflating and deflating (memory)
42 * - CRAM basic data structure reading and writing
43 * - File opening / closing
44 * - Reference sequence handling
45 */
46
47 /*
48 * TODO: BLOCK_GROW, BLOCK_RESIZE, BLOCK_APPEND and itf8_put_blk all need
49 * a way to return errors for when malloc fails.
50 */
51
52 #ifdef HAVE_CONFIG_H
53 #include "io_lib_config.h"
54 #endif
55
56 #include <stdio.h>
57 #include <errno.h>
58 #include <assert.h>
59 #include <stdlib.h>
60 #include <string.h>
61 #include <unistd.h>
62 #include <zlib.h>
63 #ifdef HAVE_LIBBZ2
64 #include <bzlib.h>
65 #endif
66 #ifdef HAVE_LIBLZMA
67 #include <lzma.h>
68 #endif
69 #ifdef HAVE_LIBBSC
70 #include <libbsc/libbsc.h>
71 #endif
72 #include <sys/types.h>
73 #include <sys/stat.h>
74 #include <math.h>
75 #include <ctype.h>
76
77 #ifdef _MSC_VER
78 #include <direct.h>
79 #include <io.h>
80
81 #define chmod _chmod
82 #define getcwd _getcwd
83
84 //_mkdir does not take a mode argument on windows
85 //but all calls below are followed up by calls to chmod
86 //so this should not matter.
87 #define mkdir(path,mode) _mkdir(path)
88 #endif
89
90 #include "io_lib/cram.h"
91 #include "io_lib/os.h"
92 #include "io_lib/md5.h"
93 #include "io_lib/crc32.h"
94 #include "io_lib/open_trace_file.h"
95 #include "io_lib/rANS_static.h"
96 #include "io_lib/rANS_static4x16.h"
97 #include "io_lib/tokenise_name3.h"
98
99 // Enable if we want V3.1 support. TODO: add a configure param for this
100 #define HAVE_FQZ
101
102 #ifdef HAVE_FQZ
103 #include "fqzcomp_qual.h"
104 #endif
105
106 #if defined(HAVE_STDIO_EXT_H)
107 #include <stdio_ext.h>
108 #endif
109
110 //#define REF_DEBUG
111
112 #ifdef REF_DEBUG
113 #define RP(...) fprintf (stderr, __VA_ARGS__)
114 #include <sys/syscall.h>
115 #define gettid() (int)syscall(SYS_gettid)
116 #else
117 #define RP(...)
118 #define gettid() 0
119 #endif
120
121 #define TRIAL_SPAN 50
122 #define NTRIALS 3
123
124 /* ----------------------------------------------------------------------
125 * custom buffering helper routines
126 */
127 #if defined(CRAM_IO_CUSTOM_BUFFERING)
cram_io_C_FILE_fread(void * ptr,size_t size,size_t nmemb,void * stream)128 static size_t cram_io_C_FILE_fread(void *ptr, size_t size, size_t nmemb, void *stream)
129 {
130 return fread(ptr,size,nmemb,(FILE *)stream);
131 }
132
cram_io_C_FILE_fwrite(void * ptr,size_t size,size_t nmemb,void * stream)133 static size_t cram_io_C_FILE_fwrite(void *ptr, size_t size, size_t nmemb, void *stream)
134 {
135 return fwrite(ptr,size,nmemb,(FILE *)stream);
136 }
137
cram_io_C_FILE_fseek(void * fd,off_t offset,int whence)138 static int cram_io_C_FILE_fseek(void * fd, off_t offset, int whence)
139 {
140 return fseeko((FILE *)fd,offset,whence);
141 }
142
cram_io_C_FILE_ftell(void * fd)143 static off_t cram_io_C_FILE_ftell(void * fd)
144 {
145 return ftello((FILE *)fd);
146 }
147
148 /* ----------------------------------------------------------------------
149 * Input buffering
150 */
151
152 /* fill empty buffer */
cram_io_fill_input_buffer(cram_fd * fd)153 static void cram_io_fill_input_buffer(cram_fd * fd)
154 {
155 /* buffer need to be empty */
156 assert ( fd->fp_in_buffer->fp_in_buf_pc == fd->fp_in_buffer->fp_in_buf_pe );
157
158 /* read up to buffer size bytes */
159 do {
160 /* C-IO fread */
161 size_t const r =
162 (fd->fp_in_callbacks->fread_callback)(fd->fp_in_buffer->fp_in_buf_pa,
163 1,
164 fd->fp_in_buffer->fp_in_buf_size,
165 fd->fp_in_callbacks->user_data);
166 /* move offset */
167 fd->fp_in_buffer->fp_in_buf_start += (fd->fp_in_buffer->fp_in_buf_pe -
168 fd->fp_in_buffer->fp_in_buf_pa);
169 /* set end of window */
170 fd->fp_in_buffer->fp_in_buf_pe = fd->fp_in_buffer->fp_in_buf_pa + r;
171 /* set current input */
172 fd->fp_in_buffer->fp_in_buf_pc = fd->fp_in_buffer->fp_in_buf_pa;
173 }
174 while ( 0 ) ;
175 }
176
177 /* fill buffer and return next byte or EOF */
cram_io_input_buffer_underflow(cram_fd * fd)178 int cram_io_input_buffer_underflow(cram_fd * fd)
179 {
180 cram_io_fill_input_buffer(fd);
181
182 if ( fd->fp_in_buffer->fp_in_buf_pc == fd->fp_in_buffer->fp_in_buf_pe )
183 return EOF;
184 else
185 return (int)((unsigned char )(*(fd->fp_in_buffer->fp_in_buf_pc++)));
186 }
187
188 /* integer minimum */
imin(size_t const a,size_t const b)189 static inline size_t imin(size_t const a, size_t const b)
190 {
191 return (a<b)?a:b;
192 }
193
194 /* fread simulation */
cram_io_input_buffer_read(void * vptr,size_t size,size_t nmemb,cram_fd * fd)195 size_t cram_io_input_buffer_read(void *vptr, size_t size, size_t nmemb, cram_fd * fd)
196 {
197 size_t toread = size * nmemb; /* number of bytes still to be read */
198 size_t r = 0; /* number of bytes copied to ptr */
199 size_t inbuf = fd->fp_in_buffer->fp_in_buf_pe -
200 fd->fp_in_buffer->fp_in_buf_pc; /* number of bytes in buffer */
201 size_t tocopy = imin(toread,inbuf); /* number of bytes to be copied from buffer */
202 size_t blockread = 0;
203 char *ptr = (char *)vptr;
204
205 /* copy bytes still in buffer and update values */
206 memcpy(ptr,fd->fp_in_buffer->fp_in_buf_pc,tocopy);
207 toread -= tocopy;
208 r += tocopy;
209 ptr += tocopy;
210 fd->fp_in_buffer->fp_in_buf_pc += tocopy;
211
212 /* read whole blocks without copying to buffer first, C-IO fread */
213 while ( (toread >= fd->fp_in_buffer->fp_in_buf_size) &&
214 ((blockread = ((fd->fp_in_callbacks->
215 fread_callback))(ptr,
216 1,
217 fd->fp_in_buffer->fp_in_buf_size,
218 fd->fp_in_callbacks->user_data))!=0) ) {
219 toread -= blockread;
220 r += blockread;
221 ptr += blockread;
222 fd->fp_in_buffer->fp_in_buf_start += blockread;
223 }
224
225 /* read rest of bytes using buffer */
226 while ( toread ) {
227 /* buffer should be empty */
228 assert ( fd->fp_in_buffer->fp_in_buf_pc == fd->fp_in_buffer->fp_in_buf_pe );
229 /* fill buffer */
230 cram_io_fill_input_buffer(fd);
231 /* number of bytes in buffer after filling */
232 inbuf = fd->fp_in_buffer->fp_in_buf_pe-fd->fp_in_buffer->fp_in_buf_pc;
233 /* number of bytes to copy */
234 tocopy = imin(toread,inbuf);
235
236 /* break if there is no more data */
237 if ( ! inbuf )
238 break;
239
240 memcpy(ptr,fd->fp_in_buffer->fp_in_buf_pc,tocopy);
241 toread -= tocopy;
242 r += tocopy;
243 ptr += tocopy;
244 fd->fp_in_buffer->fp_in_buf_pc += tocopy;
245 }
246
247 return size ? (r / size) : r;
248 }
249
cram_io_input_buffer_seek(cram_fd * fd,off_t offset,int whence)250 int cram_io_input_buffer_seek(cram_fd * fd, off_t offset, int whence)
251 {
252 int r = -1;
253
254 if ( whence == SEEK_CUR )
255 {
256 /* current absolute input position in buffer */
257 uint64_t const curpos = fd->fp_in_buffer->fp_in_buf_start +
258 (fd->fp_in_buffer->fp_in_buf_pc -
259 fd->fp_in_buffer->fp_in_buf_pa);
260 /* absolute buffer low */
261 uint64_t const bufferlow = fd->fp_in_buffer->fp_in_buf_start;
262 /* absolute buffer high */
263 uint64_t const bufferhigh = fd->fp_in_buffer->fp_in_buf_start +
264 (fd->fp_in_buffer->fp_in_buf_pe -
265 fd->fp_in_buffer->fp_in_buf_pa);
266 /* absolute seek target position */
267 int64_t const abstarget = ((int64_t)curpos) + offset;
268
269 /* if target is inside buffer, then just adjust the current pointer */
270 if ( abstarget >= bufferlow && abstarget <= bufferhigh ) {
271 /* update current pointer */
272 fd->fp_in_buffer->fp_in_buf_pc += offset;
273 assert ( fd->fp_in_buffer->fp_in_buf_pc >= fd->fp_in_buffer->fp_in_buf_pa );
274 assert ( fd->fp_in_buffer->fp_in_buf_pc <= fd->fp_in_buffer->fp_in_buf_pe );
275 /* seek successful */
276 return 0;
277 }
278 else {
279 /* current position of underlying input stream */
280 int64_t const filepos = fd->fp_in_buffer->fp_in_buf_start +
281 (fd->fp_in_buffer->fp_in_buf_pe -
282 fd->fp_in_buffer->fp_in_buf_pa);
283 int64_t const seekoffset = abstarget - filepos;
284
285 /* perform seek */
286 r = fd->fp_in_callbacks->fseek_callback(fd->fp_in_callbacks->user_data,
287 seekoffset, SEEK_CUR);
288
289 /* seek successful */
290 if ( ! r ) {
291 /* mark buffer as empty */
292 fd->fp_in_buffer->fp_in_buf_pc = fd->fp_in_buffer->fp_in_buf_pa;
293 fd->fp_in_buffer->fp_in_buf_pe = fd->fp_in_buffer->fp_in_buf_pa;
294 /* set new buffer start offset */
295 fd->fp_in_buffer->fp_in_buf_start = abstarget;
296 return 0;
297 } else {
298 return -1;
299 }
300 }
301 }
302
303 /* mark buffer as empty */
304 fd->fp_in_buffer->fp_in_buf_pc = fd->fp_in_buffer->fp_in_buf_pa;
305 fd->fp_in_buffer->fp_in_buf_pe = fd->fp_in_buffer->fp_in_buf_pa;
306
307 /* perform seek, C-IO fseek */
308 r = fd->fp_in_callbacks->fseek_callback(fd->fp_in_callbacks->user_data,
309 offset, whence);
310
311 /* get new offset if seek was successful */
312 if ( !r )
313 /* C-IO ftell */
314 fd->fp_in_buffer->fp_in_buf_start =
315 fd->fp_in_callbacks->ftell_callback(fd->fp_in_callbacks->user_data);
316
317 return r;
318 }
319
320 static cram_io_input_t *
cram_IO_deallocate_cram_io_input(cram_io_input_t * obj)321 cram_IO_deallocate_cram_io_input(cram_io_input_t * obj)
322 {
323 if ( obj ) {
324 free(obj);
325 obj = NULL;
326 }
327
328 return obj;
329 }
330
331 static cram_io_input_t *
cram_IO_allocate_cram_io_input()332 cram_IO_allocate_cram_io_input()
333 {
334 cram_io_input_t * obj = (cram_io_input_t *)malloc(sizeof(cram_io_input_t));
335 if ( ! obj ) {
336 return cram_IO_deallocate_cram_io_input(obj);
337 }
338 obj->user_data = NULL;
339 obj->fread_callback = NULL;
340 obj->fseek_callback = NULL;
341 obj->ftell_callback = NULL;
342 return obj;
343 }
344
345 static cram_io_input_t *
cram_IO_allocate_cram_io_input_from_C_FILE(FILE * file)346 cram_IO_allocate_cram_io_input_from_C_FILE(FILE * file)
347 {
348 cram_io_input_t * obj = cram_IO_allocate_cram_io_input();
349 if ( ! obj ) {
350 return cram_IO_deallocate_cram_io_input(obj);
351 }
352 obj->user_data = file;
353 obj->fread_callback = cram_io_C_FILE_fread;
354 obj->fseek_callback = cram_io_C_FILE_fseek;
355 obj->ftell_callback = cram_io_C_FILE_ftell;
356 return obj;
357 }
358
359 static cram_fd_input_buffer *
cram_io_deallocate_input_buffer(cram_fd_input_buffer * buffer)360 cram_io_deallocate_input_buffer(cram_fd_input_buffer * buffer)
361 {
362 if ( buffer ) {
363 if ( buffer->fp_in_buffer ) {
364 free(buffer->fp_in_buffer);
365 buffer->fp_in_buffer = NULL;
366 }
367 free(buffer);
368 buffer = NULL;
369 }
370 return buffer;
371 }
372
373 static cram_fd_input_buffer *
cram_io_allocate_input_buffer(size_t const bufsize)374 cram_io_allocate_input_buffer(size_t const bufsize)
375 {
376 cram_fd_input_buffer * buffer =
377 (cram_fd_input_buffer *)malloc(sizeof(cram_fd_input_buffer));
378
379 if ( ! buffer )
380 return cram_io_deallocate_input_buffer(buffer);
381
382 memset(buffer,0,sizeof(cram_fd_input_buffer));
383
384 buffer->fp_in_buf_size = bufsize;
385 buffer->fp_in_buffer = (char *)malloc(buffer->fp_in_buf_size);
386
387 if ( ! buffer->fp_in_buffer ) {
388 return cram_io_deallocate_input_buffer(buffer);
389 }
390
391 buffer->fp_in_buf_pa = buffer->fp_in_buffer;
392 buffer->fp_in_buf_pc = buffer->fp_in_buffer;
393 buffer->fp_in_buf_pe = buffer->fp_in_buffer;
394
395 return buffer;
396 }
397
cram_io_input_buffer_fgets(char * s,int size,cram_fd * fd)398 char * cram_io_input_buffer_fgets(char * s, int size, cram_fd * fd)
399 {
400 int linelen = 0;
401
402 while ( linelen < size-1 ) {
403 int const c = CRAM_IO_GETC(fd);
404
405 if ( c == EOF ) {
406 break;
407 }
408 else {
409 s[linelen++] = c;
410 }
411
412 if ( c == '\n' )
413 break;
414 }
415
416 if ( ! linelen )
417 return NULL;
418
419 s[linelen++] = 0;
420
421 return s;
422 }
423
424 /* ----------------------------------------------------------------------
425 * Output buffering
426 */
427
428 /*
429 * Flush buffer.
430 *
431 * Returns 0 on success,
432 * -1 on failure.
433 */
cram_io_flush_output_buffer(cram_fd * fd)434 int cram_io_flush_output_buffer(cram_fd *fd)
435 {
436 size_t r;
437 char *dat;
438 size_t olen;
439 size_t len;
440
441 if (!fd->fp_out_buffer)
442 return 0;
443
444 dat = fd->fp_out_buffer->fp_out_buf_pa;
445 olen = fd->fp_out_buffer->fp_out_buf_pc - dat;
446 len = olen;
447
448 /* write up to buffer size bytes */
449 /* C-IO fwrite */
450 if (len) {
451 r = fd->fp_out_callbacks->fwrite_callback
452 (dat, 1, len, fd->fp_out_callbacks->user_data);
453
454 dat += r;
455 len -= r;
456 fd->fp_out_buffer->fp_out_buf_start += r; /* move offset */
457
458 if (r < olen) {
459 /* Write failed, possible partial */
460 if (r > 0) {
461 memmove(fd->fp_out_buffer->fp_out_buf_pa, dat, len);
462 fd->fp_out_buffer->fp_out_buf_pc
463 = fd->fp_out_buffer->fp_out_buf_pa + len;
464 }
465
466 /* Output is probably unfixable now so return error */
467 return -1;
468 }
469 }
470
471 /* reset current output */
472 fd->fp_out_buffer->fp_out_buf_pc = fd->fp_out_buffer->fp_out_buf_pa;
473
474 return 0;
475 }
476
477
478 /* fwrite simulation */
cram_io_output_buffer_write(void * vptr,size_t size,size_t nmemb,cram_fd * fd)479 size_t cram_io_output_buffer_write(void *vptr, size_t size, size_t nmemb,
480 cram_fd *fd)
481 {
482 size_t towrite = size * nmemb; /* number of bytes still to be written */
483 size_t r = 0; /* number of bytes copied to ptr */
484
485 /* number of bytes in buffer */
486 size_t outbuf = fd->fp_out_buffer->fp_out_buf_pe -
487 fd->fp_out_buffer->fp_out_buf_pc;
488
489 /* number of bytes to be copied from buffer */
490 size_t tocopy = imin(towrite, outbuf);
491 size_t blockwrite = 0;
492 char *ptr = (char *)vptr;
493
494 /* place as many bytes in out_buffer as will fit */
495 memcpy(fd->fp_out_buffer->fp_out_buf_pc, ptr, tocopy);
496 towrite -= tocopy;
497 r += tocopy;
498 ptr += tocopy;
499 fd->fp_out_buffer->fp_out_buf_pc += tocopy;
500
501 if (towrite) /* Still some left over */
502 if (cram_io_flush_output_buffer(fd) < 0)
503 goto partial_write;
504
505 /* Write any remaining whole blocks without buffer copy, C-IO fwrite */
506 while (towrite >= fd->fp_out_buffer->fp_out_buf_size) {
507 blockwrite = fd->fp_out_callbacks->fwrite_callback
508 (ptr,
509 1,
510 fd->fp_out_buffer->fp_out_buf_size,
511 fd->fp_out_callbacks->user_data);
512
513 towrite -= blockwrite;
514 ptr += blockwrite;
515 r += blockwrite;
516 fd->fp_out_buffer->fp_out_buf_start += blockwrite;
517
518 if (blockwrite < fd->fp_out_buffer->fp_out_buf_size)
519 goto partial_write;
520 }
521
522 /* Push any remaining bytes into the output buffer */
523 if (towrite) {
524 /* buffer should be empty */
525 assert(fd->fp_out_buffer->fp_out_buf_pc ==
526 fd->fp_out_buffer->fp_out_buf_pa);
527
528 /* buffer should be large enough */
529 assert(towrite <= fd->fp_out_buffer->fp_out_buf_size);
530
531 memcpy(fd->fp_out_buffer->fp_out_buf_pc, ptr, towrite);
532 r += towrite;
533 fd->fp_out_buffer->fp_out_buf_pc += towrite;
534 }
535
536 partial_write:
537 return size ? (r / size) : r;
538 }
539
540 static cram_io_output_t *
cram_IO_deallocate_cram_io_output(cram_io_output_t * obj)541 cram_IO_deallocate_cram_io_output(cram_io_output_t * obj)
542 {
543 if ( obj ) {
544 free(obj);
545 obj = NULL;
546 }
547
548 return obj;
549 }
550
551 static cram_io_output_t *
cram_IO_allocate_cram_io_output()552 cram_IO_allocate_cram_io_output()
553 {
554 cram_io_output_t *obj
555 = (cram_io_output_t *)malloc(sizeof(cram_io_output_t));
556
557 if ( ! obj ) {
558 return cram_IO_deallocate_cram_io_output(obj);
559 }
560 obj->user_data = NULL;
561 obj->fwrite_callback = NULL;
562 obj->ftell_callback = NULL;
563 return obj;
564 }
565
566 static cram_io_output_t *
cram_IO_allocate_cram_io_output_from_C_FILE(FILE * file)567 cram_IO_allocate_cram_io_output_from_C_FILE(FILE * file)
568 {
569 cram_io_output_t *obj = cram_IO_allocate_cram_io_output();
570 if ( ! obj ) {
571 return cram_IO_deallocate_cram_io_output(obj);
572 }
573 obj->user_data = file;
574 obj->fwrite_callback = cram_io_C_FILE_fwrite;
575 obj->ftell_callback = cram_io_C_FILE_ftell;
576 return obj;
577 }
578
579 cram_fd_output_buffer *
cram_io_deallocate_output_buffer(cram_fd_output_buffer * buffer)580 cram_io_deallocate_output_buffer(cram_fd_output_buffer * buffer)
581 {
582 if ( buffer ) {
583 if ( buffer->fp_out_buffer ) {
584 free(buffer->fp_out_buffer);
585 buffer->fp_out_buffer = NULL;
586 }
587 free(buffer);
588 buffer = NULL;
589 }
590 return buffer;
591 }
592
593 cram_fd_output_buffer *
cram_io_allocate_output_buffer(size_t const bufsize)594 cram_io_allocate_output_buffer(size_t const bufsize)
595 {
596 cram_fd_output_buffer * buffer =
597 (cram_fd_output_buffer *)malloc(sizeof(cram_fd_output_buffer));
598
599 if ( ! buffer )
600 return cram_io_deallocate_output_buffer(buffer);
601
602 // FIXME: is memset really needed here? I suspect pa/pc is sufficient.
603 memset(buffer,0,sizeof(cram_fd_output_buffer));
604
605 buffer->fp_out_buf_size = bufsize;
606 buffer->fp_out_buffer = (char *)malloc(buffer->fp_out_buf_size);
607
608 if ( ! buffer->fp_out_buffer ) {
609 return cram_io_deallocate_output_buffer(buffer);
610 }
611
612 buffer->fp_out_buf_pa = buffer->fp_out_buffer;
613 buffer->fp_out_buf_pc = buffer->fp_out_buffer;
614 buffer->fp_out_buf_pe = buffer->fp_out_buffer + bufsize;
615
616 return buffer;
617 }
618
619 // FIXME: Currently inefficient
cram_io_output_buffer_putc(int c,cram_fd * fd)620 int cram_io_output_buffer_putc(int c, cram_fd * fd)
621 {
622 char cc = c;
623
624 if (cram_io_output_buffer_write(&cc, 1, 1, fd) == 1)
625 return c;
626 else
627 return EOF;
628 }
629
630 #endif
631
632 #ifndef USE_INT7_ENCODING
633 /* ----------------------------------------------------------------------
634 * ITF8 encoding and decoding.
635 *
636 * Also see the itf8_get and itf8_put macros in cram_io.h
637 */
638
639 /*
640 * LEGACY: consider using itf8_decode_crc.
641 *
642 * Reads an integer in ITF-8 encoding from 'cp' and stores it in
643 * *val.
644 *
645 * Returns the number of bytes read on success
646 * -1 on failure
647 */
itf8_decode(cram_fd * fd,int32_t * val_p)648 int itf8_decode(cram_fd *fd, int32_t *val_p) {
649 static int nbytes[16] = {
650 0,0,0,0, 0,0,0,0, // 0000xxxx - 0111xxxx
651 1,1,1,1, // 1000xxxx - 1011xxxx
652 2,2, // 1100xxxx - 1101xxxx
653 3, // 1110xxxx
654 4, // 1111xxxx
655 };
656
657 static int nbits[16] = {
658 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, // 0000xxxx - 0111xxxx
659 0x3f, 0x3f, 0x3f, 0x3f, // 1000xxxx - 1011xxxx
660 0x1f, 0x1f, // 1100xxxx - 1101xxxx
661 0x0f, // 1110xxxx
662 0x0f, // 1111xxxx
663 };
664
665 int32_t val = CRAM_IO_GETC(fd);
666 if (val == -1)
667 return -1;
668
669 int i = nbytes[val>>4];
670 val &= nbits[val>>4];
671
672 switch(i) {
673 case 0:
674 *val_p = val;
675 return 1;
676
677 case 1:
678 val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
679 *val_p = val;
680 return 2;
681
682 case 2:
683 val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
684 val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
685 *val_p = val;
686 return 3;
687
688 case 3:
689 val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
690 val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
691 val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
692 *val_p = val;
693 return 4;
694
695 case 4: // really 3.5 more, why make it different?
696 val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
697 val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
698 val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
699 val = (val<<4) | (((unsigned char)CRAM_IO_GETC(fd)) & 0x0f);
700 *val_p = val;
701 }
702
703 return 5;
704 }
705
itf8_decode_crc(cram_fd * fd,int32_t * val_p,uint32_t * crc)706 int itf8_decode_crc(cram_fd *fd, int32_t *val_p, uint32_t *crc) {
707 static int nbytes[16] = {
708 0,0,0,0, 0,0,0,0, // 0000xxxx - 0111xxxx
709 1,1,1,1, // 1000xxxx - 1011xxxx
710 2,2, // 1100xxxx - 1101xxxx
711 3, // 1110xxxx
712 4, // 1111xxxx
713 };
714
715 static int nbits[16] = {
716 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, // 0000xxxx - 0111xxxx
717 0x3f, 0x3f, 0x3f, 0x3f, // 1000xxxx - 1011xxxx
718 0x1f, 0x1f, // 1100xxxx - 1101xxxx
719 0x0f, // 1110xxxx
720 0x0f, // 1111xxxx
721 };
722 unsigned char c[5];
723
724 int32_t val = CRAM_IO_GETC(fd);
725 if (val == -1)
726 return -1;
727
728 c[0]=val;
729
730 int i = nbytes[val>>4];
731 val &= nbits[val>>4];
732
733 switch(i) {
734 case 0:
735 *val_p = val;
736 *crc = iolib_crc32(*crc, c, 1);
737 return 1;
738
739 case 1:
740 val = (val<<8) | (c[1]=CRAM_IO_GETC(fd));
741 *val_p = val;
742 *crc = iolib_crc32(*crc, c, 2);
743 return 2;
744
745 case 2:
746 val = (val<<8) | (c[1]=CRAM_IO_GETC(fd));
747 val = (val<<8) | (c[2]=CRAM_IO_GETC(fd));
748 *val_p = val;
749 *crc = iolib_crc32(*crc, c, 3);
750 return 3;
751
752 case 3:
753 val = (val<<8) | (c[1]=CRAM_IO_GETC(fd));
754 val = (val<<8) | (c[2]=CRAM_IO_GETC(fd));
755 val = (val<<8) | (c[3]=CRAM_IO_GETC(fd));
756 *val_p = val;
757 *crc = iolib_crc32(*crc, c, 4);
758 return 4;
759
760 case 4: // really 3.5 more, why make it different?
761 val = (val<<8) | (c[1]=CRAM_IO_GETC(fd));
762 val = (val<<8) | (c[2]=CRAM_IO_GETC(fd));
763 val = (val<<8) | (c[3]=CRAM_IO_GETC(fd));
764 val = (val<<4) | (((c[4]=CRAM_IO_GETC(fd))) & 0x0f);
765 *val_p = val;
766 *crc = iolib_crc32(*crc, c, 5);
767 }
768
769 return 5;
770 }
771
772 /*
773 * Encodes and writes a single integer in ITF-8 format.
774 * Returns 0 on success
775 * -1 on failure
776 */
itf8_encode(cram_fd * fd,int32_t val)777 int itf8_encode(cram_fd *fd, int32_t val) {
778 char buf[5];
779 int len = itf8_put(buf, val);
780 return CRAM_IO_WRITE(buf, 1, len, fd) == len ? 0 : -1;
781 }
782
783 const int itf8_bytes[16] = {
784 1, 1, 1, 1, 1, 1, 1, 1,
785 2, 2, 2, 2, 3, 3, 4, 5
786 };
787
788 const int ltf8_bytes[256] = {
789 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
790 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
791 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
792 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
793
794 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
795 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
796 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
797 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
798
799 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
800 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
801 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
802 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
803
804 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
805 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
806
807 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
808
809 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 8, 9
810 };
811
812 #ifndef ITF8_MACROS
813 /*
814 * As above, but decoding from memory
815 */
itf8_get(char * cp,int32_t * val_p)816 int itf8_get(char *cp, int32_t *val_p) {
817 unsigned char *up = (unsigned char *)cp;
818
819 if (up[0] < 0x80) {
820 *val_p = up[0];
821 return 1;
822 } else if (up[0] < 0xc0) {
823 *val_p = ((up[0] <<8) | up[1]) & 0x3fff;
824 return 2;
825 } else if (up[0] < 0xe0) {
826 *val_p = ((up[0]<<16) | (up[1]<< 8) | up[2]) & 0x1fffff;
827 return 3;
828 } else if (up[0] < 0xf0) {
829 *val_p = ((up[0]<<24) | (up[1]<<16) | (up[2]<<8) | up[3]) & 0x0fffffff;
830 return 4;
831 } else {
832 *val_p = ((up[0] & 0x0f)<<28) | (up[1]<<20) | (up[2]<<12) | (up[3]<<4) | (up[4] & 0x0f);
833 return 5;
834 }
835 }
836
837 /*
838 * Stores a value to memory in ITF-8 format.
839 *
840 * Returns the number of bytes required to store the number.
841 * This is a maximum of 5 bytes.
842 */
itf8_put(char * cp,int32_t val)843 int itf8_put(char *cp, int32_t val) {
844 if (!(val & ~0x00000007f)) { // 1 byte
845 *cp = val;
846 return 1;
847 } else if (!(val & ~0x00003fff)) { // 2 byte
848 *cp++ = (val >> 8 ) | 0x80;
849 *cp = val & 0xff;
850 return 2;
851 } else if (!(val & ~0x01fffff)) { // 3 byte
852 *cp++ = (val >> 16) | 0xc0;
853 *cp++ = (val >> 8 ) & 0xff;
854 *cp = val & 0xff;
855 return 3;
856 } else if (!(val & ~0x0fffffff)) { // 4 byte
857 *cp++ = (val >> 24) | 0xe0;
858 *cp++ = (val >> 16) & 0xff;
859 *cp++ = (val >> 8 ) & 0xff;
860 *cp = val & 0xff;
861 return 4;
862 } else { // 5 byte
863 *cp++ = 0xf0 | ((val>>28) & 0xff);
864 *cp++ = (val >> 20) & 0xff;
865 *cp++ = (val >> 12) & 0xff;
866 *cp++ = (val >> 4 ) & 0xff;
867 *cp = val & 0x0f;
868 return 5;
869 }
870 }
871 #endif
872
873 /* 64-bit itf8 variant */
ltf8_put(char * cp,int64_t val)874 int ltf8_put(char *cp, int64_t val) {
875 if (!(val & ~((1LL<<7)-1))) {
876 *cp = val;
877 return 1;
878 } else if (!(val & ~((1LL<<(6+8))-1))) {
879 *cp++ = (val >> 8 ) | 0x80;
880 *cp = val & 0xff;
881 return 2;
882 } else if (!(val & ~((1LL<<(5+2*8))-1))) {
883 *cp++ = (val >> 16) | 0xc0;
884 *cp++ = (val >> 8 ) & 0xff;
885 *cp = val & 0xff;
886 return 3;
887 } else if (!(val & ~((1LL<<(4+3*8))-1))) {
888 *cp++ = (val >> 24) | 0xe0;
889 *cp++ = (val >> 16) & 0xff;
890 *cp++ = (val >> 8 ) & 0xff;
891 *cp = val & 0xff;
892 return 4;
893 } else if (!(val & ~((1LL<<(3+4*8))-1))) {
894 *cp++ = (val >> 32) | 0xf0;
895 *cp++ = (val >> 24) & 0xff;
896 *cp++ = (val >> 16) & 0xff;
897 *cp++ = (val >> 8 ) & 0xff;
898 *cp = val & 0xff;
899 return 5;
900 } else if (!(val & ~((1LL<<(2+5*8))-1))) {
901 *cp++ = (val >> 40) | 0xf8;
902 *cp++ = (val >> 32) & 0xff;
903 *cp++ = (val >> 24) & 0xff;
904 *cp++ = (val >> 16) & 0xff;
905 *cp++ = (val >> 8 ) & 0xff;
906 *cp = val & 0xff;
907 return 6;
908 } else if (!(val & ~((1LL<<(1+6*8))-1))) {
909 *cp++ = (val >> 48) | 0xfc;
910 *cp++ = (val >> 40) & 0xff;
911 *cp++ = (val >> 32) & 0xff;
912 *cp++ = (val >> 24) & 0xff;
913 *cp++ = (val >> 16) & 0xff;
914 *cp++ = (val >> 8 ) & 0xff;
915 *cp = val & 0xff;
916 return 7;
917 } else if (!(val & ~((1LL<<(7*8))-1))) {
918 *cp++ = (val >> 56) | 0xfe;
919 *cp++ = (val >> 48) & 0xff;
920 *cp++ = (val >> 40) & 0xff;
921 *cp++ = (val >> 32) & 0xff;
922 *cp++ = (val >> 24) & 0xff;
923 *cp++ = (val >> 16) & 0xff;
924 *cp++ = (val >> 8 ) & 0xff;
925 *cp = val & 0xff;
926 return 8;
927 } else {
928 *cp++ = 0xff;
929 *cp++ = (val >> 56) & 0xff;
930 *cp++ = (val >> 48) & 0xff;
931 *cp++ = (val >> 40) & 0xff;
932 *cp++ = (val >> 32) & 0xff;
933 *cp++ = (val >> 24) & 0xff;
934 *cp++ = (val >> 16) & 0xff;
935 *cp++ = (val >> 8 ) & 0xff;
936 *cp = val & 0xff;
937 return 9;
938 }
939 }
940
ltf8_get(char * cp,int64_t * val_p)941 int ltf8_get(char *cp, int64_t *val_p) {
942 unsigned char *up = (unsigned char *)cp;
943
944 if (up[0] < 0x80) {
945 *val_p = up[0];
946 return 1;
947 } else if (up[0] < 0xc0) {
948 *val_p = (((uint64_t)up[0]<< 8) |
949 (uint64_t)up[1]) & (((1LL<<(6+8)))-1);
950 return 2;
951 } else if (up[0] < 0xe0) {
952 *val_p = (((uint64_t)up[0]<<16) |
953 ((uint64_t)up[1]<< 8) |
954 (uint64_t)up[2]) & ((1LL<<(5+2*8))-1);
955 return 3;
956 } else if (up[0] < 0xf0) {
957 *val_p = (((uint64_t)up[0]<<24) |
958 ((uint64_t)up[1]<<16) |
959 ((uint64_t)up[2]<< 8) |
960 (uint64_t)up[3]) & ((1LL<<(4+3*8))-1);
961 return 4;
962 } else if (up[0] < 0xf8) {
963 *val_p = (((uint64_t)up[0]<<32) |
964 ((uint64_t)up[1]<<24) |
965 ((uint64_t)up[2]<<16) |
966 ((uint64_t)up[3]<< 8) |
967 (uint64_t)up[4]) & ((1LL<<(3+4*8))-1);
968 return 5;
969 } else if (up[0] < 0xfc) {
970 *val_p = (((uint64_t)up[0]<<40) |
971 ((uint64_t)up[1]<<32) |
972 ((uint64_t)up[2]<<24) |
973 ((uint64_t)up[3]<<16) |
974 ((uint64_t)up[4]<< 8) |
975 (uint64_t)up[5]) & ((1LL<<(2+5*8))-1);
976 return 6;
977 } else if (up[0] < 0xfe) {
978 *val_p = (((uint64_t)up[0]<<48) |
979 ((uint64_t)up[1]<<40) |
980 ((uint64_t)up[2]<<32) |
981 ((uint64_t)up[3]<<24) |
982 ((uint64_t)up[4]<<16) |
983 ((uint64_t)up[5]<< 8) |
984 (uint64_t)up[6]) & ((1LL<<(1+6*8))-1);
985 return 7;
986 } else if (up[0] < 0xff) {
987 *val_p = (((uint64_t)up[1]<<48) |
988 ((uint64_t)up[2]<<40) |
989 ((uint64_t)up[3]<<32) |
990 ((uint64_t)up[4]<<24) |
991 ((uint64_t)up[5]<<16) |
992 ((uint64_t)up[6]<< 8) |
993 (uint64_t)up[7]) & ((1LL<<(7*8))-1);
994 return 8;
995 } else {
996 *val_p = (((uint64_t)up[1]<<56) |
997 ((uint64_t)up[2]<<48) |
998 ((uint64_t)up[3]<<40) |
999 ((uint64_t)up[4]<<32) |
1000 ((uint64_t)up[5]<<24) |
1001 ((uint64_t)up[6]<<16) |
1002 ((uint64_t)up[7]<< 8) |
1003 (uint64_t)up[8]);
1004 return 9;
1005 }
1006 }
1007
1008 /*
1009 * LEGACY: consider using ltf8_decode_crc.
1010 */
ltf8_decode(cram_fd * fd,int64_t * val_p)1011 int ltf8_decode(cram_fd *fd, int64_t *val_p) {
1012 int c = CRAM_IO_GETC(fd);
1013 int64_t val = (unsigned char)c;
1014 if (c == -1)
1015 return -1;
1016
1017 if (val < 0x80) {
1018 *val_p = val;
1019 return 1;
1020
1021 } else if (val < 0xc0) {
1022 val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1023 *val_p = val & (((1LL<<(6+8)))-1);
1024 return 2;
1025
1026 } else if (val < 0xe0) {
1027 val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1028 val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1029 *val_p = val & ((1LL<<(5+2*8))-1);
1030 return 3;
1031
1032 } else if (val < 0xf0) {
1033 val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1034 val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1035 val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1036 *val_p = val & ((1LL<<(4+3*8))-1);
1037 return 4;
1038
1039 } else if (val < 0xf8) {
1040 val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1041 val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1042 val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1043 val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1044 *val_p = val & ((1LL<<(3+4*8))-1);
1045 return 5;
1046
1047 } else if (val < 0xfc) {
1048 val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1049 val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1050 val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1051 val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1052 val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1053 *val_p = val & ((1LL<<(2+5*8))-1);
1054 return 6;
1055
1056 } else if (val < 0xfe) {
1057 val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1058 val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1059 val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1060 val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1061 val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1062 val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1063 *val_p = val & ((1LL<<(1+6*8))-1);
1064 return 7;
1065
1066 } else if (val < 0xff) {
1067 val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1068 val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1069 val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1070 val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1071 val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1072 val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1073 val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1074 *val_p = val & ((1LL<<(7*8))-1);
1075 return 8;
1076
1077 } else {
1078 val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1079 val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1080 val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1081 val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1082 val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1083 val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1084 val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1085 val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1086 *val_p = val;
1087 }
1088
1089 return 9;
1090 }
1091
ltf8_decode_crc(cram_fd * fd,int64_t * val_p,uint32_t * crc)1092 int ltf8_decode_crc(cram_fd *fd, int64_t *val_p, uint32_t *crc) {
1093 unsigned char c[9];
1094 int64_t val = (unsigned char)CRAM_IO_GETC(fd);
1095 if (val == -1)
1096 return -1;
1097
1098 c[0] = val;
1099
1100 if (val < 0x80) {
1101 *val_p = val;
1102 *crc = iolib_crc32(*crc, c, 1);
1103 return 1;
1104
1105 } else if (val < 0xc0) {
1106 val = (val<<8) | (c[1]=CRAM_IO_GETC(fd));;
1107 *val_p = val & (((1LL<<(6+8)))-1);
1108 *crc = iolib_crc32(*crc, c, 2);
1109 return 2;
1110
1111 } else if (val < 0xe0) {
1112 val = (val<<8) | (c[1]=CRAM_IO_GETC(fd));;
1113 val = (val<<8) | (c[2]=CRAM_IO_GETC(fd));;
1114 *val_p = val & ((1LL<<(5+2*8))-1);
1115 *crc = iolib_crc32(*crc, c, 3);
1116 return 3;
1117
1118 } else if (val < 0xf0) {
1119 val = (val<<8) | (c[1]=CRAM_IO_GETC(fd));;
1120 val = (val<<8) | (c[2]=CRAM_IO_GETC(fd));;
1121 val = (val<<8) | (c[3]=CRAM_IO_GETC(fd));;
1122 *val_p = val & ((1LL<<(4+3*8))-1);
1123 *crc = iolib_crc32(*crc, c, 4);
1124 return 4;
1125
1126 } else if (val < 0xf8) {
1127 val = (val<<8) | (c[1]=CRAM_IO_GETC(fd));;
1128 val = (val<<8) | (c[2]=CRAM_IO_GETC(fd));;
1129 val = (val<<8) | (c[3]=CRAM_IO_GETC(fd));;
1130 val = (val<<8) | (c[4]=CRAM_IO_GETC(fd));;
1131 *val_p = val & ((1LL<<(3+4*8))-1);
1132 *crc = iolib_crc32(*crc, c, 5);
1133 return 5;
1134
1135 } else if (val < 0xfc) {
1136 val = (val<<8) | (c[1]=CRAM_IO_GETC(fd));;
1137 val = (val<<8) | (c[2]=CRAM_IO_GETC(fd));;
1138 val = (val<<8) | (c[3]=CRAM_IO_GETC(fd));;
1139 val = (val<<8) | (c[4]=CRAM_IO_GETC(fd));;
1140 val = (val<<8) | (c[5]=CRAM_IO_GETC(fd));;
1141 *val_p = val & ((1LL<<(2+5*8))-1);
1142 *crc = iolib_crc32(*crc, c, 6);
1143 return 6;
1144
1145 } else if (val < 0xfe) {
1146 val = (val<<8) | (c[1]=CRAM_IO_GETC(fd));;
1147 val = (val<<8) | (c[2]=CRAM_IO_GETC(fd));;
1148 val = (val<<8) | (c[3]=CRAM_IO_GETC(fd));;
1149 val = (val<<8) | (c[4]=CRAM_IO_GETC(fd));;
1150 val = (val<<8) | (c[5]=CRAM_IO_GETC(fd));;
1151 val = (val<<8) | (c[6]=CRAM_IO_GETC(fd));;
1152 *val_p = val & ((1LL<<(1+6*8))-1);
1153 *crc = iolib_crc32(*crc, c, 7);
1154 return 7;
1155
1156 } else if (val < 0xff) {
1157 val = (val<<8) | (c[1]=CRAM_IO_GETC(fd));;
1158 val = (val<<8) | (c[2]=CRAM_IO_GETC(fd));;
1159 val = (val<<8) | (c[3]=CRAM_IO_GETC(fd));;
1160 val = (val<<8) | (c[4]=CRAM_IO_GETC(fd));;
1161 val = (val<<8) | (c[5]=CRAM_IO_GETC(fd));;
1162 val = (val<<8) | (c[6]=CRAM_IO_GETC(fd));;
1163 val = (val<<8) | (c[7]=CRAM_IO_GETC(fd));;
1164 *val_p = val & ((1LL<<(7*8))-1);
1165 *crc = iolib_crc32(*crc, c, 8);
1166 return 8;
1167
1168 } else {
1169 val = (val<<8) | (c[1]=CRAM_IO_GETC(fd));;
1170 val = (val<<8) | (c[2]=CRAM_IO_GETC(fd));;
1171 val = (val<<8) | (c[3]=CRAM_IO_GETC(fd));;
1172 val = (val<<8) | (c[4]=CRAM_IO_GETC(fd));;
1173 val = (val<<8) | (c[5]=CRAM_IO_GETC(fd));;
1174 val = (val<<8) | (c[6]=CRAM_IO_GETC(fd));;
1175 val = (val<<8) | (c[7]=CRAM_IO_GETC(fd));;
1176 val = (val<<8) | (c[8]=CRAM_IO_GETC(fd));;
1177 *crc = iolib_crc32(*crc, c, 9);
1178 *val_p = val;
1179 }
1180
1181 return 9;
1182 }
1183
1184 /*
1185 * Pushes a value in ITF8 format onto the end of a block.
1186 * This shouldn't be used for high-volume data as it is not the fastest
1187 * method.
1188 *
1189 * Returns the number of bytes written
1190 */
itf8_put_blk(cram_block * blk,int32_t val)1191 int itf8_put_blk(cram_block *blk, int32_t val) {
1192 char buf[5];
1193 int sz;
1194
1195 sz = itf8_put(buf, val);
1196 BLOCK_APPEND(blk, buf, sz);
1197 return sz;
1198 }
1199
ltf8_put_blk(cram_block * blk,int64_t val)1200 int ltf8_put_blk(cram_block *blk, int64_t val) {
1201 char buf[9];
1202 int sz;
1203
1204 sz = ltf8_put(buf, val);
1205 BLOCK_APPEND(blk, buf, sz);
1206 return sz;
1207 }
1208
1209 #else
1210
uint7_encode(cram_fd * fd,int64_t val)1211 int uint7_encode(cram_fd *fd, int64_t val) {
1212 char buf[10];
1213 int len = uint7_put(buf, NULL, val);
1214 return CRAM_IO_WRITE(buf, 1, len, fd) == len ? 0 : -1;
1215 }
1216
uint7_put_blk(cram_block * blk,uint64_t v)1217 int uint7_put_blk(cram_block *blk, uint64_t v) {
1218 uint8_t buf[10];
1219 int sz = uint7_put(buf, buf+10, v);
1220 BLOCK_APPEND(blk, buf, sz);
1221 return sz;
1222 }
1223
sint7_put_blk(cram_block * blk,int64_t v)1224 int sint7_put_blk(cram_block *blk, int64_t v) {
1225 uint8_t buf[10];
1226 int sz = sint7_put(buf, buf+10, v);
1227 BLOCK_APPEND(blk, buf, sz);
1228 return sz;
1229 }
1230
uint7_decode_crc32(cram_fd * fd,int32_t * val_p,uint32_t * crc)1231 int uint7_decode_crc32(cram_fd *fd, int32_t *val_p, uint32_t *crc) {
1232 uint8_t b[5], i = 0;
1233 int c;
1234 uint32_t v = 0;
1235
1236 do {
1237 b[i++] = c = CRAM_IO_GETC(fd);
1238 if (c < 0)
1239 return -1;
1240 v = (v<<7) | (c & 0x7f);
1241 } while (i < 5 && (c & 0x80));
1242 *crc = iolib_crc32(*crc, b, i);
1243
1244 *val_p = v;
1245 return i;
1246 }
1247
uint7_decode_crc(cram_fd * fd,int64_t * val_p,uint32_t * crc)1248 int uint7_decode_crc(cram_fd *fd, int64_t *val_p, uint32_t *crc) {
1249 uint8_t b[10], i = 0;
1250 int c;
1251 uint64_t v = 0;
1252
1253 do {
1254 b[i++] = c = CRAM_IO_GETC(fd);
1255 if (c < 0)
1256 return -1;
1257 v = (v<<7) | (c & 0x7f);
1258 } while (i < 10 && (c & 0x80));
1259 *crc = iolib_crc32(*crc, b, i);
1260
1261 *val_p = v;
1262 return i;
1263 }
1264 #endif
1265
1266 /*
1267 * Decodes a 32-bit little endian value from fd and stores in val.
1268 *
1269 * Returns the number of bytes read on success
1270 * -1 on failure
1271 */
int32_decode(cram_fd * fd,int32_t * val)1272 int int32_decode(cram_fd *fd, int32_t *val) {
1273 int32_t i;
1274 if (1 != CRAM_IO_READ(&i, 4, 1, fd))
1275 return -1;
1276
1277 *val = le_int4(i);
1278 return 4;
1279 }
1280
1281 /*
1282 * Encodes a 32-bit little endian value 'val' and writes to fd.
1283 *
1284 * Returns the number of bytes written on success
1285 * -1 on failure
1286 */
int32_encode(cram_fd * fd,int32_t val)1287 int int32_encode(cram_fd *fd, int32_t val) {
1288 val = le_int4(val);
1289 if (1 != CRAM_IO_WRITE(&val, 4, 1, fd))
1290 return -1;
1291
1292 return 4;
1293 }
1294
1295 /* As int32_decoded/encode, but from/to blocks instead of cram_fd */
int32_get_blk(cram_block * b,int32_t * val)1296 int int32_get_blk(cram_block *b, int32_t *val) {
1297 if (b->uncomp_size - BLOCK_SIZE(b) < 4)
1298 return -1;
1299
1300 *val =
1301 b->data[b->byte ] |
1302 (b->data[b->byte+1] << 8) |
1303 (b->data[b->byte+2] << 16) |
1304 (b->data[b->byte+3] << 24);
1305 BLOCK_SIZE(b) += 4;
1306 return 4;
1307 }
1308
1309 /* As int32_decoded/encode, but from/to blocks instead of cram_fd */
int32_put(cram_block * b,int32_t val)1310 int int32_put(cram_block *b, int32_t val) {
1311 unsigned char cp[4];
1312 cp[0] = ( val & 0xff);
1313 cp[1] = ((val>>8) & 0xff);
1314 cp[2] = ((val>>16) & 0xff);
1315 cp[3] = ((val>>24) & 0xff);
1316
1317 BLOCK_APPEND(b, cp, 4);
1318 return b->data ? 0 : -1;
1319 }
1320
1321 /* ----------------------------------------------------------------------
1322 * zlib compression code - from Gap5's tg_iface_g.c
1323 * They're static here as they're only used within the cram_compress_block
1324 * and cram_uncompress_block functions, which are the external interface.
1325 */
zlib_mem_inflate(char * cdata,size_t csize,size_t * size)1326 char *zlib_mem_inflate(char *cdata, size_t csize, size_t *size) {
1327 z_stream s;
1328 unsigned char *data = NULL; /* Uncompressed output */
1329 int data_alloc = 0;
1330 int err;
1331
1332 /* Starting point at uncompressed size, and scale after that */
1333 data = malloc(data_alloc = csize*1.2+100);
1334 if (!data)
1335 return NULL;
1336
1337 /* Initialise zlib stream */
1338 s.zalloc = Z_NULL; /* use default allocation functions */
1339 s.zfree = Z_NULL;
1340 s.opaque = Z_NULL;
1341 s.next_in = (unsigned char *)cdata;
1342 s.avail_in = csize;
1343 s.total_in = 0;
1344 s.next_out = data;
1345 s.avail_out = data_alloc;
1346 s.total_out = 0;
1347
1348 //err = inflateInit(&s);
1349 err = inflateInit2(&s, 15 + 32);
1350 if (err != Z_OK) {
1351 fprintf(stderr, "zlib inflateInit error: %s\n", s.msg);
1352 free(data);
1353 return NULL;
1354 }
1355
1356 /* Decode to 'data' array */
1357 for (;s.avail_in;) {
1358 unsigned char *data_tmp;
1359 int alloc_inc;
1360
1361 s.next_out = &data[s.total_out];
1362 err = inflate(&s, Z_NO_FLUSH);
1363 if (err == Z_STREAM_END)
1364 break;
1365
1366 if (err != Z_OK) {
1367 fprintf(stderr, "zlib inflate error: %s\n", s.msg);
1368 if (data)
1369 free(data);
1370 return NULL;
1371 }
1372
1373 /* More to come, so realloc based on growth so far */
1374 alloc_inc = (double)s.avail_in/s.total_in * s.total_out + 100;
1375 data = realloc((data_tmp = data), data_alloc += alloc_inc);
1376 if (!data) {
1377 free(data_tmp);
1378 return NULL;
1379 }
1380 s.avail_out += alloc_inc;
1381 }
1382 inflateEnd(&s);
1383
1384 *size = s.total_out;
1385 return (char *)data;
1386 }
1387
zlib_mem_deflate(char * data,size_t size,size_t * cdata_size,int level,int strat)1388 static char *zlib_mem_deflate(char *data, size_t size, size_t *cdata_size,
1389 int level, int strat) {
1390 z_stream s;
1391 unsigned char *cdata = NULL; /* Compressed output */
1392 int cdata_alloc = 0;
1393 int cdata_pos = 0;
1394 int err;
1395
1396 cdata = malloc(cdata_alloc = size*1.05+100);
1397 if (!cdata)
1398 return NULL;
1399 cdata_pos = 0;
1400
1401 /* Initialise zlib stream */
1402 s.zalloc = Z_NULL; /* use default allocation functions */
1403 s.zfree = Z_NULL;
1404 s.opaque = Z_NULL;
1405 s.next_in = (unsigned char *)data;
1406 s.avail_in = size;
1407 s.total_in = 0;
1408 s.next_out = cdata;
1409 s.avail_out = cdata_alloc;
1410 s.total_out = 0;
1411 s.data_type = Z_BINARY;
1412
1413 err = deflateInit2(&s, level, Z_DEFLATED, 15|16, 9, strat);
1414 if (err != Z_OK) {
1415 fprintf(stderr, "zlib deflateInit2 error: %s\n", s.msg);
1416 return NULL;
1417 }
1418
1419 /* Encode to 'cdata' array */
1420 for (;s.avail_in;) {
1421 s.next_out = &cdata[cdata_pos];
1422 s.avail_out = cdata_alloc - cdata_pos;
1423 if (cdata_alloc - cdata_pos <= 0) {
1424 fprintf(stderr, "Deflate produced larger output than expected. Abort\n");
1425 return NULL;
1426 }
1427 err = deflate(&s, Z_NO_FLUSH);
1428 cdata_pos = cdata_alloc - s.avail_out;
1429 if (err != Z_OK) {
1430 fprintf(stderr, "zlib deflate error: %s\n", s.msg);
1431 break;
1432 }
1433 }
1434 if (deflate(&s, Z_FINISH) != Z_STREAM_END) {
1435 fprintf(stderr, "zlib deflate error: %s\n", s.msg);
1436 }
1437 *cdata_size = s.total_out;
1438
1439 if (deflateEnd(&s) != Z_OK) {
1440 fprintf(stderr, "zlib deflate error: %s\n", s.msg);
1441 }
1442 return (char *)cdata;
1443 }
1444
1445 #ifdef HAVE_LIBLZMA
1446 /* ------------------------------------------------------------------------ */
1447 /*
1448 * Data compression routines using liblzma (xz)
1449 *
1450 * On a test set this shrunk the main db from 136157104 bytes to 114796168, but
1451 * caused tg_index to grow from 2m43.707s to 15m3.961s. Exporting as bfastq
1452 * went from 18.3s to 36.3s. So decompression suffers too, but not as bad
1453 * as compression times.
1454 *
1455 * For now we disable this functionality. If it's to be reenabled make sure you
1456 * improve the mem_inflate implementation as it's just a test hack at the
1457 * moment.
1458 */
1459
lzma_mem_deflate(char * data,size_t size,size_t * cdata_size,int level)1460 static char *lzma_mem_deflate(char *data, size_t size, size_t *cdata_size,
1461 int level) {
1462 char *out;
1463 size_t out_size = lzma_stream_buffer_bound(size);
1464 *cdata_size = 0;
1465
1466 out = malloc(out_size);
1467
1468 /* Single call compression */
1469 if (LZMA_OK != lzma_easy_buffer_encode(level, LZMA_CHECK_CRC32, NULL,
1470 (uint8_t *)data, size,
1471 (uint8_t *)out, cdata_size,
1472 out_size))
1473 return NULL;
1474
1475 return out;
1476 }
1477
lzma_mem_inflate(char * cdata,size_t csize,size_t * size)1478 static char *lzma_mem_inflate(char *cdata, size_t csize, size_t *size) {
1479 lzma_stream strm = LZMA_STREAM_INIT;
1480 size_t out_size = 0, out_pos = 0;
1481 char *out = NULL;
1482 int r;
1483
1484 /* Initiate the decoder */
1485 if (LZMA_OK != lzma_stream_decoder(&strm, lzma_easy_decoder_memusage(9), 0))
1486 return NULL;
1487
1488 /* Decode loop */
1489 strm.avail_in = csize;
1490 strm.next_in = (uint8_t *)cdata;
1491
1492 for (;strm.avail_in;) {
1493 if (strm.avail_in > out_size - out_pos) {
1494 out_size += strm.avail_in * 4 + 32768;
1495 out = realloc(out, out_size);
1496 }
1497 strm.avail_out = out_size - out_pos;
1498 strm.next_out = (uint8_t *)&out[out_pos];
1499
1500 r = lzma_code(&strm, LZMA_RUN);
1501 if (LZMA_OK != r && LZMA_STREAM_END != r) {
1502 fprintf(stderr, "r=%d\n", r);
1503 fprintf(stderr, "mem=%"PRId64"\n", (int64_t)lzma_memusage(&strm));
1504 return NULL;
1505 }
1506
1507 out_pos = strm.total_out;
1508
1509 if (r == LZMA_STREAM_END)
1510 break;
1511 }
1512
1513 /* finish up any unflushed data; necessary? */
1514 r = lzma_code(&strm, LZMA_FINISH);
1515 if (r != LZMA_OK && r != LZMA_STREAM_END) {
1516 fprintf(stderr, "r=%d\n", r);
1517 return NULL;
1518 }
1519
1520 out = realloc(out, strm.total_out);
1521 *size = strm.total_out;
1522
1523 lzma_end(&strm);
1524
1525 return out;
1526 }
1527 #endif
1528
1529 /* ----------------------------------------------------------------------
1530 * CRAM blocks - the dynamically growable data block. We have code to
1531 * create, update, (un)compress and read/write.
1532 *
1533 * These are derived from the deflate_interlaced.c blocks, but with the
1534 * CRAM extension of content types and IDs.
1535 */
1536
1537 /*
1538 * Allocates a new cram_block structure with a specified content_type and
1539 * id.
1540 *
1541 * Returns block pointer on success
1542 * NULL on failure
1543 */
cram_new_block(enum cram_content_type content_type,int content_id)1544 cram_block *cram_new_block(enum cram_content_type content_type,
1545 int content_id) {
1546 cram_block *b = malloc(sizeof(*b));
1547 if (!b)
1548 return NULL;
1549 b->method = b->orig_method = RAW;
1550 b->content_type = content_type;
1551 b->content_id = content_id;
1552 b->comp_size = 0;
1553 b->uncomp_size = 0;
1554 b->data = NULL;
1555 b->alloc = 0;
1556 b->byte = 0;
1557 b->bit = 7; // MSB
1558 b->crc32 = 0;
1559
1560 return b;
1561 }
1562
1563 /*
1564 * Reads a block from a cram file.
1565 * Returns cram_block pointer on success.
1566 * NULL on failure
1567 */
cram_read_block(cram_fd * fd)1568 cram_block *cram_read_block(cram_fd *fd) {
1569 cram_block *b = malloc(sizeof(*b));
1570 unsigned char c;
1571 uint32_t crc = 0;
1572 if (!b)
1573 return NULL;
1574
1575 //fprintf(stderr, "Block at %d\n", (int)ftell(fd->fp));
1576
1577 if (-1 == (b->method = (c=CRAM_IO_GETC(fd)))) { free(b); return NULL; }
1578 crc = iolib_crc32(crc, &c, 1);
1579 if (-1 == (b->content_type = (c=CRAM_IO_GETC(fd)))) { free(b); return NULL; }
1580 crc = iolib_crc32(crc, &c, 1);
1581 if (-1 == itf8_decode_crc(fd, &b->content_id, &crc)) { free(b); return NULL; }
1582 if (-1 == itf8_decode_crc(fd, &b->comp_size, &crc)) { free(b); return NULL; }
1583 if (-1 == itf8_decode_crc(fd, &b->uncomp_size, &crc)) { free(b); return NULL; }
1584
1585 // fprintf(stderr, " method %d, ctype %d, cid %d, csize %d, ucsize %d\n",
1586 // b->method, b->content_type, b->content_id, b->comp_size, b->uncomp_size);
1587
1588 if (b->method == RAW) {
1589 b->alloc = b->uncomp_size;
1590 if (!(b->data = malloc(b->uncomp_size))){ free(b); return NULL; }
1591 if (b->uncomp_size != CRAM_IO_READ(b->data, 1, b->uncomp_size, fd)) {
1592 free(b->data);
1593 free(b);
1594 return NULL;
1595 }
1596 } else {
1597 b->alloc = b->comp_size;
1598 if (!(b->data = malloc(b->comp_size))) { free(b); return NULL; }
1599 if (b->comp_size != CRAM_IO_READ(b->data, 1, b->comp_size, fd)) {
1600 free(b->data);
1601 free(b);
1602 return NULL;
1603 }
1604 }
1605
1606 if (IS_CRAM_3_VERS(fd)) {
1607 if (-1 == int32_decode(fd, (int32_t *)&b->crc32)) {
1608 free(b);
1609 return NULL;
1610 }
1611
1612 // Check later, if and only if we do decompression of this block
1613 b->crc32_checked = fd->ignore_md5;
1614 b->crc_part = crc;
1615 } else {
1616 b->crc32_checked = 1; // CRC not present
1617 }
1618
1619 b->orig_method = b->method;
1620 b->idx = 0;
1621 b->byte = 0;
1622 b->bit = 7; // MSB
1623
1624 return b;
1625 }
1626
1627 /*
1628 * Writes a CRAM block.
1629 * Returns 0 on success
1630 * -1 on failure
1631 */
cram_write_block(cram_fd * fd,cram_block * b)1632 int cram_write_block(cram_fd *fd, cram_block *b) {
1633 assert(b->method != RAW || (b->comp_size == b->uncomp_size));
1634
1635 if (CRAM_IO_PUTC(b->method, fd) == EOF) return -1;
1636 if (CRAM_IO_PUTC(b->content_type, fd) == EOF) return -1;
1637 if (itf8_encode(fd, b->content_id) == -1) return -1;
1638 if (itf8_encode(fd, b->comp_size) == -1) return -1;
1639 if (itf8_encode(fd, b->uncomp_size) == -1) return -1;
1640
1641 if (b->method == RAW) {
1642 if (b->uncomp_size != CRAM_IO_WRITE(b->data, 1, b->uncomp_size, fd))
1643 return -1;
1644 } else {
1645 if (b->comp_size != CRAM_IO_WRITE(b->data, 1, b->comp_size, fd))
1646 return -1;
1647 }
1648
1649 if (IS_CRAM_3_VERS(fd)) {
1650 unsigned char dat[100], *cp = dat;;
1651 uint32_t crc;
1652
1653 *cp++ = b->method;
1654 *cp++ = b->content_type;
1655 cp += itf8_put(cp, b->content_id);
1656 cp += itf8_put(cp, b->comp_size);
1657 cp += itf8_put(cp, b->uncomp_size);
1658 crc = iolib_crc32(0L, dat, cp-dat);
1659
1660 if (!b->crc32) {
1661 if (b->method == RAW) {
1662 b->crc32 = iolib_crc32(crc, b->data ? b->data : (uc*)"", b->uncomp_size);
1663 } else {
1664 b->crc32 = iolib_crc32(crc, b->data ? b->data : (uc*)"", b->comp_size);
1665 }
1666 }
1667
1668 if (-1 == int32_encode(fd, b->crc32))
1669 return -1;
1670 }
1671
1672 return 0;
1673 }
1674
1675 /*
1676 * Frees a CRAM block, deallocating internal data too.
1677 */
cram_free_block(cram_block * b)1678 void cram_free_block(cram_block *b) {
1679 if (!b)
1680 return;
1681 if (b->data)
1682 free(b->data);
1683 free(b);
1684 }
1685
1686 #ifdef HAVE_LIBBSC
1687 #define BSC_FEATURES LIBBSC_FEATURE_FASTMODE
1688 pthread_once_t bsc_once = PTHREAD_ONCE_INIT;
bsc_init_once(void)1689 static void bsc_init_once(void) {
1690 bsc_init(BSC_FEATURES);
1691 }
1692 #endif
1693
1694 /*
1695 * Uncompresses a CRAM block, if compressed.
1696 */
cram_uncompress_block(cram_block * b)1697 int cram_uncompress_block(cram_block *b) {
1698 char *uncomp;
1699 size_t uncomp_size = 0;
1700
1701 if (b->crc32_checked == 0) {
1702 uint32_t crc = iolib_crc32(b->crc_part, b->data ? b->data : (uc *)"", b->alloc);
1703 b->crc32_checked = 1;
1704 if (crc != b->crc32) {
1705 fprintf(stderr, "Block CRC32 failure\n");
1706 return -1;
1707 }
1708 }
1709
1710 if (b->uncomp_size == 0) {
1711 // blank block
1712 b->method = RAW;
1713 return 0;
1714 }
1715
1716 switch (b->method) {
1717 case RAW:
1718 //b->uncomp_size = b->comp_size;
1719 return 0;
1720
1721 case GZIP:
1722 uncomp = zlib_mem_inflate((char *)b->data, b->comp_size, &uncomp_size);
1723 if (!uncomp)
1724 return -1;
1725 if ((int)uncomp_size != b->uncomp_size) {
1726 free(uncomp);
1727 return -1;
1728 }
1729 free(b->data);
1730 b->data = (unsigned char *)uncomp;
1731 b->alloc = uncomp_size;
1732 b->method = RAW;
1733 break;
1734
1735 #ifdef HAVE_LIBBZ2
1736 case BZIP2: {
1737 unsigned int usize = b->uncomp_size;
1738 if (!(uncomp = malloc(usize)))
1739 return -1;
1740 if (BZ_OK != BZ2_bzBuffToBuffDecompress(uncomp, &usize,
1741 (char *)b->data, b->comp_size,
1742 0, 0)) {
1743 free(uncomp);
1744 return -1;
1745 }
1746 free(b->data);
1747 b->data = (unsigned char *)uncomp;
1748 b->alloc = usize;
1749 b->method = RAW;
1750 b->uncomp_size = usize; // Just incase it differs
1751 break;
1752 }
1753 #else
1754 case BZIP2:
1755 fprintf(stderr, "Bzip2 compression is not compiled into this "
1756 "version.\nPlease rebuild and try again.\n");
1757 return -1;
1758 #endif
1759
1760 #ifdef HAVE_LIBBSC
1761 case BSC: {
1762 int block_size, data_size;
1763
1764 pthread_once(&bsc_once, bsc_init_once);
1765
1766 if (bsc_block_info(b->data, LIBBSC_HEADER_SIZE,
1767 &block_size, &data_size,
1768 BSC_FEATURES) != LIBBSC_NO_ERROR)
1769 return -1;
1770
1771 uncomp_size = data_size;
1772 if (!(uncomp = malloc(data_size)))
1773 return -1;
1774
1775 assert(block_size == b->comp_size);
1776 if (bsc_decompress(b->data, block_size,
1777 (unsigned char *)uncomp, data_size,
1778 BSC_FEATURES) != LIBBSC_NO_ERROR) {
1779 free(uncomp);
1780 return -1;
1781 }
1782
1783 free(b->data);
1784 b->data = (unsigned char *)uncomp;
1785 b->alloc = data_size;
1786 b->method = RAW;
1787 b->uncomp_size = data_size; // Just incase it differs
1788 break;
1789 }
1790 #else
1791 case BSC:
1792 fprintf(stderr, "Libbsc compression is not compiled into this "
1793 "version.\nPlease rebuild and try again.\n");
1794 return -1;
1795 #endif
1796
1797 #ifdef HAVE_FQZ
1798 case FQZ: {
1799 uncomp_size = b->uncomp_size;
1800 uncomp = fqz_decompress((char *)b->data, b->comp_size, &uncomp_size);
1801 if (!uncomp)
1802 return -1;
1803 free(b->data);
1804 b->data = (unsigned char *)uncomp;
1805 b->alloc = uncomp_size;
1806 b->method = RAW;
1807 break;
1808 }
1809 #else
1810 case FQZ:
1811 fprintf(stderr, "Fqzcomp compression is not compiled into this "
1812 "version.\nPlease rebuild and try again.\n");
1813 return -1;
1814 #endif
1815
1816 #ifdef HAVE_LIBLZMA
1817 case LZMA:
1818 uncomp = lzma_mem_inflate((char *)b->data, b->comp_size, &uncomp_size);
1819 if (!uncomp)
1820 return -1;
1821 if ((int)uncomp_size != b->uncomp_size)
1822 return -1;
1823 free(b->data);
1824 b->data = (unsigned char *)uncomp;
1825 b->alloc = uncomp_size;
1826 b->method = RAW;
1827 break;
1828 #else
1829 case LZMA:
1830 fprintf(stderr, "Lzma compression is not compiled into this "
1831 "version.\nPlease rebuild and try again.\n");
1832 return -1;
1833 break;
1834 #endif
1835
1836 case RANS0: {
1837 unsigned int usize = b->uncomp_size, usize2;
1838 if (*b->data == 1) b->orig_method = RANS1; // useful in debugging
1839 uncomp = (char *)rans_uncompress(b->data, b->comp_size, &usize2, 0);
1840 if (!uncomp || usize != usize2)
1841 return -1;
1842 b->orig_method = b->data[0]&1 ? RANS1 : RANS0;
1843 free(b->data);
1844 b->data = (unsigned char *)uncomp;
1845 b->alloc = usize2;
1846 b->method = RAW;
1847 b->uncomp_size = usize2; // Just incase it differs
1848 //fprintf(stderr, "Expanded %d to %d\n", b->comp_size, b->uncomp_size);
1849 break;
1850 }
1851
1852 case RANS_PR0: {
1853 unsigned int usize = b->uncomp_size, usize2;
1854 uncomp = (char *)rans_uncompress_4x16(b->data, b->comp_size, &usize2, 0);
1855 if (!uncomp || usize != usize2)
1856 return -1;
1857 b->orig_method = RANS_PR0 + (b->data[0]&1)
1858 + 2*((b->data[0]&0x40)>0) + 4*((b->data[0]&0x80)>0);
1859 free(b->data);
1860 b->data = (unsigned char *)uncomp;
1861 b->alloc = usize2;
1862 b->method = RAW;
1863 b->uncomp_size = usize2; // Just incase it differs
1864 //fprintf(stderr, "Expanded %d to %d\n", b->comp_size, b->uncomp_size);
1865 break;
1866 }
1867
1868 case NAME_TOK3: {
1869 int out_len;
1870 uint8_t *cp = decode_names(b->data, b->comp_size, &out_len);
1871 b->orig_method = NAME_TOK3;
1872 b->method = RAW;
1873 free(b->data);
1874 b->data = cp;
1875 b->alloc = out_len;
1876 b->uncomp_size = out_len;
1877 break;
1878 }
1879
1880
1881
1882 default:
1883 return -1;
1884 }
1885
1886 return 0;
1887 }
1888
1889 #define EBASE 65536
1890 //static double entropy16(unsigned short *data, int len) {
1891 // double E[EBASE];
1892 // double P[EBASE];
1893 // double e = 0;
1894 // int i;
1895 //
1896 // for (i = 0; i < EBASE; i++)
1897 // P[i] = 0;
1898 //
1899 // for (i = 0; i < len; i++)
1900 // P[data[i]]++;
1901 //
1902 // for (i = 0; i < EBASE; i++) {
1903 // if (P[i]) {
1904 // P[i] /= len;
1905 // E[i] = -(log(P[i])/log(EBASE));
1906 // } else {
1907 // E[i] = 0;
1908 // }
1909 // }
1910 //
1911 // for (e = i = 0; i < len; i++)
1912 // e += E[data[i]];
1913 //
1914 // return e * log(EBASE)/log(256);
1915 //}
1916 //
1917 //#define EBASE2 256
1918 //static double entropy8(unsigned char *data, int len) {
1919 // int F[EBASE2];
1920 // double e = 0;
1921 // int i;
1922 //
1923 // for (i = 0; i < EBASE2; i++)
1924 // F[i] = 0;
1925 //
1926 // for (i = 0; i < len; i++)
1927 // F[data[i]]++;
1928 //
1929 // for (i = 0; i < EBASE2; i++) {
1930 // if (F[i]) {
1931 // e += -log((double)F[i]/len) * F[i];
1932 // }
1933 // }
1934 //
1935 // return e / log(EBASE2);
1936 //}
1937
cram_compress_by_method(cram_slice * s,char * in,size_t in_size,size_t * out_size,enum cram_block_method method,int level,int strat)1938 static char *cram_compress_by_method(cram_slice *s, char *in, size_t in_size,
1939 size_t *out_size,
1940 enum cram_block_method method,
1941 int level, int strat) {
1942 switch (method) {
1943 case GZIP:
1944 case GZIP_RLE:
1945 case GZIP_1:
1946 return zlib_mem_deflate(in, in_size, out_size, level, strat);
1947
1948 case BZIP2: {
1949 #ifdef HAVE_LIBBZ2
1950 unsigned int comp_size = in_size*1.01 + 600;
1951 char *comp = malloc(comp_size);
1952 if (!comp)
1953 return NULL;
1954
1955 if (BZ_OK != BZ2_bzBuffToBuffCompress(comp, &comp_size,
1956 in, in_size,
1957 level, 0, 30)) {
1958 free(comp);
1959 return NULL;
1960 }
1961 *out_size = comp_size;
1962 return comp;
1963 #else
1964 return NULL;
1965 #endif
1966 }
1967
1968 case BSC: {
1969 #ifdef HAVE_LIBBSC
1970 unsigned int comp_size = in_size*1.01 + LIBBSC_HEADER_SIZE;
1971 char *comp = malloc(comp_size);
1972 if (!comp)
1973 return NULL;
1974
1975 pthread_once(&bsc_once, bsc_init_once);
1976
1977 *out_size = bsc_compress((unsigned char *)in,
1978 (unsigned char *)comp,
1979 in_size,
1980 LIBBSC_DEFAULT_LZPHASHSIZE,
1981 LIBBSC_DEFAULT_LZPMINLEN,
1982 LIBBSC_BLOCKSORTER_BWT,
1983 level >= 5
1984 ? LIBBSC_CODER_QLFC_ADAPTIVE // maybe 50% slower?
1985 : LIBBSC_CODER_QLFC_STATIC,
1986 BSC_FEATURES);
1987
1988 assert(comp_size >= *out_size);
1989
1990 return comp;
1991 #else
1992 return NULL;
1993 #endif
1994 }
1995
1996 case FQZ:
1997 #ifdef HAVE_FQZ
1998 return fqz_compress(strat, s, in, in_size, out_size, level);
1999 #else
2000 return NULL;
2001 #endif
2002
2003 case LZMA:
2004 #ifdef HAVE_LIBLZMA
2005 return lzma_mem_deflate(in, in_size, out_size, level);
2006 #else
2007 return NULL;
2008 #endif
2009
2010 case RANS0:
2011 case RANS1: {
2012 unsigned int out_size_i;
2013 unsigned char *cp;
2014
2015 cp = rans_compress((unsigned char *)in, in_size, &out_size_i,
2016 method == RANS0 ? 0 : 1);
2017 *out_size = out_size_i;
2018 return (char *)cp;
2019 }
2020
2021 case RANS_PR0:
2022 case RANS_PR1:
2023 case RANS_PR64:
2024 case RANS_PR65:
2025 case RANS_PR128:
2026 case RANS_PR129:
2027 case RANS_PR192:
2028 case RANS_PR193: {
2029 unsigned int out_size_i;
2030 unsigned char *cp;
2031
2032 // see enum cram_block. We map RANS_* methods to order bit-fields
2033 static int methmap[] = {
2034 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // non-ranspr
2035 0,1, 64,65, 128,129, 192,193
2036 };
2037
2038 cp = rans_compress_4x16((unsigned char *)in, in_size, &out_size_i, methmap[method]);
2039 *out_size = out_size_i;
2040 return (char *)cp;
2041 }
2042
2043 case NAME_TOK3: {
2044 int out_len;
2045 uint8_t *cp = encode_names(in, in_size, &out_len, NULL);
2046 *out_size = out_len;
2047 return (char *)cp;
2048 }
2049
2050 case RAW:
2051 break;
2052
2053 default:
2054 return NULL;
2055 }
2056
2057 return NULL;
2058 }
2059
2060 /*
2061 * Compresses a block using one of two different zlib strategies. If we only
2062 * want one choice set strat2 to be -1.
2063 *
2064 * The logic here is that sometimes Z_RLE does a better job than Z_FILTERED
2065 * or Z_DEFAULT_STRATEGY on quality data. If so, we'd rather use it as it is
2066 * significantly faster.
2067 */
cram_compress_block(cram_fd * fd,cram_slice * s,cram_block * b,cram_metrics * metrics,int method,int level)2068 int cram_compress_block(cram_fd *fd, cram_slice *s,
2069 cram_block *b, cram_metrics *metrics,
2070 int method, int level) {
2071
2072 char *comp = NULL;
2073 size_t comp_size = 0;
2074 int strat = 0;
2075
2076 // Internally we have parameterised methods that externally map
2077 // to the same CRAM method value.
2078 // See enum_cram_block_method.
2079 int methmap[] = {
2080 RAW, GZIP, BZIP2, LZMA, RANS0, BSC, FQZ,
2081 BM_ERROR, BM_ERROR, BM_ERROR,
2082 RANS0, // RANS1
2083 GZIP, GZIP, // GZIP_RLE and GZIP_1
2084 RANS_PR0, RANS_PR0, RANS_PR0, RANS_PR0, // RANS_PR1-193
2085 RANS_PR0, RANS_PR0, RANS_PR0, RANS_PR0,
2086 NAME_TOK3,
2087 };
2088
2089 if (b->method != RAW) {
2090 // Maybe already compressed if s->block[0] was compressed and
2091 // we have e.g. s->block[DS_BA] set to s->block[0] due to only
2092 // one base type present and hence using E_HUFFMAN on block 0.
2093 // A second explicit attempt to compress the same block then
2094 // occurs.
2095 return 0;
2096 }
2097
2098 //fprintf(stderr, "IN: block %d, sz %d\n", b->content_id, b->uncomp_size);
2099
2100 if (method == RAW || level == 0 || b->uncomp_size == 0) {
2101 b->method = RAW;
2102 b->comp_size = b->uncomp_size;
2103 //fprintf(stderr, "Skip block id %d\n", b->content_id);
2104 return 0;
2105 }
2106
2107 if (metrics) {
2108 if (fd->metrics_lock) pthread_mutex_lock(fd->metrics_lock);
2109 if (fd->unsorted == 2)
2110 metrics->next_trial = 0; // force recheck on mode switch.
2111 if (metrics->trial > 0 || --metrics->next_trial <= 0) {
2112 int m;
2113 size_t sz_best = INT_MAX;
2114 size_t sz[CRAM_MAX_METHOD] = {0};
2115 int method_best = 0;
2116 char *c_best = NULL, *c = NULL;
2117
2118 if (metrics->revised_method)
2119 method = metrics->revised_method;
2120 else
2121 metrics->revised_method = method;
2122
2123 if (metrics->next_trial <= 0) {
2124 metrics->next_trial = TRIAL_SPAN;
2125 metrics->trial = NTRIALS;
2126 for (m = 0; m < CRAM_MAX_METHOD; m++)
2127 metrics->sz[m] /= 2;
2128 }
2129
2130 if (fd->metrics_lock) pthread_mutex_unlock(fd->metrics_lock);
2131
2132 // Compress this block using the best method
2133 if (metrics->stats && metrics->stats->nvals > 16) {
2134 // No point trying bit-pack if 17+ symbols.
2135 if (method & (1<<RANS_PR128))
2136 method = (method|(1<<RANS_PR0))&~(1<<RANS_PR128);
2137 if (method & (1<<RANS_PR129))
2138 method = (method|(1<<RANS_PR1))&~(1<<RANS_PR129);
2139 if (method & (1<<RANS_PR192))
2140 method = (method|(1<<RANS_PR64))&~(1<<RANS_PR192);
2141 if (method & (1<<RANS_PR193))
2142 method = (method|(1<<RANS_PR65))&~(1<<RANS_PR193);
2143 }
2144 for (m = 0; m < CRAM_MAX_METHOD; m++) {
2145 if (method & (1<<m)) {
2146 int lvl = level;
2147 switch (m) {
2148 case GZIP: strat = Z_FILTERED; break;
2149 case GZIP_1: strat = Z_DEFAULT_STRATEGY; lvl = 1; break;
2150 case GZIP_RLE: strat = Z_RLE; break;
2151 case FQZ: strat = CRAM_MAJOR_VERS(fd->version); break;
2152 default: strat = 0;
2153 }
2154
2155 c = cram_compress_by_method(s, (char *)b->data, b->uncomp_size,
2156 &sz[m], m, lvl, strat);
2157 if (fd->verbose > 1)
2158 fprintf(stderr, "Try compression of block ID %d from %d to %d by method %s, strat %d\n",
2159 b->content_id, b->uncomp_size, (int)sz[m], cram_block_method2str(m), strat);
2160
2161 if (c && sz_best > sz[m]) {
2162 sz_best = sz[m];
2163 method_best = m;
2164 if (c_best)
2165 free(c_best);
2166 c_best = c;
2167 } else if (c) {
2168 free(c);
2169 } else {
2170 sz[m] = b->uncomp_size*2+1000; // arbitrarily worse than raw
2171 }
2172 }
2173 }
2174
2175 //fprintf(stderr, "sz_best = %d\n", sz_best);
2176
2177 free(b->data);
2178 b->data = (unsigned char *)c_best;
2179 //printf("method_best = %s\n", cram_block_method2str(method_best));
2180
2181 b->method = method_best;
2182 b->comp_size = sz_best;
2183 assert(sz_best != INT_MAX);
2184
2185 // Accumulate stats for all methods tried
2186 if (fd->metrics_lock) pthread_mutex_lock(fd->metrics_lock);
2187 for (m = 0; m < CRAM_MAX_METHOD; m++)
2188 metrics->sz[m] += sz[m];
2189
2190 // When enough trials performed, find the best on average
2191 if (--metrics->trial == 0) {
2192 int best_method = RAW;
2193 int best_sz = INT_MAX;
2194
2195 // Relative costs of methods. See enum_cram_block_method.
2196 double meth_cost[32] = {
2197 1, // raw
2198 1.04, // gzip
2199 1.08, // bzip2
2200 1.10, // lzma
2201 1.00, // rans0
2202 1.09, // bsc
2203 1.05, // fqz
2204 1,1,1, // unused
2205 1.02, // rans1
2206 1.00, // gzip rle
2207 1.02, // gzip -1
2208 1.00, // rans_pr0
2209 1.03, // rans_pr1
2210 1.00, // rans_pr64; if smaller, usually fast
2211 1.00, // rans_pr65
2212 1.00, // rans_pr128
2213 1.00, // rans_pr129
2214 1.00, // rans_pr192
2215 1.00, // rans_pr193
2216 1,1,1,1,1,1,1,1,1,1,1 // unused
2217 };
2218
2219 // Scale methods by cost based on compression level
2220 if (fd->level <= 1) {
2221 for (m = 0; m < CRAM_MAX_METHOD; m++)
2222 metrics->sz[m] *= 1+(meth_cost[m]-1)*4;
2223 } else if (fd->level <= 3) {
2224 for (m = 0; m < CRAM_MAX_METHOD; m++)
2225 metrics->sz[m] *= 1+(meth_cost[m]-1);
2226 } else if (fd->level <= 6) {
2227 for (m = 0; m < CRAM_MAX_METHOD; m++)
2228 metrics->sz[m] *= 1+(meth_cost[m]-1)/2;
2229 } // else cost is ignored
2230
2231 for (m = 0; m < CRAM_MAX_METHOD; m++) {
2232 if ((!metrics->sz[m]) || (!(method & (1<<m))))
2233 continue;
2234
2235 if (best_sz > metrics->sz[m])
2236 best_sz = metrics->sz[m], best_method = m;
2237 }
2238
2239 if (fd->verbose > 1)
2240 fprintf(stderr, "Choosing method %s (was %s), strat %d for block ID %d\n",
2241 cram_block_method2str(best_method),
2242 cram_block_method2str(metrics->method),
2243 strat, b->content_id);
2244
2245 if (best_method != metrics->method) {
2246 metrics->trial = (NTRIALS+1)/2; // be sure
2247 //metrics->next_trial /= 1.5;
2248 metrics->consistency = 0;
2249 } else {
2250 metrics->next_trial *= MIN(2, 1+metrics->consistency/4.0);
2251 metrics->consistency++;
2252 }
2253
2254 metrics->method = best_method;
2255 switch (best_method) {
2256 case GZIP: strat = Z_FILTERED; break;
2257 case GZIP_1: strat = Z_DEFAULT_STRATEGY; break;
2258 case GZIP_RLE: strat = Z_RLE; break;
2259 case FQZ: strat = CRAM_MAJOR_VERS(fd->version); break;
2260 default: strat = 0;
2261 }
2262 metrics->strat = strat;
2263
2264 // If we see at least MAXFAIL trials in a row for a specific
2265 // compression method with more than MAXDELTA aggregate
2266 // size then we drop this from the list of methods used
2267 // for this block type.
2268 #define MAXDELTA 0.20
2269 #define MAXFAILS 4
2270 for (m = 0; m < CRAM_MAX_METHOD; m++) {
2271 if (best_method == m) {
2272 metrics->cnt[m] = 0;
2273 metrics->extra[m] = 0;
2274 } else if (best_sz < metrics->sz[m]) {
2275 double r = (double)metrics->sz[m] / best_sz - 1;
2276 if (++metrics->cnt[m] >= MAXFAILS &&
2277 (metrics->extra[m] += r) >= MAXDELTA)
2278 method &= ~(1<<m);
2279 }
2280 }
2281
2282 if (fd->verbose > 1 && method != metrics->revised_method)
2283 fprintf(stderr, "%d: revising method from %x to %x\n",
2284 b->content_id, metrics->revised_method, method);
2285 metrics->revised_method = method;
2286 }
2287 if (fd->metrics_lock) pthread_mutex_unlock(fd->metrics_lock);
2288 } else {
2289 strat = metrics->strat;
2290 method = metrics->method;
2291
2292 if (fd->metrics_lock) pthread_mutex_unlock(fd->metrics_lock);
2293 comp = cram_compress_by_method(s, (char *)b->data, b->uncomp_size,
2294 &comp_size, method,
2295 method == GZIP_1 ? 1 : level,
2296 strat);
2297 if (!comp)
2298 return -1;
2299
2300 if (comp_size < b->uncomp_size) {
2301 free(b->data);
2302 b->data = (unsigned char *)comp;
2303 b->comp_size = comp_size;
2304 b->method = method;
2305 } else {
2306 free(comp);
2307 }
2308 }
2309
2310 } else {
2311 // no cached metrics, so just do zlib?
2312 comp = cram_compress_by_method(s, (char *)b->data, b->uncomp_size,
2313 &comp_size, GZIP, level, Z_FILTERED);
2314 if (!comp) {
2315 fprintf(stderr, "Compression failed!\n");
2316 return -1;
2317 }
2318
2319 if (comp_size < b->uncomp_size) {
2320 free(b->data);
2321 b->data = (unsigned char *)comp;
2322 b->comp_size = comp_size;
2323 b->method = GZIP;
2324 } else {
2325 free(comp);
2326 }
2327 strat = Z_FILTERED;
2328 }
2329
2330 if (fd->verbose)
2331 fprintf(stderr, "Compressed block ID %d from %d to %d by method %s strat %d\n",
2332 b->content_id, b->uncomp_size, b->comp_size,
2333 cram_block_method2str(b->method), strat);
2334
2335 b->method = methmap[b->method];
2336
2337 return 0;
2338 }
2339
cram_new_metrics(void)2340 cram_metrics *cram_new_metrics(void) {
2341 cram_metrics *m = calloc(1, sizeof(*m));
2342 if (!m)
2343 return NULL;
2344 m->trial = NTRIALS;
2345 m->next_trial = TRIAL_SPAN/2; // learn quicker at start
2346 m->method = RAW;
2347 m->strat = 0;
2348 m->revised_method = 0;
2349 m->consistency = 0;
2350 m->stats = NULL;
2351
2352 return m;
2353 }
2354
cram_block_method2str(enum cram_block_method m)2355 char *cram_block_method2str(enum cram_block_method m) {
2356 switch(m) {
2357 case RAW: return "RAW";
2358 case GZIP: return "GZIP";
2359 case BZIP2: return "BZIP2";
2360 case BSC: return "BSC";
2361 case FQZ: return "FQZ";
2362 case LZMA: return "LZMA";
2363 case RANS0: return "RANS0";
2364 case RANS1: return "RANS1";
2365 case GZIP_RLE: return "GZIP_RLE";
2366 case GZIP_1: return "GZIP-1";
2367 case RANS_PR0: return "RANS_PR0";
2368 case RANS_PR1: return "RANS_PR1";
2369 case RANS_PR64: return "RANS_PR64";
2370 case RANS_PR65: return "RANS_PR65";
2371 case RANS_PR128: return "RANS_PR128";
2372 case RANS_PR129: return "RANS_PR129";
2373 case RANS_PR192: return "RANS_PR192";
2374 case RANS_PR193: return "RANS_PR193";
2375 default: break;
2376 }
2377 return "?";
2378 }
2379
cram_content_type2str(enum cram_content_type t)2380 char *cram_content_type2str(enum cram_content_type t) {
2381 switch (t) {
2382 case FILE_HEADER: return "FILE_HEADER";
2383 case COMPRESSION_HEADER: return "COMPRESSION_HEADER";
2384 case MAPPED_SLICE: return "MAPPED_SLICE";
2385 case UNMAPPED_SLICE: return "UNMAPPED_SLICE";
2386 case EXTERNAL: return "EXTERNAL";
2387 case CORE: return "CORE";
2388 case CT_ERROR: break;
2389 }
2390 return "?";
2391 }
2392
2393 /*
2394 * Extra error checking on fclose to really ensure data is written.
2395 * Care needs to be taken to handle pipes vs real files.
2396 *
2397 * Returns 0 on success
2398 * -1 on failure.
2399 */
paranoid_fclose(FILE * fp)2400 int paranoid_fclose(FILE *fp) {
2401 if (-1 == fflush(fp) && errno != EBADF) {
2402 fclose(fp);
2403 return -1;
2404 }
2405
2406 errno = 0;
2407 #ifdef HAVE_FSYNC
2408 if (-1 == fsync(fileno(fp))) {
2409 if (errno != EINVAL) { // eg pipe
2410 fclose(fp);
2411 return -1;
2412 }
2413 }
2414 #endif
2415
2416 return fclose(fp);
2417 }
2418
2419 /* ----------------------------------------------------------------------
2420 * Reference sequence handling
2421 *
2422 * These revolve around the refs_t structure, which may potentially be
2423 * shared between multiple cram_fd.
2424 *
2425 * We start with refs_create() to allocate an empty refs_t and then
2426 * populate it with @SQ line data using refs_from_header(). This is done on
2427 * cram_open(). Also at start up we can call cram_load_reference() which
2428 * is used with "scramble -r foo.fa". This replaces the fd->refs with the
2429 * new one specified. In either case refs2id() is then called which
2430 * maps ref_entry names to @SQ ids (refs_t->ref_id[]).
2431 *
2432 * Later, possibly within a thread, we will want to know the actual ref
2433 * seq itself, obtained by calling cram_get_ref(). This may use the
2434 * UR: or M5: fields or the filename specified in the original
2435 * cram_load_reference() call.
2436 *
2437 * Given the potential for multi-threaded reference usage, we have
2438 * reference counting (sorry for the confusing double use of "ref") to
2439 * track the number of callers interested in any specific reference.
2440 */
2441
2442 /*
2443 * Frees/unmaps a reference sequence and associated file handles.
2444 */
ref_entry_free_seq(ref_entry * e)2445 static void ref_entry_free_seq(ref_entry *e) {
2446 if (e->mf)
2447 mfclose(e->mf);
2448 if (e->seq && !e->mf)
2449 free(e->seq);
2450
2451 e->seq = NULL;
2452 e->mf = NULL;
2453 }
2454
refs_free(refs_t * r)2455 void refs_free(refs_t *r) {
2456 RP("refs_free()\n");
2457
2458 if (--r->count > 0)
2459 return;
2460
2461 if (!r)
2462 return;
2463
2464 if (r->pool)
2465 string_pool_destroy(r->pool);
2466
2467 if (r->h_meta) {
2468 HashIter *iter = HashTableIterCreate();
2469 HashItem *hi;
2470
2471 while ((hi = HashTableIterNext(r->h_meta, iter))) {
2472 ref_entry *e = (ref_entry *)hi->data.p;
2473 if (!e)
2474 continue;
2475 ref_entry_free_seq(e);
2476 free(e);
2477 }
2478
2479 HashTableIterDestroy(iter);
2480 HashTableDestroy(r->h_meta, 0);
2481 }
2482
2483 if (r->ref_id)
2484 free(r->ref_id);
2485
2486 if (r->fp)
2487 bzi_close(r->fp);
2488
2489 pthread_mutex_destroy(&r->lock);
2490
2491 free(r);
2492 }
2493
refs_create(void)2494 static refs_t *refs_create(void) {
2495 refs_t *r = calloc(1, sizeof(*r));
2496
2497 RP("refs_create()\n");
2498
2499 if (!r)
2500 return NULL;
2501
2502 if (!(r->pool = string_pool_create(8192)))
2503 goto err;
2504
2505 r->ref_id = NULL; // see refs2id() to populate.
2506 r->count = 1;
2507 r->last = NULL;
2508 r->last_id = -1;
2509
2510 r->h_meta = HashTableCreate(16, HASH_DYNAMIC_SIZE | HASH_NONVOLATILE_KEYS);
2511 if (!r->h_meta)
2512 goto err;
2513
2514 pthread_mutex_init(&r->lock, NULL);
2515
2516 return r;
2517
2518 err:
2519 refs_free(r);
2520 return NULL;
2521 }
2522
2523 /*
2524 * Loads a FAI file for a reference.fasta.
2525 * "is_err" indicates whether failure to load is worthy of emitting an
2526 * error message. In some cases (eg with embedded references) we
2527 * speculatively load, just incase, and silently ignore errors.
2528 *
2529 * Returns the refs_t struct on success (maybe newly allocated);
2530 * NULL on failure
2531 */
refs_load_fai(refs_t * r_orig,char * fn,int is_err)2532 refs_t *refs_load_fai(refs_t *r_orig, char *fn, int is_err) {
2533 struct stat sb;
2534 FILE *fp = NULL;
2535 HashData hd;
2536 char fai_fn[PATH_MAX];
2537 char line[8192];
2538 refs_t *r = r_orig;
2539 int id = 0, id_alloc = 0;
2540
2541 RP("refs_load_fai %s\n", fn);
2542
2543 if (!r)
2544 if (!(r = refs_create()))
2545 goto err;
2546
2547 /* Open reference, for later use */
2548 if (stat(fn, &sb) != 0) {
2549 if (is_err)
2550 perror(fn);
2551 goto err;
2552 }
2553
2554 if (r->fp)
2555 bzi_close(r->fp);
2556 r->fp = NULL;
2557
2558 if (!(r->fn = string_dup(r->pool, fn)))
2559 goto err;
2560
2561 if (!(r->fp = bzi_open(fn, "r"))) {
2562 if (is_err)
2563 perror(fn);
2564 goto err;
2565 }
2566
2567 /* Parse .fai file and load meta-data */
2568 sprintf(fai_fn, "%.*s.fai", PATH_MAX-5, fn);
2569 if (stat(fai_fn, &sb) != 0) {
2570 if (is_err)
2571 perror(fai_fn);
2572 goto err;
2573 }
2574 if (!(fp = fopen(fai_fn, "r"))) {
2575 if (is_err)
2576 perror(fai_fn);
2577 goto err;
2578 }
2579 while (fgets(line, 8192, fp) != NULL) {
2580 ref_entry *e = malloc(sizeof(*e));
2581 char *cp;
2582 int n;
2583 HashItem *hi;
2584
2585 if (!e)
2586 return NULL;
2587
2588 // id
2589 for (cp = line; *cp && !isspace(*cp); cp++)
2590 ;
2591 *cp++ = 0;
2592 e->name = string_dup(r->pool, line);
2593
2594 // length
2595 while (*cp && isspace(*cp))
2596 cp++;
2597 e->length = strtoll(cp, &cp, 10);
2598
2599 // offset
2600 while (*cp && isspace(*cp))
2601 cp++;
2602 e->offset = strtoll(cp, &cp, 10);
2603
2604 // bases per line
2605 while (*cp && isspace(*cp))
2606 cp++;
2607 e->bases_per_line = strtol(cp, &cp, 10);
2608
2609 // line length
2610 while (*cp && isspace(*cp))
2611 cp++;
2612 e->line_length = strtol(cp, &cp, 10);
2613
2614 // filename
2615 e->fn = r->fn;
2616
2617 e->count = 0;
2618 e->seq = NULL;
2619 e->mf = NULL;
2620
2621 hd.p = e;
2622 if (!(hi = HashTableAdd(r->h_meta, e->name, strlen(e->name), hd, &n))){
2623 free(e);
2624 return NULL;
2625 }
2626
2627 if (!n) {
2628 /* Replace old one if needed. */
2629 ref_entry *r = (ref_entry *)hi->data.p;
2630
2631 if (r && (r->count != 0 || r->length != 0)) {
2632 /* Keep old one */
2633 free(e);
2634 } else {
2635 if (r)
2636 free(r);
2637 hi->data.p = e;
2638 }
2639 }
2640
2641 if (id >= id_alloc) {
2642 int x;
2643
2644 id_alloc = id_alloc ?id_alloc*2 : 16;
2645 r->ref_id = realloc(r->ref_id, id_alloc * sizeof(*r->ref_id));
2646
2647 for (x = id; x < id_alloc; x++)
2648 r->ref_id[x] = NULL;
2649 }
2650 r->ref_id[id] = e;
2651 r->nref = ++id;
2652 }
2653
2654 RP("refs_load_fai %s END (success)\n", fn);
2655
2656 fclose(fp);
2657 return r;
2658
2659 err:
2660 RP("refs_load_fai %s END (fail)\n", fn);
2661
2662 if (fp)
2663 fclose(fp);
2664
2665 if (!r_orig)
2666 refs_free(r);
2667
2668 return NULL;
2669 }
2670
2671 /*
2672 * Verifies that the CRAM @SQ lines and .fai files match.
2673 */
sanitise_SQ_lines(cram_fd * fd)2674 static void sanitise_SQ_lines(cram_fd *fd) {
2675 int i;
2676
2677 if (!fd->header)
2678 return;
2679
2680 if (!fd->refs || !fd->refs->h_meta)
2681 return;
2682
2683 for (i = 0; i < fd->header->nref; i++) {
2684 char *name = fd->header->ref[i].name;
2685 HashItem *hi = HashTableSearch(fd->refs->h_meta, name, 0);
2686 ref_entry *r;
2687
2688 // We may have @SQ lines which have no known .fai, but do not
2689 // in themselves pose a problem because they are unused in the file.
2690 if (!hi)
2691 continue;
2692
2693 if (!(r = (ref_entry *)hi->data.p))
2694 continue;
2695
2696 if (r->length && r->length != fd->header->ref[i].len) {
2697 assert(strcmp(r->name, fd->header->ref[i].name) == 0);
2698
2699 // Should we also check MD5sums here to ensure the correct
2700 // reference was given?
2701 fprintf(stderr, "WARNING: Header @SQ length mismatch for "
2702 "ref %s, %d vs %d\n",
2703 r->name, fd->header->ref[i].len, (int)r->length);
2704
2705 // Fixing the parsed @SQ header will make MD:Z: strings work
2706 // and also stop it producing N for the sequence.
2707 fd->header->ref[i].len = r->length;
2708 }
2709 }
2710 }
2711
2712 /*
2713 * Indexes references by the order they appear in a BAM file. This may not
2714 * necessarily be the same order they appear in the fasta reference file.
2715 *
2716 * Returns 0 on success
2717 * -1 on failure
2718 */
refs2id(refs_t * r,SAM_hdr * h)2719 int refs2id(refs_t *r, SAM_hdr *h) {
2720 int i;
2721
2722 if (r->ref_id)
2723 free(r->ref_id);
2724 if (r->last)
2725 r->last = NULL;
2726
2727 r->ref_id = calloc(h->nref, sizeof(*r->ref_id));
2728 if (!r->ref_id)
2729 return -1;
2730
2731 r->nref = h->nref;
2732 for (i = 0; i < h->nref; i++) {
2733 HashItem *hi;
2734 if ((hi = HashTableSearch(r->h_meta, h->ref[i].name, 0))) {
2735 r->ref_id[i] = hi->data.p;
2736 } else {
2737 fprintf(stderr, "Unable to find ref name '%s'\n",
2738 h->ref[i].name);
2739 }
2740 }
2741
2742 return 0;
2743 }
2744
2745 /*
2746 * Generates refs_t entries based on @SQ lines in the header.
2747 * Returns 0 on success
2748 * -1 on failure
2749 */
refs_from_header(refs_t * r,cram_fd * fd,SAM_hdr * h)2750 static int refs_from_header(refs_t *r, cram_fd *fd, SAM_hdr *h) {
2751 int i, j;
2752
2753 if (!r)
2754 return -1;
2755
2756 if (!h || h->nref == 0)
2757 return 0;
2758
2759 //fprintf(stderr, "refs_from_header for %p mode %c\n", fd, fd->mode);
2760
2761 /* Existing refs are fine, as long as they're compatible with the hdr. */
2762 if (!(r->ref_id = realloc(r->ref_id, (r->nref + h->nref) * sizeof(*r->ref_id))))
2763 return -1;
2764
2765 /* Copy info from h->ref[i] over to r */
2766 for (i = 0, j = r->nref; i < h->nref; i++) {
2767 SAM_hdr_type *ty;
2768 SAM_hdr_tag *tag;
2769 HashData hd;
2770 int n;
2771
2772 if (!h->ref[i].name) {
2773 fprintf(stderr, "refs_from_header: no sequence name found for reference\n");
2774 return -1;
2775 }
2776
2777 if (HashTableSearch(r->h_meta, h->ref[i].name, strlen(h->ref[i].name)))
2778 // Ref already known about
2779 continue;
2780
2781 if (!(r->ref_id[j] = calloc(1, sizeof(ref_entry))))
2782 return -1;
2783
2784 if (!h->ref[i].name)
2785 return -1;
2786
2787 r->ref_id[j]->name = string_dup(r->pool, h->ref[i].name);
2788 r->ref_id[j]->length = 0; // marker for not yet loaded
2789
2790 /* Initialise likely filename if known */
2791 if ((ty = sam_hdr_find(h, "SQ", "SN", h->ref[i].name))) {
2792 if ((tag = sam_hdr_find_key(h, ty, "M5", NULL))) {
2793 r->ref_id[j]->fn = string_dup(r->pool, tag->str+3);
2794 //fprintf(stderr, "Tagging @SQ %s / %s\n", r->ref_id[j]->name, r->ref_id[j]->fn);
2795 }
2796 }
2797
2798 hd.p = r->ref_id[j];
2799 if (!HashTableAdd(r->h_meta, r->ref_id[j]->name,
2800 strlen(r->ref_id[j]->name), hd, &n))
2801 return -1;
2802 if (!n)
2803 return -1;
2804
2805 j++;
2806 }
2807 r->nref = j;
2808
2809 return 0;
2810 }
2811
2812 /*
2813 * Converts a directory and a filename into an expanded path, replacing %s
2814 * in directory with the filename and %[0-9]+s with portions of the filename
2815 * Any remaining parts of filename are added to the end with /%s.
2816 */
expand_cache_path(char * path,char * dir,char * fn)2817 void expand_cache_path(char *path, char *dir, char *fn) {
2818 char *cp;
2819
2820 while ((cp = strchr(dir, '%'))) {
2821 strncpy(path, dir, cp-dir);
2822 path += cp-dir;
2823
2824 if (*++cp == 's') {
2825 strcpy(path, fn);
2826 path += strlen(fn);
2827 fn += strlen(fn);
2828 cp++;
2829 } else if (*cp >= '0' && *cp <= '9') {
2830 char *endp;
2831 long l;
2832
2833 l = strtol(cp, &endp, 10);
2834 l = MIN(l, strlen(fn));
2835 if (*endp == 's') {
2836 strncpy(path, fn, l);
2837 path += l;
2838 fn += l;
2839 *path = 0;
2840 cp = endp+1;
2841 } else {
2842 *path++ = '%';
2843 *path++ = *cp++;
2844 }
2845 } else {
2846 *path++ = '%';
2847 *path++ = *cp++;
2848 }
2849 dir = cp;
2850 }
2851 strcpy(path, dir);
2852 path += strlen(dir);
2853 if (*fn && path[-1] != '/')
2854 *path++ = '/';
2855 strcpy(path, fn);
2856 }
2857
2858 /*
2859 * Make the directory containing path and any prefix directories.
2860 */
mkdir_prefix(char * path,int mode)2861 void mkdir_prefix(char *path, int mode) {
2862 char *cp = strrchr(path, '/');
2863 if (!cp)
2864 return;
2865
2866 *cp = 0;
2867 if (is_directory(path)) {
2868 *cp = '/';
2869 return;
2870 }
2871
2872 if (mkdir(path, mode) == 0) {
2873 chmod(path, mode);
2874 *cp = '/';
2875 return;
2876 }
2877
2878 mkdir_prefix(path, mode);
2879 mkdir(path, mode);
2880 chmod(path, mode);
2881 *cp = '/';
2882 }
2883
2884 /*
2885 * Queries the M5 string from the header and attempts to populate the
2886 * reference from this using the REF_PATH environment.
2887 *
2888 * Returns 0 on sucess
2889 * -1 on failure
2890 */
cram_populate_ref(cram_fd * fd,int id,ref_entry * r)2891 static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) {
2892 char *ref_path = getenv("REF_PATH");
2893 SAM_hdr_type *ty;
2894 SAM_hdr_tag *tag;
2895 char path[PATH_MAX], path_tmp[PATH_MAX+20];
2896 char *local_cache = getenv("REF_CACHE");
2897 mFILE *mf;
2898
2899 if (fd->verbose)
2900 fprintf(stderr, "cram_populate_ref on fd %p, id %d\n", fd, id);
2901
2902 if (!ref_path)
2903 ref_path = ".";
2904
2905 if (!r->name)
2906 return -1;
2907
2908 if (!(ty = sam_hdr_find(fd->header, "SQ", "SN", r->name)))
2909 return -1;
2910
2911 if (!(tag = sam_hdr_find_key(fd->header, ty, "M5", NULL)))
2912 goto no_M5;
2913
2914 if (fd->verbose)
2915 fprintf(stderr, "Querying ref %s\n", tag->str+3);
2916
2917 /* Use cache if available */
2918 if (local_cache && *local_cache) {
2919 struct stat sb;
2920 bzi_FILE *fp;
2921
2922 expand_cache_path(path, local_cache, tag->str+3);
2923
2924 if (0 == stat(path, &sb) && (fp = bzi_open(path, "r"))) {
2925 r->length = sb.st_size;
2926 r->offset = r->line_length = r->bases_per_line = 0;
2927
2928 r->fn = string_dup(fd->refs->pool, path);
2929
2930 if (fd->refs->fp)
2931 bzi_close(fd->refs->fp);
2932 fd->refs->fp = fp;
2933 fd->refs->fn = r->fn;
2934
2935 // Fall back to cram_get_ref() where it'll do the actual
2936 // reading of the file.
2937 return 0;
2938 }
2939 }
2940
2941 /* Otherwise search */
2942 if ((mf = open_path_mfile(tag->str+3, ref_path, NULL))) {
2943 size_t sz;
2944 r->seq = mfsteal(mf, &sz);
2945 if (r->seq) {
2946 r->mf = NULL;
2947 } else {
2948 // keep mf around as we couldn't detach
2949 r->seq = mf->data;
2950 r->mf = mf;
2951 }
2952 r->length = sz;
2953 } else {
2954 refs_t *refs;
2955 char *fn;
2956
2957 no_M5:
2958 /* Failed to find in search path or M5 cache, see if @SQ UR: tag? */
2959 if (!(tag = sam_hdr_find_key(fd->header, ty, "UR", NULL)))
2960 return -1;
2961
2962 fn = (strncmp(tag->str+3, "file:", 5) == 0)
2963 ? tag->str+8
2964 : tag->str+3;
2965
2966 if (fd->refs->fp) {
2967 bzi_close(fd->refs->fp);
2968 fd->refs->fp = NULL;
2969 }
2970 if (!(refs = refs_load_fai(fd->refs, fn, 0)))
2971 return -1;
2972 sanitise_SQ_lines(fd);
2973
2974 fd->refs = refs;
2975 if (fd->refs->fp) {
2976 bzi_close(fd->refs->fp);
2977 fd->refs->fp = NULL;
2978 }
2979
2980 if (!fd->refs->fn)
2981 return -1;
2982
2983 if (-1 == refs2id(fd->refs, fd->header))
2984 return -1;
2985 if (!fd->refs->ref_id || !fd->refs->ref_id[id])
2986 return -1;
2987
2988 // Local copy already, so fall back to cram_get_ref().
2989 return 0;
2990 }
2991
2992 /* Populate the local disk cache if required */
2993 if (local_cache && *local_cache) {
2994 FILE *fp;
2995 int i;
2996
2997 expand_cache_path(path, local_cache, tag->str+3);
2998 if (fd->verbose)
2999 fprintf(stderr, "Path='%s'\n", path);
3000 mkdir_prefix(path, 01777);
3001
3002 i = 0;
3003 do {
3004 sprintf(path_tmp, "%s.tmp_%d", path, /*getpid(),*/ i);
3005 i++;
3006 fp = fopen(path_tmp, "wx");
3007 } while (fp == NULL && errno == EEXIST);
3008 if (!fp) {
3009 perror(path_tmp);
3010
3011 // Not fatal - we have the data already so keep going.
3012 return 0;
3013 }
3014
3015 if (r->length != fwrite(r->seq, 1, r->length, fp)) {
3016 perror(path);
3017 }
3018 if (-1 == paranoid_fclose(fp)) {
3019 unlink(path_tmp);
3020 } else {
3021 if (0 == chmod(path_tmp, 0444))
3022 rename(path_tmp, path);
3023 else
3024 unlink(path_tmp);
3025 }
3026 }
3027
3028 return 0;
3029 }
3030
cram_ref_incr_locked(refs_t * r,int id)3031 static void cram_ref_incr_locked(refs_t *r, int id) {
3032 RP("%d INC REF %d, %d %p\n", gettid(), id, (int)(id>=0?r->ref_id[id]->count+1:-999), id>=0?r->ref_id[id]->seq:(char *)1);
3033
3034 if (id >= 0 && id >= r->nref)
3035 return;
3036
3037 if (id < 0 || !r->ref_id[id] || !r->ref_id[id]->seq)
3038 return;
3039
3040 if (r->last_id == id)
3041 r->last_id = -1;
3042
3043 ++r->ref_id[id]->count;
3044 }
3045
cram_ref_incr(refs_t * r,int id)3046 void cram_ref_incr(refs_t *r, int id) {
3047 pthread_mutex_lock(&r->lock);
3048 cram_ref_incr_locked(r, id);
3049 pthread_mutex_unlock(&r->lock);
3050 }
3051
cram_ref_decr_locked(refs_t * r,int id)3052 static void cram_ref_decr_locked(refs_t *r, int id) {
3053 RP("%d DEC REF %d, %d %p\n", gettid(), id, (int)(id>=0?r->ref_id[id]->count-1:-999), id>=0?r->ref_id[id]->seq:(char *)1);
3054
3055 if (id >= 0 && id >= r->nref)
3056 return;
3057
3058 if (id < 0 || !r->ref_id[id] || !r->ref_id[id]->seq) {
3059 assert(id < 0 || !r->ref_id[id] || r->ref_id[id]->count >= 0);
3060 return;
3061 }
3062
3063 if (--r->ref_id[id]->count <= 0) {
3064 assert(r->ref_id[id]->count == 0);
3065 if (r->last_id >= 0) {
3066 if (r->ref_id[r->last_id]->count <= 0 &&
3067 r->ref_id[r->last_id]->seq) {
3068 RP("%d FREE REF %d (%p)\n", gettid(),
3069 r->last_id, r->ref_id[r->last_id]->seq);
3070 ref_entry_free_seq(r->ref_id[r->last_id]);
3071 r->ref_id[r->last_id]->length = 0;
3072 }
3073 }
3074 r->last_id = id;
3075 }
3076 }
3077
cram_ref_decr(refs_t * r,int id)3078 void cram_ref_decr(refs_t *r, int id) {
3079 pthread_mutex_lock(&r->lock);
3080 cram_ref_decr_locked(r, id);
3081 pthread_mutex_unlock(&r->lock);
3082 }
3083
3084 /*
3085 * Used by cram_ref_load and cram_ref_get. The file handle will have
3086 * already been opened, so we can catch it. The ref_entry *e informs us
3087 * of whether this is a multi-line fasta file or a raw MD5 style file.
3088 * Either way we create a single contiguous sequence.
3089 *
3090 * Returns all or part of a reference sequence on success (malloced);
3091 * NULL on failure.
3092 */
load_ref_portion(bzi_FILE * fp,ref_entry * e,int start,int end)3093 char *load_ref_portion(bzi_FILE *fp, ref_entry *e, int start, int end) {
3094 off_t offset, len;
3095 char *seq;
3096
3097 if (end < start)
3098 end = start;
3099
3100 /*
3101 * Compute locations in file. This is trivial for the MD5 files, but
3102 * is still necessary for the fasta variants.
3103 */
3104 offset = e->line_length
3105 ? e->offset + (start-1)/e->bases_per_line * e->line_length +
3106 (start-1) % e->bases_per_line
3107 : start-1;
3108
3109 len = (e->line_length
3110 ? e->offset + (end-1)/e->bases_per_line * e->line_length +
3111 (end-1) % e->bases_per_line
3112 : end-1) - offset + 1;
3113
3114 if (0 != bzi_seek(fp, offset, SEEK_SET)) {
3115 perror("fseeko() on reference file");
3116 return NULL;
3117 }
3118
3119 if (len == 0 || !(seq = malloc(len))) {
3120 return NULL;
3121 }
3122
3123 if (len != bzi_read(seq, 1, len, fp)) {
3124 perror("fread() on reference file");
3125 free(seq);
3126 return NULL;
3127 }
3128
3129 /* Strip white-space if required. */
3130 if (len != end-start+1) {
3131 int i, j;
3132 char *cp = seq;
3133 char *cp_to;
3134
3135 for (i = j = 0; i < len; i++) {
3136 if (cp[i] >= '!' && cp[i] <= '~')
3137 cp[j++] = toupper(cp[i]);
3138 }
3139 cp_to = cp+j;
3140
3141 if (cp_to - seq != end-start+1) {
3142 fprintf(stderr, "Malformed reference file?\n");
3143 free(seq);
3144 return NULL;
3145 }
3146 } else {
3147 int i;
3148 for (i = 0; i < len; i++) {
3149 seq[i] = toupper(seq[i]);
3150 }
3151 }
3152
3153 return seq;
3154 }
3155
3156 /*
3157 * Load the entire reference 'id'.
3158 * This also increments the reference count by 1.
3159 *
3160 * Returns ref_entry on success;
3161 * NULL on failure
3162 */
cram_ref_load(refs_t * r,int id)3163 ref_entry *cram_ref_load(refs_t *r, int id) {
3164 ref_entry *e = r->ref_id[id];
3165 int start = 1, end = e->length;
3166 char *seq;
3167
3168 if (e->seq) {
3169 return e;
3170 }
3171
3172 assert(e->count == 0);
3173
3174 if (r->last) {
3175 #ifdef REF_DEBUG
3176 int idx = 0;
3177 for (idx = 0; idx < r->nref; idx++)
3178 if (r->last == r->ref_id[idx])
3179 break;
3180 RP("%d cram_ref_load DECR %d => %d\n", gettid(), idx, r->last->count-1);
3181 #endif
3182
3183 assert(r->last->count > 0);
3184 if (--r->last->count <= 0) {
3185 RP("%d FREE REF %d (%p)\n", gettid(), id, r->last->seq);
3186 if (r->last->seq)
3187 ref_entry_free_seq(r->last);
3188 }
3189 }
3190
3191 /* Open file if it's not already the current open reference */
3192 if (strcmp(r->fn, e->fn) || r->fp == NULL) {
3193 if (r->fp)
3194 bzi_close(r->fp);
3195 r->fn = e->fn;
3196 if (!(r->fp = bzi_open(r->fn, "r"))) {
3197 perror(r->fn);
3198 return NULL;
3199 }
3200 }
3201
3202 RP("%d Loading ref %d (%d..%d)\n", gettid(), id, start, end);
3203
3204 if (!(seq = load_ref_portion(r->fp, e, start, end))) {
3205 return NULL;
3206 }
3207
3208 RP("%d Loaded ref %d (%d..%d) = %p\n", gettid(), id, start, end, seq);
3209
3210 RP("%d INC REF %d, %d\n", gettid(), id, (int)(e->count+1));
3211 e->seq = seq;
3212 e->mf = NULL;
3213 e->count++;
3214
3215 /*
3216 * Also keep track of last used ref so incr/decr loops on the same
3217 * sequence don't cause load/free loops.
3218 */
3219 RP("%d cram_ref_load INCR %d => %d\n", gettid(), id, e->count+1);
3220 r->last = e;
3221 e->count++;
3222
3223 return e;
3224 }
3225
3226 /*
3227 * Returns a portion of a reference sequence from start to end inclusive.
3228 * The returned pointer is owned by either the cram_file fd or by the
3229 * internal refs_t structure and should not be freed by the caller.
3230 *
3231 * The difference is whether or not this refs_t is in use by just the one
3232 * cram_fd or by multiples, or whether we have multiple threads accessing
3233 * references. In either case fd->shared will be true and we start using
3234 * reference counting to track the number of users of a specific reference
3235 * sequence.
3236 *
3237 * Otherwise the ref seq returned is allocated as part of cram_fd itself
3238 * and will be freed up on the next call to cram_get_ref or cram_close.
3239 *
3240 * To return the entire reference sequence, specify start as 1 and end
3241 * as 0.
3242 *
3243 * To cease using a reference, call cram_ref_decr().
3244 *
3245 * Returns reference on success,
3246 * NULL on failure
3247 */
cram_get_ref(cram_fd * fd,int id,int start,int end)3248 char *cram_get_ref(cram_fd *fd, int id, int start, int end) {
3249 ref_entry *r;
3250 char *seq;
3251 int ostart = start;
3252
3253 if (id == -1)
3254 return NULL;
3255
3256 /* FIXME: axiomatic query of r->seq being true?
3257 * Or shortcut for unsorted data where we load once and never free?
3258 */
3259
3260 //fd->shared_ref = 1; // hard code for now to simplify things
3261
3262 if (fd->ref_lock) pthread_mutex_lock(fd->ref_lock);
3263
3264 RP("%d cram_get_ref on fd %p, id %d, range %d..%d\n", gettid(), fd, id, start, end);
3265
3266 /*
3267 * Unsorted data implies we want to fetch an entire reference at a time.
3268 * We just deal with this at the moment by claiming we're sharing
3269 * references instead, which has the same requirement.
3270 */
3271 if (fd->unsorted)
3272 fd->shared_ref = 1;
3273
3274
3275 /* Sanity checking: does this ID exist? */
3276 if (id >= fd->refs->nref) {
3277 fprintf(stderr, "No reference found for id %d\n", id);
3278 if (fd->ref_lock) pthread_mutex_unlock(fd->ref_lock);
3279 return NULL;
3280 }
3281
3282 if (!fd->refs || !fd->refs->ref_id[id]) {
3283 fprintf(stderr, "No reference found for id %d\n", id);
3284 if (fd->ref_lock) pthread_mutex_unlock(fd->ref_lock);
3285 return NULL;
3286 }
3287
3288 if (!(r = fd->refs->ref_id[id])) {
3289 fprintf(stderr, "No reference found for id %d\n", id);
3290 if (fd->ref_lock) pthread_mutex_unlock(fd->ref_lock);
3291 return NULL;
3292 }
3293
3294
3295 /*
3296 * It has an entry, but may not have been populated yet.
3297 * Any manually loaded .fai files have their lengths known.
3298 * A ref entry computed from @SQ lines (M5 or UR field) will have
3299 * r->length == 0 unless it's been loaded once and verified that we have
3300 * an on-disk filename for it.
3301 *
3302 * 19 Sep 2013: Moved the lock here as the cram_populate_ref code calls
3303 * open_path_mfile and libcurl, which isn't multi-thread safe unless I
3304 * rewrite my code to have one curl handle per thread.
3305 */
3306 pthread_mutex_lock(&fd->refs->lock);
3307 if (r->length == 0) {
3308 if (cram_populate_ref(fd, id, r) == -1) {
3309 fprintf(stderr, "Failed to populate reference for id %d\n", id);
3310 pthread_mutex_unlock(&fd->refs->lock);
3311 if (fd->ref_lock) pthread_mutex_unlock(fd->ref_lock);
3312 return NULL;
3313 }
3314 r = fd->refs->ref_id[id];
3315 if (fd->unsorted)
3316 cram_ref_incr_locked(fd->refs, id);
3317 }
3318
3319
3320 /*
3321 * We now know that we the filename containing the reference, so check
3322 * for limits. If it's over half the reference we'll load all of it in
3323 * memory as this will speed up subsequent calls.
3324 */
3325 if (end < 1)
3326 end = r->length;
3327 if (end >= r->length)
3328 end = r->length;
3329 if (start < 1)
3330 return NULL;
3331
3332 if (end - start >= 0.5*r->length || fd->shared_ref) {
3333 start = 1;
3334 end = r->length;
3335 }
3336
3337 /*
3338 * Maybe we have it cached already? If so use it.
3339 *
3340 * Alternatively if we don't have the sequence but we're sharing
3341 * references and/or are asking for the entire length of it, then
3342 * load the full reference into the refs structure and return
3343 * a pointer to that one instead.
3344 */
3345 if (fd->shared_ref || r->seq || (start == 1 && end == r->length)) {
3346 char *cp;
3347
3348 if (id >= 0) {
3349 if (r->seq) {
3350 cram_ref_incr_locked(fd->refs, id);
3351 } else {
3352 ref_entry *e;
3353 if (!(e = cram_ref_load(fd->refs, id))) {
3354 pthread_mutex_unlock(&fd->refs->lock);
3355 if (fd->ref_lock) pthread_mutex_unlock(fd->ref_lock);
3356 return NULL;
3357 }
3358
3359 /* unsorted data implies cache ref indefinitely, to avoid
3360 * continually loading and unloading.
3361 */
3362 if (fd->unsorted)
3363 cram_ref_incr_locked(fd->refs, id);
3364 }
3365
3366 fd->ref = NULL; /* We never access it directly */
3367 fd->ref_start = 1;
3368 fd->ref_end = r->length;
3369 fd->ref_id = id;
3370
3371 cp = fd->refs->ref_id[id]->seq + ostart-1;
3372 } else {
3373 fd->ref = NULL;
3374 cp = NULL;
3375 }
3376
3377 RP("%d cram_get_ref returning for id %d, count %d\n", gettid(), id, (int)r->count);
3378
3379 pthread_mutex_unlock(&fd->refs->lock);
3380 if (fd->ref_lock) pthread_mutex_unlock(fd->ref_lock);
3381 return cp;
3382 }
3383
3384 /*
3385 * Otherwise we're not sharing, we don't have a copy of it already and
3386 * we're only asking for a small portion of it.
3387 *
3388 * In this case load up just that segment ourselves, freeing any old
3389 * small segments in the process.
3390 */
3391
3392 /* Unmapped ref ID */
3393 if (id < 0) {
3394 if (fd->ref_free) {
3395 free(fd->ref_free);
3396 fd->ref_free = NULL;
3397 }
3398 fd->ref = NULL;
3399 fd->ref_id = id;
3400 pthread_mutex_unlock(&fd->refs->lock);
3401 if (fd->ref_lock) pthread_mutex_unlock(fd->ref_lock);
3402 return NULL;
3403 }
3404
3405 /* Open file if it's not already the current open reference */
3406 if (strcmp(fd->refs->fn, r->fn) || fd->refs->fp == NULL) {
3407 if (fd->refs->fp)
3408 bzi_close(fd->refs->fp);
3409 fd->refs->fn = r->fn;
3410 if (!(fd->refs->fp = bzi_open(fd->refs->fn, "r"))) {
3411 perror(fd->refs->fn);
3412 pthread_mutex_unlock(&fd->refs->lock);
3413 if (fd->ref_lock) pthread_mutex_unlock(fd->ref_lock);
3414 return NULL;
3415 }
3416 }
3417
3418 if (!(fd->ref = load_ref_portion(fd->refs->fp, r, start, end))) {
3419 pthread_mutex_unlock(&fd->refs->lock);
3420 if (fd->ref_lock) pthread_mutex_unlock(fd->ref_lock);
3421 return NULL;
3422 }
3423
3424 if (fd->ref_free)
3425 free(fd->ref_free);
3426
3427 fd->ref_id = id;
3428 fd->ref_start = start;
3429 fd->ref_end = end;
3430 fd->ref_free = fd->ref;
3431 seq = fd->ref;
3432
3433 pthread_mutex_unlock(&fd->refs->lock);
3434 if (fd->ref_lock) pthread_mutex_unlock(fd->ref_lock);
3435
3436 return seq + ostart - start;
3437 }
3438
3439 /*
3440 * If fd has been opened for reading, it may be permitted to specify 'fn'
3441 * as NULL and let the code auto-detect the reference by parsing the
3442 * SAM header @SQ lines.
3443 */
cram_load_reference(cram_fd * fd,char * fn)3444 int cram_load_reference(cram_fd *fd, char *fn) {
3445 int ret = 0;
3446
3447 if (fn) {
3448 fd->refs = refs_load_fai(fd->refs, fn,
3449 !(fd->embed_ref && fd->mode == 'r'));
3450 fn = fd->refs ? fd->refs->fn : NULL;
3451 if (!fn)
3452 ret = -1;
3453 sanitise_SQ_lines(fd);
3454 }
3455 fd->ref_fn = fn;
3456
3457 if (!fd->refs && fd->header) {
3458 if (!(fd->refs = refs_create()))
3459 return -1;
3460 if (-1 == refs_from_header(fd->refs, fd, fd->header))
3461 return -1;
3462 }
3463
3464 if (-1 == refs2id(fd->refs, fd->header))
3465 return -1;
3466
3467 return ret;
3468 }
3469
3470 /* ----------------------------------------------------------------------
3471 * Containers
3472 */
3473
3474 /*
3475 * Creates a new container, specifying the maximum number of slices
3476 * and records permitted.
3477 *
3478 * Returns cram_container ptr on success
3479 * NULL on failure
3480 */
cram_new_container(int nrec,int nslice)3481 cram_container *cram_new_container(int nrec, int nslice) {
3482 cram_container *c = calloc(1, sizeof(*c));
3483 enum cram_DS_ID id;
3484
3485 if (!c)
3486 return NULL;
3487
3488 c->curr_ref = -2;
3489
3490 c->max_c_rec = nrec * nslice;
3491 c->curr_c_rec = 0;
3492
3493 c->max_rec = nrec;
3494 c->record_counter = 0;
3495 c->num_bases = 0;
3496 c->s_num_bases = 0;
3497
3498 c->max_slice = nslice;
3499 c->curr_slice = 0;
3500
3501 c->pos_sorted = 1;
3502 c->max_apos = 0;
3503 c->multi_seq = 0;
3504
3505 c->bams = NULL;
3506
3507 if (!(c->slices = (cram_slice **)calloc(nslice, sizeof(cram_slice *))))
3508 goto err;
3509 c->slice = NULL;
3510
3511 if (!(c->comp_hdr = cram_new_compression_header()))
3512 goto err;
3513 c->comp_hdr_block = NULL;
3514
3515 for (id = DS_RN; id < DS_TN; id++)
3516 if (!(c->stats[id] = cram_stats_create())) goto err;
3517
3518 //c->aux_B_stats = cram_stats_create();
3519
3520 if (!(c->tags_used = HashTableCreate(16, HASH_DYNAMIC_SIZE)))
3521 goto err;
3522 c->refs_used = 0;
3523
3524 c->last_name = "";
3525
3526 c->crc32 = 0;
3527
3528 return c;
3529
3530 err:
3531 if (c) {
3532 if (c->slices)
3533 free(c->slices);
3534 free(c);
3535 }
3536 return NULL;
3537 }
3538
cram_free_container(cram_container * c)3539 void cram_free_container(cram_container *c) {
3540 enum cram_DS_ID id;
3541 int i;
3542
3543 if (!c)
3544 return;
3545
3546 if (c->refs_used)
3547 free(c->refs_used);
3548
3549 if (c->landmark)
3550 free(c->landmark);
3551
3552 if (c->comp_hdr)
3553 cram_free_compression_header(c->comp_hdr);
3554
3555 if (c->comp_hdr_block)
3556 cram_free_block(c->comp_hdr_block);
3557
3558 if (c->slices) {
3559 for (i = 0; i < c->max_slice; i++)
3560 if (c->slices[i])
3561 cram_free_slice(c->slices[i]);
3562 free(c->slices);
3563 }
3564
3565 for (id = DS_RN; id < DS_TN; id++)
3566 if (c->stats[id]) cram_stats_free(c->stats[id]);
3567
3568 //if (c->aux_B_stats) cram_stats_free(c->aux_B_stats);
3569
3570 if (c->tags_used) {
3571 HashItem *hi;
3572 HashIter *iter = HashTableIterCreate();
3573
3574 while (iter && (hi = HashTableIterNext(c->tags_used, iter))) {
3575 cram_tag_map *tm = (cram_tag_map *)hi->data.p;
3576 cram_codec *c = tm->codec;
3577
3578 if (c && c->free) c->free(c);
3579 free(tm);
3580 }
3581
3582 HashTableDestroy(c->tags_used, 0);
3583 HashTableIterDestroy(iter);
3584 }
3585
3586 free(c);
3587 }
3588
3589 /*
3590 * Reads a container header.
3591 *
3592 * Returns cram_container on success
3593 * NULL on failure or no container left (fd->err == 0).
3594 */
cram_read_container(cram_fd * fd)3595 cram_container *cram_read_container(cram_fd *fd) {
3596 cram_container c2, *c;
3597 int i, s;
3598 size_t rd = 0;
3599 uint32_t crc = 0;
3600
3601 fd->err = 0;
3602 fd->eof = 0;
3603
3604 memset(&c2, 0, sizeof(c2));
3605 if (IS_CRAM_1_VERS(fd)) {
3606 if ((s = itf8_decode_crc(fd, &c2.length, &crc)) == -1) {
3607 fd->eof = 1;
3608 return NULL;
3609 } else {
3610 rd+=s;
3611 }
3612 } else {
3613 uint32_t len;
3614 if ((s = int32_decode(fd, &c2.length)) == -1) {
3615 if (CRAM_MAJOR_VERS(fd->version) == 2 &&
3616 CRAM_MINOR_VERS(fd->version) == 0)
3617 fd->eof = 1; // EOF blocks arrived in v2.1
3618 else
3619 fd->eof = fd->empty_container ? 1 : 2;
3620 return NULL;
3621 } else {
3622 rd+=s;
3623 }
3624 len = le_int4(c2.length);
3625 crc = iolib_crc32(0L, (unsigned char *)&len, 4);
3626 }
3627 if ((s = itf8_decode_crc(fd, &c2.ref_seq_id, &crc)) == -1) return NULL; else rd+=s;
3628 if (CRAM_MAJOR_VERS(fd->version) >= 4) {
3629 int64_t i64;
3630 if ((s = ltf8_decode_crc(fd, &i64, &crc))== -1) return NULL; else rd+=s;
3631 c2.ref_seq_start = i64;
3632 if ((s = ltf8_decode_crc(fd, &i64, &crc)) == -1) return NULL; else rd+=s;
3633 c2.ref_seq_span = i64;
3634 } else {
3635 int32_t i32;
3636 if ((s = itf8_decode_crc(fd, &i32, &crc))== -1) return NULL; else rd+=s;
3637 c2.ref_seq_start = i32;
3638 if ((s = itf8_decode_crc(fd, &i32, &crc)) == -1) return NULL; else rd+=s;
3639 c2.ref_seq_span = i32;
3640 }
3641 if ((s = itf8_decode_crc(fd, &c2.num_records, &crc)) == -1) return NULL; else rd+=s;
3642
3643 if (IS_CRAM_1_VERS(fd)) {
3644 c2.record_counter = 0;
3645 c2.num_bases = 0;
3646 } else {
3647 if (IS_CRAM_3_VERS(fd)) {
3648 if ((s = ltf8_decode_crc(fd, &c2.record_counter, &crc)) == -1)
3649 return NULL;
3650 else
3651 rd += s;
3652 } else {
3653 int32_t i32;
3654 if ((s = itf8_decode_crc(fd, &i32, &crc)) == -1)
3655 return NULL;
3656 else
3657 rd += s;
3658 c2.record_counter = i32;
3659 }
3660
3661 if ((s = ltf8_decode_crc(fd, &c2.num_bases, &crc))== -1)
3662 return NULL;
3663 else
3664 rd += s;
3665 }
3666 if ((s = itf8_decode_crc(fd, &c2.num_blocks, &crc)) == -1) return NULL; else rd+=s;
3667 if ((s = itf8_decode_crc(fd, &c2.num_landmarks, &crc))== -1) return NULL; else rd+=s;
3668
3669 if (!(c = calloc(1, sizeof(*c))))
3670 return NULL;
3671
3672 *c = c2;
3673
3674 if (!(c->landmark = malloc(c->num_landmarks * sizeof(int32_t))) &&
3675 c->num_landmarks) {
3676 fd->err = errno;
3677 cram_free_container(c);
3678 return NULL;
3679 }
3680 for (i = 0; i < c->num_landmarks; i++) {
3681 if ((s = itf8_decode_crc(fd, &c->landmark[i], &crc)) == -1) {
3682 cram_free_container(c);
3683 return NULL;
3684 } else {
3685 rd += s;
3686 }
3687 }
3688
3689 if (IS_CRAM_3_VERS(fd)) {
3690 if (-1 == int32_decode(fd, (int32_t *)&c->crc32))
3691 return NULL;
3692 else
3693 rd+=4;
3694
3695 if (!fd->ignore_md5 && crc != c->crc32) {
3696 fprintf(stderr, "Container header CRC32 failure\n");
3697 cram_free_container(c);
3698 return NULL;
3699 }
3700 }
3701
3702 c->offset = rd;
3703 c->slices = NULL;
3704 c->curr_slice = 0;
3705 c->max_slice = c->num_landmarks;
3706 c->slice_rec = 0;
3707 c->curr_rec = 0;
3708 c->max_rec = 0;
3709
3710 if (c->ref_seq_id == -2) {
3711 c->multi_seq = 1;
3712 fd->multi_seq = 1;
3713 }
3714
3715 fd->empty_container =
3716 (c->num_records == 0 &&
3717 c->ref_seq_id == -1 &&
3718 c->ref_seq_start == 0x454f46 /* EOF */) ? 1 : 0;
3719
3720 return c;
3721 }
3722
3723 /*
3724 * Writes a container structure.
3725 *
3726 * Returns 0 on success
3727 * -1 on failure
3728 */
cram_write_container(cram_fd * fd,cram_container * c)3729 int cram_write_container(cram_fd *fd, cram_container *c) {
3730 char buf_a[1024], *buf = buf_a, *cp;
3731 int i;
3732
3733 // worse case sizes given 32-bit & 64-bit quantities.
3734 if (61 + c->num_landmarks * 10 >= 1024)
3735 buf = malloc(61 + c->num_landmarks * 10);
3736 cp = buf;
3737
3738 *(int32_t *)cp = le_int4(c->length);
3739 cp += 4;
3740
3741 if (c->multi_seq) {
3742 cp += itf8_put(cp, -2);
3743 cp += itf8_put(cp, 0);
3744 cp += itf8_put(cp, 0);
3745 } else {
3746 cp += itf8_put(cp, c->ref_seq_id);
3747 if (CRAM_MAJOR_VERS(fd->version) >= 4) {
3748 cp += ltf8_put(cp, c->ref_seq_start);
3749 cp += ltf8_put(cp, c->ref_seq_span);
3750 } else {
3751 cp += itf8_put(cp, c->ref_seq_start);
3752 cp += itf8_put(cp, c->ref_seq_span);
3753 }
3754 }
3755 cp += itf8_put(cp, c->num_records);
3756 if (IS_CRAM_3_VERS(fd))
3757 cp += ltf8_put(cp, c->record_counter);
3758 else
3759 cp += itf8_put(cp, c->record_counter);
3760 cp += ltf8_put(cp, c->num_bases);
3761 cp += itf8_put(cp, c->num_blocks);
3762 cp += itf8_put(cp, c->num_landmarks);
3763 for (i = 0; i < c->num_landmarks; i++)
3764 cp += itf8_put(cp, c->landmark[i]);
3765
3766 if (IS_CRAM_3_VERS(fd)) {
3767 c->crc32 = iolib_crc32(0L, (uc *)buf, cp-buf);
3768 cp[0] = c->crc32 & 0xff;
3769 cp[1] = (c->crc32 >> 8) & 0xff;
3770 cp[2] = (c->crc32 >> 16) & 0xff;
3771 cp[3] = (c->crc32 >> 24) & 0xff;
3772 cp += 4;
3773 }
3774
3775 if (cp-buf != CRAM_IO_WRITE(buf, 1, cp-buf, fd)) {
3776 if (buf != buf_a)
3777 free(buf);
3778 return -1;
3779 }
3780
3781 if (buf != buf_a)
3782 free(buf);
3783
3784 return 0;
3785 }
3786
3787 // common component shared by cram_flush_container{,_mt}
cram_flush_container2(cram_fd * fd,cram_container * c)3788 static int cram_flush_container2(cram_fd *fd, cram_container *c) {
3789 int i, j;
3790
3791 if (c->curr_slice > 0 && !c->slices)
3792 return -1;
3793
3794 //fprintf(stderr, "Writing container %d, sum %u\n", c->record_counter, sum);
3795
3796 /* Write the container struct itself */
3797 if (0 != cram_write_container(fd, c))
3798 return -1;
3799
3800 /* And the compression header */
3801 if (0 != cram_write_block(fd, c->comp_hdr_block))
3802 return -1;
3803
3804 /* Followed by the slice blocks */
3805 for (i = 0; i < c->curr_slice; i++) {
3806 cram_slice *s = c->slices[i];
3807
3808 if (0 != cram_write_block(fd, s->hdr_block))
3809 return -1;
3810
3811 for (j = 0; j < s->hdr->num_blocks; j++) {
3812 if (0 != cram_write_block(fd, s->block[j]))
3813 return -1;
3814 }
3815 }
3816
3817 return CRAM_IO_FLUSH(fd) == 0 ? 0 : -1;
3818 }
3819
3820 /*
3821 * Flushes a completely or partially full container to disk, writing
3822 * container structure, header and blocks. This also calls the encoder
3823 * functions.
3824 *
3825 * Returns 0 on success
3826 * -1 on failure
3827 */
cram_flush_container(cram_fd * fd,cram_container * c)3828 int cram_flush_container(cram_fd *fd, cram_container *c) {
3829 /* Encode the container blocks and generate compression header */
3830 if (0 != cram_encode_container(fd, c))
3831 return -1;
3832
3833 return cram_flush_container2(fd, c);
3834 }
3835
3836 typedef struct {
3837 cram_fd *fd;
3838 cram_container *c;
3839 } cram_job;
3840
cram_flush_thread(void * arg)3841 void *cram_flush_thread(void *arg) {
3842 cram_job *j = (cram_job *)arg;
3843
3844 /* Encode the container blocks and generate compression header */
3845 if (0 != cram_encode_container(j->fd, j->c)) {
3846 fprintf(stderr, "cram_encode_container failed\n");
3847 return NULL;
3848 }
3849
3850 return arg;
3851 }
3852
cram_flush_result(cram_fd * fd)3853 static int cram_flush_result(cram_fd *fd) {
3854 int i, ret = 0;
3855 t_pool_result *r;
3856 cram_container *lc = NULL;
3857
3858 // NB: we can have one result per slice, not per container,
3859 // so we need to free the container only after all slices
3860 // within it have been freed. (Automatic via reference counting.)
3861 while ((r = t_pool_next_result(fd->rqueue))) {
3862 cram_job *j = (cram_job *)r->data;
3863 cram_container *c;
3864
3865 if (!j) {
3866 t_pool_delete_result(r, 0);
3867 return -1;
3868 }
3869
3870 fd = j->fd;
3871 c = j->c;
3872
3873 if (fd->mode == 'w')
3874 if (0 != cram_flush_container2(fd, c))
3875 return -1;
3876
3877 /* Free the container */
3878 for (i = 0; i < c->max_slice; i++) {
3879 if (c->slices && c->slices[i]) {
3880 cram_free_slice(c->slices[i]);
3881 c->slices[i] = NULL;
3882 }
3883 }
3884
3885 c->slice = NULL;
3886 c->curr_slice = 0;
3887
3888 // Our jobs will be in order, so we free the last
3889 // container when our job has switched to a new one.
3890 if (c != lc) {
3891 if (lc) {
3892 cram_free_container(lc);
3893 if (fd->ctr == lc)
3894 fd->ctr = NULL;
3895 if (fd->ctr_mt == lc)
3896 fd->ctr_mt = NULL;
3897 }
3898 lc = c;
3899 }
3900
3901 ret |= CRAM_IO_FLUSH(fd) == 0 ? 0 : -1;
3902
3903 t_pool_delete_result(r, 1);
3904 }
3905 if (lc) {
3906 cram_free_container(lc);
3907 if (fd->ctr == lc)
3908 fd->ctr = NULL;
3909 if (fd->ctr_mt == lc)
3910 fd->ctr_mt = NULL;
3911 }
3912
3913 return ret;
3914 }
3915
cram_flush_container_mt(cram_fd * fd,cram_container * c)3916 int cram_flush_container_mt(cram_fd *fd, cram_container *c) {
3917 cram_job *j;
3918
3919 if (!fd->pool)
3920 return cram_flush_container(fd, c);
3921
3922 if (!(j = malloc(sizeof(*j))))
3923 return -1;
3924 j->fd = fd;
3925 j->c = c;
3926
3927 t_pool_dispatch(fd->pool, fd->rqueue, cram_flush_thread, j);
3928
3929 return cram_flush_result(fd);
3930 }
3931
3932 /* ----------------------------------------------------------------------
3933 * Compression headers; the first part of the container
3934 */
3935
3936 /*
3937 * Creates a new blank container compression header
3938 *
3939 * Returns header ptr on success
3940 * NULL on failure
3941 */
cram_new_compression_header(void)3942 cram_block_compression_hdr *cram_new_compression_header(void) {
3943 cram_block_compression_hdr *hdr = calloc(1, sizeof(*hdr));
3944 if (!hdr)
3945 return NULL;
3946
3947 if (!(hdr->TD_blk = cram_new_block(CORE, 0))) {
3948 free(hdr);
3949 return NULL;
3950 }
3951
3952 if (!(hdr->TD = HashTableCreate(16, HASH_DYNAMIC_SIZE))) {
3953 cram_free_block(hdr->TD_blk);
3954 free(hdr);
3955 return NULL;
3956 }
3957
3958 return hdr;
3959 }
3960
cram_free_compression_header(cram_block_compression_hdr * hdr)3961 void cram_free_compression_header(cram_block_compression_hdr *hdr) {
3962 int i;
3963
3964 if (hdr->landmark)
3965 free(hdr->landmark);
3966
3967 if (hdr->preservation_map)
3968 HashTableDestroy(hdr->preservation_map, 0);
3969
3970 for (i = 0; i < CRAM_MAP_HASH; i++) {
3971 cram_map *m, *m2;
3972 for (m = hdr->rec_encoding_map[i]; m; m = m2) {
3973 m2 = m->next;
3974 if (m->codec)
3975 m->codec->free(m->codec);
3976 free(m);
3977 }
3978 }
3979
3980 for (i = 0; i < CRAM_MAP_HASH; i++) {
3981 cram_map *m, *m2;
3982 for (m = hdr->tag_encoding_map[i]; m; m = m2) {
3983 m2 = m->next;
3984 if (m->codec)
3985 m->codec->free(m->codec);
3986 free(m);
3987 }
3988 }
3989
3990 for (i = 0; i < DS_END; i++) {
3991 if (hdr->codecs[i])
3992 hdr->codecs[i]->free(hdr->codecs[i]);
3993 }
3994
3995 if (hdr->TL)
3996 free(hdr->TL);
3997 if (hdr->TD_blk)
3998 cram_free_block(hdr->TD_blk);
3999 if (hdr->TD)
4000 HashTableDestroy(hdr->TD, 0);
4001
4002 free(hdr);
4003 }
4004
4005
4006 /* ----------------------------------------------------------------------
4007 * Slices and slice headers
4008 */
4009
cram_free_slice_header(cram_block_slice_hdr * hdr)4010 void cram_free_slice_header(cram_block_slice_hdr *hdr) {
4011 if (!hdr)
4012 return;
4013
4014 if (hdr->block_content_ids)
4015 free(hdr->block_content_ids);
4016
4017 if (hdr->tags)
4018 HashTableDestroy(hdr->tags, 0);
4019
4020 free(hdr);
4021
4022 return;
4023 }
4024
cram_free_slice(cram_slice * s)4025 void cram_free_slice(cram_slice *s) {
4026 if (!s)
4027 return;
4028
4029 if (s->bl)
4030 free(s->bl);
4031
4032 if (s->hdr_block)
4033 cram_free_block(s->hdr_block);
4034
4035 if (s->block) {
4036 int i;
4037
4038 if (s->hdr) {
4039 for (i = 0; i < s->hdr->num_blocks; i++) {
4040 cram_free_block(s->block[i]);
4041 }
4042 }
4043 free(s->block);
4044 }
4045
4046 if (s->block_by_id)
4047 free(s->block_by_id);
4048
4049 if (s->hdr)
4050 cram_free_slice_header(s->hdr);
4051
4052 if (s->seqs_blk)
4053 cram_free_block(s->seqs_blk);
4054
4055 if (s->qual_blk)
4056 cram_free_block(s->qual_blk);
4057
4058 if (s->name_blk)
4059 cram_free_block(s->name_blk);
4060
4061 if (s->aux_blk)
4062 cram_free_block(s->aux_blk);
4063
4064 if (s->base_blk)
4065 cram_free_block(s->base_blk);
4066
4067 if (s->soft_blk)
4068 cram_free_block(s->soft_blk);
4069
4070 #ifdef TN_external
4071 if (s->tn_blk)
4072 cram_free_block(s->tn_blk);
4073 #endif
4074
4075 if (s->cigar)
4076 free(s->cigar);
4077
4078 if (s->crecs)
4079 free(s->crecs);
4080
4081 if (s->features)
4082 free(s->features);
4083
4084 #ifndef TN_external
4085 if (s->TN)
4086 free(s->TN);
4087 #endif
4088
4089 if (s->pair[0])
4090 HashTableDestroy(s->pair[0], 0);
4091 if (s->pair[1])
4092 HashTableDestroy(s->pair[1], 0);
4093
4094 if (s->aux_block)
4095 free(s->aux_block);
4096
4097 free(s);
4098 }
4099
4100 /*
4101 * Creates a new empty slice in memory, for subsequent writing to
4102 * disk.
4103 *
4104 * Returns cram_slice ptr on success
4105 * NULL on failure
4106 */
cram_new_slice(enum cram_content_type type,int nrecs)4107 cram_slice *cram_new_slice(enum cram_content_type type, int nrecs) {
4108 cram_slice *s = calloc(1, sizeof(*s));
4109 if (!s)
4110 return NULL;
4111
4112 if (!(s->hdr = (cram_block_slice_hdr *)calloc(1, sizeof(*s->hdr))))
4113 goto err;
4114 s->hdr->content_type = type;
4115
4116 s->hdr_block = NULL;
4117 s->block = NULL;
4118 s->block_by_id = NULL;
4119 s->last_apos = 0;
4120 if (!(s->crecs = malloc(nrecs * sizeof(cram_record)))) goto err;
4121 s->cigar = NULL;
4122 s->cigar_alloc = 0;
4123 s->ncigar = 0;
4124
4125 if (!(s->seqs_blk = cram_new_block(EXTERNAL, 0))) goto err;
4126 if (!(s->qual_blk = cram_new_block(EXTERNAL, DS_QS))) goto err;
4127 if (!(s->name_blk = cram_new_block(EXTERNAL, DS_RN))) goto err;
4128 if (!(s->aux_blk = cram_new_block(EXTERNAL, DS_aux))) goto err;
4129 if (!(s->base_blk = cram_new_block(EXTERNAL, DS_IN))) goto err;
4130 if (!(s->soft_blk = cram_new_block(EXTERNAL, DS_SC))) goto err;
4131
4132 s->features = NULL;
4133 s->nfeatures = s->afeatures = 0;
4134
4135 #ifndef TN_external
4136 s->TN = NULL;
4137 s->nTN = s->aTN = 0;
4138 #endif
4139
4140 // Volatile keys as we do realloc in dstring
4141 if (!(s->pair[0] = HashTableCreate(10000, HASH_DYNAMIC_SIZE))) goto err;
4142 if (!(s->pair[1] = HashTableCreate(10000, HASH_DYNAMIC_SIZE))) goto err;
4143
4144 #ifdef BA_external
4145 s->BA_len = 0;
4146 #endif
4147
4148 //memset(&s->blocks[0], 0, 1024*sizeof(s->blocks[0]));
4149 //if (!(s->blocks[ID("QS")] = cram_new_block(EXTERNAL, ID("QS")))) goto err;
4150 //if (!(s->blocks[ID("RN")] = cram_new_block(EXTERNAL, ID("RN")))) goto err;
4151 //if (!(s->blocks[ID("IN")] = cram_new_block(EXTERNAL, ID("IN")))) goto err;
4152 //if (!(s->blocks[ID("SC")] = cram_new_block(EXTERNAL, ID("SC")))) goto err;
4153
4154 s->BD_crc = 0;
4155 s->SD_crc = 0;
4156
4157 return s;
4158
4159 err:
4160 if (s)
4161 cram_free_slice(s);
4162
4163 return NULL;
4164 }
4165
4166 /*
4167 * Loads an entire slice.
4168 * FIXME: In 1.0 the native unit of slices within CRAM is broken
4169 * as slices contain references to objects in other slices.
4170 * To work around this while keeping the slice oriented outer loop
4171 * we read all slices and stitch them together into a fake large
4172 * slice instead.
4173 *
4174 * Returns cram_slice ptr on success
4175 * NULL on failure
4176 */
cram_read_slice(cram_fd * fd)4177 cram_slice *cram_read_slice(cram_fd *fd) {
4178 cram_block *b = cram_read_block(fd);
4179 cram_slice *s = calloc(1, sizeof(*s));
4180 int i, n, max_id, min_id;
4181
4182 if (!b || !s)
4183 goto err;
4184
4185 s->hdr_block = b;
4186 switch (b->content_type) {
4187 case MAPPED_SLICE:
4188 case UNMAPPED_SLICE:
4189 if (!(s->hdr = cram_decode_slice_header(fd, b)))
4190 goto err;
4191 break;
4192
4193 default:
4194 fprintf(stderr, "Unexpected block of type %s\n",
4195 cram_content_type2str(b->content_type));
4196 goto err;
4197 }
4198
4199 if (s->hdr->num_blocks < 1) {
4200 fprintf(stderr, "Slice does not include any data blocks.\n");
4201 goto err;
4202 }
4203
4204 s->block = calloc(n = s->hdr->num_blocks, sizeof(*s->block));
4205 if (!s->block)
4206 goto err;
4207
4208 for (max_id = i = 0, min_id = INT_MAX; i < n; i++) {
4209 if (!(s->block[i] = cram_read_block(fd)))
4210 goto err;
4211
4212 if (s->block[i]->content_type == EXTERNAL) {
4213 if (max_id < s->block[i]->content_id)
4214 max_id = s->block[i]->content_id;
4215 if (min_id > s->block[i]->content_id)
4216 min_id = s->block[i]->content_id;
4217 }
4218 }
4219 if (!(s->block_by_id = calloc(512, sizeof(s->block[0]))))
4220 goto err;
4221
4222 for (i = 0; i < n; i++) {
4223 if (s->block[i]->content_type != EXTERNAL)
4224 continue;
4225 int v = s->block[i]->content_id;
4226 if (v < 0 || v >= 512)
4227 v = 256 + (v > 0 ? v % 251 : (-v) % 251);
4228 s->block_by_id[v] = s->block[i];
4229 }
4230
4231 /* Initialise encoding/decoding tables */
4232 s->cigar = NULL;
4233 s->cigar_alloc = 0;
4234 s->ncigar = 0;
4235
4236 if (!(s->seqs_blk = cram_new_block(EXTERNAL, 0))) goto err;
4237 if (!(s->qual_blk = cram_new_block(EXTERNAL, DS_QS))) goto err;
4238 if (!(s->name_blk = cram_new_block(EXTERNAL, DS_RN))) goto err;
4239 if (!(s->aux_blk = cram_new_block(EXTERNAL, DS_aux))) goto err;
4240 if (!(s->base_blk = cram_new_block(EXTERNAL, DS_IN))) goto err;
4241 if (!(s->soft_blk = cram_new_block(EXTERNAL, DS_SC))) goto err;
4242
4243 s->crecs = NULL;
4244
4245 s->last_apos = s->hdr->ref_seq_start;
4246 s->decode_md = fd->decode_md;
4247
4248 return s;
4249
4250 err:
4251 if (b)
4252 cram_free_block(b);
4253 if (s) {
4254 s->hdr_block = NULL;
4255 cram_free_slice(s);
4256 }
4257 return NULL;
4258 }
4259
4260
4261 /* ----------------------------------------------------------------------
4262 * CRAM file definition (header)
4263 */
4264
4265 /*
4266 * Reads a CRAM file definition structure.
4267 * Returns file_def ptr on success
4268 * NULL on failure
4269 */
cram_read_file_def(cram_fd * fd)4270 cram_file_def *cram_read_file_def(cram_fd *fd) {
4271 cram_file_def *def = malloc(sizeof(*def));
4272 if (!def)
4273 return NULL;
4274
4275 if (26 != CRAM_IO_READ(&def->magic[0], 1, 26, fd)) {
4276 free(def);
4277 return NULL;
4278 }
4279
4280 if (memcmp(def->magic, "CRAM", 4) != 0) {
4281 free(def);
4282 return NULL;
4283 }
4284
4285 if (def->major_version > 4) {
4286 fprintf(stderr, "CRAM version number mismatch\n"
4287 "Expected 1.x, 2.x, 3.x or 4.x, got %d.%d\n",
4288 def->major_version, def->minor_version);
4289 free(def);
4290 return NULL;
4291 }
4292
4293 fd->first_container += 26;
4294 fd->last_slice = 0;
4295
4296 return def;
4297 }
4298
4299 /*
4300 * Writes a cram_file_def structure to cram_fd.
4301 * Returns 0 on success
4302 * -1 on failure
4303 */
cram_write_file_def(cram_fd * fd,cram_file_def * def)4304 int cram_write_file_def(cram_fd *fd, cram_file_def *def) {
4305 return (CRAM_IO_WRITE(&def->magic[0], 1, 26, fd) == 26) ? 0 : -1;
4306 }
4307
cram_free_file_def(cram_file_def * def)4308 void cram_free_file_def(cram_file_def *def) {
4309 if (def) free(def);
4310 }
4311
4312 /* ----------------------------------------------------------------------
4313 * SAM header I/O
4314 */
4315
4316
4317 /*
4318 * Reads the SAM header from the first CRAM data block.
4319 * Also performs minimal parsing to extract read-group
4320 * and sample information.
4321
4322 * Returns SAM hdr ptr on success
4323 * NULL on failure
4324 */
cram_read_SAM_hdr(cram_fd * fd)4325 SAM_hdr *cram_read_SAM_hdr(cram_fd *fd) {
4326 int32_t header_len;
4327 char *header;
4328 SAM_hdr *hdr;
4329
4330 /* 1.1 onwards stores the header in the first block of a container */
4331 if (IS_CRAM_1_VERS(fd)) {
4332 /* Length */
4333 if (-1 == int32_decode(fd, &header_len))
4334 return NULL;
4335
4336 /* Alloc and read */
4337 if (header_len < 0 || NULL == (header = malloc((size_t) header_len+1)))
4338 return NULL;
4339
4340 if (header_len != CRAM_IO_READ(header, 1, header_len, fd))
4341 return NULL;
4342 header[header_len] = '\0';
4343
4344 fd->first_container += 4 + header_len;
4345 } else {
4346 cram_container *c = cram_read_container(fd);
4347 cram_block *b;
4348 int i, len;
4349
4350 if (!c)
4351 return NULL;
4352
4353 fd->first_container += c->length + c->offset;
4354
4355 if (c->num_blocks < 1) {
4356 cram_free_container(c);
4357 return NULL;
4358 }
4359
4360 if (!(b = cram_read_block(fd))) {
4361 cram_free_container(c);
4362 return NULL;
4363 }
4364 if (cram_uncompress_block(b) != 0) {
4365 cram_free_container(c);
4366 cram_free_block(b);
4367 return NULL;
4368 }
4369
4370 len = b->comp_size + 2 + 4*IS_CRAM_3_VERS(fd) +
4371 itf8_size(b->content_id) +
4372 itf8_size(b->uncomp_size) +
4373 itf8_size(b->comp_size);
4374
4375 /* Extract header from 1st block */
4376 if (-1 == int32_get_blk(b, &header_len) ||
4377 header_len < 0 || /* Spec. says signed... why? */
4378 b->uncomp_size - 4 < header_len) {
4379 cram_free_container(c);
4380 cram_free_block(b);
4381 return NULL;
4382 }
4383 if (NULL == (header = malloc((size_t) header_len+1))) {
4384 cram_free_container(c);
4385 cram_free_block(b);
4386 return NULL;
4387 }
4388 memcpy(header, BLOCK_END(b), header_len);
4389 header[header_len] = '\0';
4390 cram_free_block(b);
4391
4392 /* Consume any remaining blocks */
4393 for (i = 1; i < c->num_blocks; i++) {
4394 if (!(b = cram_read_block(fd))) {
4395 cram_free_container(c);
4396 return NULL;
4397 }
4398 len += b->comp_size + 2 + 4*IS_CRAM_3_VERS(fd) +
4399 itf8_size(b->content_id) +
4400 itf8_size(b->uncomp_size) +
4401 itf8_size(b->comp_size);
4402 cram_free_block(b);
4403 }
4404
4405 if (c->length > 0 && len > 0 && c->length > len) {
4406 // Consume padding
4407 char *pads = malloc(c->length - len);
4408 if (!pads) {
4409 cram_free_container(c);
4410 return NULL;
4411 }
4412
4413 if (c->length - len != CRAM_IO_READ(pads, 1, c->length - len, fd)) {
4414 cram_free_container(c);
4415 return NULL;
4416 }
4417 free(pads);
4418 }
4419
4420 cram_free_container(c);
4421 }
4422
4423 /* Parse */
4424 #ifdef SAMTOOLS
4425 hdr = sam_hdr_parse_(header, header_len);
4426 #else
4427 hdr = sam_hdr_parse(header, header_len);
4428 #endif
4429 free(header);
4430
4431 return hdr;
4432 }
4433
4434 /*
4435 * Converts 'in' to a full pathname to store in out.
4436 * Out must be at least PATH_MAX bytes long.
4437 */
full_path(char * out,char * in)4438 static void full_path(char *out, char *in) {
4439 if (*in == '/') {
4440 strncpy(out, in, PATH_MAX);
4441 out[PATH_MAX-1] = 0;
4442 } else {
4443 int len;
4444
4445 // unable to get dir or out+in is too long
4446 if (!getcwd(out, PATH_MAX) ||
4447 (len = strlen(out))+1+strlen(in) >= PATH_MAX) {
4448 strncpy(out, in, PATH_MAX);
4449 out[PATH_MAX-1] = 0;
4450 return;
4451 }
4452
4453 sprintf(out+len, "/%.*s", PATH_MAX - len, in);
4454
4455 // FIXME: cope with `pwd`/../../../foo.fa ?
4456 }
4457 }
4458
4459 /*
4460 * Writes a CRAM SAM header.
4461 * Returns 0 on success
4462 * -1 on failure
4463 */
cram_write_SAM_hdr(cram_fd * fd,SAM_hdr * hdr)4464 int cram_write_SAM_hdr(cram_fd *fd, SAM_hdr *hdr) {
4465 int header_len;
4466 int blank_block = (CRAM_MAJOR_VERS(fd->version) >= 3);
4467
4468 /* Fix M5 strings */
4469 if (fd->refs && !fd->no_ref) {
4470 int i;
4471 for (i = 0; i < hdr->nref; i++) {
4472 SAM_hdr_type *ty;
4473 char *ref;
4474
4475 if (!(ty = sam_hdr_find(hdr, "SQ", "SN", hdr->ref[i].name)))
4476 return -1;
4477
4478 if (!sam_hdr_find_key(hdr, ty, "M5", NULL)) {
4479 char unsigned buf[16], buf2[33];
4480 int j, rlen;
4481 MD5_CTX md5;
4482
4483 if (!fd->refs->ref_id || !fd->refs->ref_id[i])
4484 return -1;
4485
4486 rlen = fd->refs->ref_id[i]->length;
4487 MD5_Init(&md5);
4488 ref = cram_get_ref(fd, i, 1, rlen);
4489 if (NULL == ref) return -1;
4490 rlen = fd->refs->ref_id[i]->length; /* In case it just loaded */
4491 MD5_Update(&md5, ref, rlen);
4492 MD5_Final(buf, &md5);
4493 cram_ref_decr(fd->refs, i);
4494
4495 for (j = 0; j < 16; j++) {
4496 buf2[j*2+0] = "0123456789abcdef"[buf[j]>>4];
4497 buf2[j*2+1] = "0123456789abcdef"[buf[j]&15];
4498 }
4499 buf2[32] = 0;
4500 if (sam_hdr_update(hdr, ty, "M5", buf2, NULL))
4501 return -1;
4502 }
4503
4504 if (fd->ref_fn) {
4505 char ref_fn[PATH_MAX];
4506 full_path(ref_fn, fd->ref_fn);
4507 if (sam_hdr_update(hdr, ty, "UR", ref_fn, NULL))
4508 return -1;
4509 }
4510 }
4511 }
4512
4513 if (sam_hdr_rebuild(hdr))
4514 return -1;
4515
4516 /* Length */
4517 header_len = sam_hdr_length(hdr);
4518
4519 /* Create block(s) inside a container */
4520 cram_block *b = cram_new_block(FILE_HEADER, 0);
4521 cram_container *c = cram_new_container(0, 0);
4522 int padded_length;
4523 char *pads;
4524
4525 if (!b || !c) {
4526 if (b) cram_free_block(b);
4527 if (c) cram_free_container(c);
4528 return -1;
4529 }
4530
4531 int32_put(b, header_len);
4532 BLOCK_APPEND(b, sam_hdr_str(hdr), header_len);
4533 BLOCK_UPLEN(b);
4534
4535 // Compress header block if V3.0 and above
4536 if (CRAM_MAJOR_VERS(fd->version) >= 3 && fd->level > 0) {
4537 int method = 1<<GZIP;
4538 if (fd->use_bz2)
4539 method |= 1<<BZIP2;
4540 if (fd->use_bsc)
4541 method |= 1<<BSC;
4542 if (fd->use_fqz)
4543 method |= 1<<FQZ;
4544 if (fd->use_lzma)
4545 method |= 1<<LZMA;
4546 cram_compress_block(fd, NULL, b, NULL, method, fd->level);
4547 }
4548
4549 if (blank_block) {
4550 c->length = b->comp_size + 2 + 4*IS_CRAM_3_VERS(fd) +
4551 itf8_size(b->content_id) +
4552 itf8_size(b->uncomp_size) +
4553 itf8_size(b->comp_size);
4554
4555 c->num_blocks = 2;
4556 c->num_landmarks = 2;
4557 if (!(c->landmark = malloc(2*sizeof(*c->landmark)))) {
4558 cram_free_block(b);
4559 cram_free_container(c);
4560 return -1;
4561 }
4562 c->landmark[0] = 0;
4563 c->landmark[1] = c->length;
4564
4565 // Plus extra storage for uncompressed secondary blank block
4566 padded_length = MIN(c->length*.5, 10000);
4567 c->length += padded_length + 2 + 4*IS_CRAM_3_VERS(fd) +
4568 itf8_size(b->content_id) +
4569 itf8_size(padded_length)*2;
4570 } else {
4571 // Pad the block instead.
4572 c->num_blocks = 1;
4573 c->num_landmarks = 1;
4574 if (!(c->landmark = malloc(sizeof(*c->landmark))))
4575 return -1;
4576 c->landmark[0] = 0;
4577
4578 padded_length = MAX(c->length*1.5, 10000) - c->length;
4579
4580 c->length = b->comp_size + padded_length +
4581 2 + 4*IS_CRAM_3_VERS(fd) +
4582 itf8_size(b->content_id) +
4583 itf8_size(b->uncomp_size) +
4584 itf8_size(b->comp_size);
4585
4586 if (NULL == (pads = calloc(1, padded_length))) {
4587 cram_free_block(b);
4588 cram_free_container(c);
4589 return -1;
4590 }
4591 BLOCK_APPEND(b, pads, padded_length);
4592 BLOCK_UPLEN(b);
4593 free(pads);
4594 }
4595
4596 if (-1 == cram_write_container(fd, c)) {
4597 cram_free_block(b);
4598 cram_free_container(c);
4599 return -1;
4600 }
4601
4602 if (-1 == cram_write_block(fd, b)) {
4603 cram_free_block(b);
4604 cram_free_container(c);
4605 return -1;
4606 }
4607
4608 if (blank_block) {
4609 BLOCK_RESIZE(b, padded_length);
4610 memset(BLOCK_DATA(b), 0, padded_length);
4611 BLOCK_SIZE(b) = padded_length;
4612 BLOCK_UPLEN(b);
4613 b->method = RAW;
4614 b->crc32 = 0;
4615 if (-1 == cram_write_block(fd, b)) {
4616 cram_free_block(b);
4617 cram_free_container(c);
4618 return -1;
4619 }
4620 }
4621
4622 cram_free_block(b);
4623 cram_free_container(c);
4624
4625 if (-1 == refs_from_header(fd->refs, fd, fd->header))
4626 return -1;
4627 if (-1 == refs2id(fd->refs, fd->header))
4628 return -1;
4629
4630 CRAM_IO_FLUSH(fd);
4631
4632 RP("=== Finishing saving header ===\n");
4633
4634 return 0;
4635 }
4636
4637 /* ----------------------------------------------------------------------
4638 * The top-level cram opening, closing and option handling
4639 */
4640
4641 /*
4642 * Initialises the lookup tables. These could be global statics, but they're
4643 * clumsy to setup in a multi-threaded environment unless we generate
4644 * verbatim code and include that.
4645 */
cram_init_tables(cram_fd * fd)4646 static void cram_init_tables(cram_fd *fd) {
4647 int i;
4648
4649 memset(fd->L1, 4, 256);
4650 fd->L1['A'] = 0; fd->L1['a'] = 0;
4651 fd->L1['C'] = 1; fd->L1['c'] = 1;
4652 fd->L1['G'] = 2; fd->L1['g'] = 2;
4653 fd->L1['T'] = 3; fd->L1['t'] = 3;
4654
4655 memset(fd->L2, 5, 256);
4656 fd->L2['A'] = 0; fd->L2['a'] = 0;
4657 fd->L2['C'] = 1; fd->L2['c'] = 1;
4658 fd->L2['G'] = 2; fd->L2['g'] = 2;
4659 fd->L2['T'] = 3; fd->L2['t'] = 3;
4660 fd->L2['N'] = 4; fd->L2['n'] = 4;
4661
4662 if (IS_CRAM_1_VERS(fd)) {
4663 for (i = 0; i < 0x200; i++) {
4664 int f = 0;
4665
4666 if (i & CRAM_FPAIRED) f |= BAM_FPAIRED;
4667 if (i & CRAM_FPROPER_PAIR) f |= BAM_FPROPER_PAIR;
4668 if (i & CRAM_FUNMAP) f |= BAM_FUNMAP;
4669 if (i & CRAM_FREVERSE) f |= BAM_FREVERSE;
4670 if (i & CRAM_FREAD1) f |= BAM_FREAD1;
4671 if (i & CRAM_FREAD2) f |= BAM_FREAD2;
4672 if (i & CRAM_FSECONDARY) f |= BAM_FSECONDARY;
4673 if (i & CRAM_FQCFAIL) f |= BAM_FQCFAIL;
4674 if (i & CRAM_FDUP) f |= BAM_FDUP;
4675
4676 fd->bam_flag_swap[i] = f;
4677 }
4678
4679 for (i = 0; i < 0x1000; i++) {
4680 int g = 0;
4681
4682 if (i & BAM_FPAIRED) g |= CRAM_FPAIRED;
4683 if (i & BAM_FPROPER_PAIR) g |= CRAM_FPROPER_PAIR;
4684 if (i & BAM_FUNMAP) g |= CRAM_FUNMAP;
4685 if (i & BAM_FREVERSE) g |= CRAM_FREVERSE;
4686 if (i & BAM_FREAD1) g |= CRAM_FREAD1;
4687 if (i & BAM_FREAD2) g |= CRAM_FREAD2;
4688 if (i & BAM_FSECONDARY) g |= CRAM_FSECONDARY;
4689 if (i & BAM_FQCFAIL) g |= CRAM_FQCFAIL;
4690 if (i & BAM_FDUP) g |= CRAM_FDUP;
4691
4692 fd->cram_flag_swap[i] = g;
4693 }
4694 } else {
4695 /* NOP */
4696 for (i = 0; i < 0x1000; i++)
4697 fd->bam_flag_swap[i] = i;
4698 for (i = 0; i < 0x1000; i++)
4699 fd->cram_flag_swap[i] = i;
4700 }
4701
4702 memset(fd->cram_sub_matrix, 4, 32*32);
4703 for (i = 0; i < 32; i++) {
4704 fd->cram_sub_matrix[i]['A'&0x1f]=0;
4705 fd->cram_sub_matrix[i]['C'&0x1f]=1;
4706 fd->cram_sub_matrix[i]['G'&0x1f]=2;
4707 fd->cram_sub_matrix[i]['T'&0x1f]=3;
4708 fd->cram_sub_matrix[i]['N'&0x1f]=4;
4709 }
4710 for (i = 0; i < 20; i+=4) {
4711 int j;
4712 for (j = 0; j < 20; j++) {
4713 fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
4714 fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
4715 fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
4716 fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
4717 }
4718 fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+0]&0x1f]=0;
4719 fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+1]&0x1f]=1;
4720 fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+2]&0x1f]=2;
4721 fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+3]&0x1f]=3;
4722 }
4723 }
4724
4725 // Default version numbers for CRAM
4726 static int major_version = 3;
4727 static int minor_version = 0;
4728
cram_default_version(void)4729 int cram_default_version(void) {
4730 return major_version*100 + minor_version;
4731 }
4732
cram_io_close(cram_fd * fd,int * fclose_result)4733 cram_fd * cram_io_close(cram_fd * fd, int * fclose_result)
4734 {
4735 if ( fd ) {
4736 if ( fd->fp_in ) {
4737 fclose(fd->fp_in);
4738 fd->fp_in = NULL;
4739 }
4740 if ( fd->fp_out ) {
4741 int const r = paranoid_fclose(fd->fp_out);
4742 if ( fclose_result )
4743 *fclose_result = r;
4744 fd->fp_out = NULL;
4745 }
4746
4747 #if defined(CRAM_IO_CUSTOM_BUFFERING)
4748 if ( fd->fp_in_callbacks ) {
4749 fd->fp_in_callbacks = fd->fp_in_callback_deallocate_function(fd->fp_in_callbacks);
4750 }
4751 if ( fd->fp_in_buffer ) {
4752 fd->fp_in_buffer = cram_io_deallocate_input_buffer(fd->fp_in_buffer);
4753 }
4754 if ( fd->fp_out_callbacks ) {
4755 fd->fp_out_callbacks = fd->fp_out_callback_deallocate_function(fd->fp_out_callbacks);
4756 }
4757 if ( fd->fp_out_buffer ) {
4758 fd->fp_out_buffer = cram_io_deallocate_output_buffer(fd->fp_out_buffer);
4759 }
4760 #endif
4761
4762 free(fd);
4763 fd = NULL;
4764 }
4765 return fd;
4766 }
4767
4768 #if defined(CRAM_IO_CUSTOM_BUFFERING)
cram_io_open_by_callbacks(char const * filename,cram_io_allocate_read_input_t callback_allocate_function,cram_io_deallocate_read_input_t callback_deallocate_function,size_t const bufsize,int const decompress)4769 cram_fd * cram_io_open_by_callbacks(
4770 char const * filename,
4771 cram_io_allocate_read_input_t callback_allocate_function,
4772 cram_io_deallocate_read_input_t callback_deallocate_function,
4773 size_t const bufsize,
4774 int const decompress
4775 )
4776 {
4777 cram_fd * fd = (cram_fd *)malloc(sizeof(cram_fd));
4778
4779 if ( ! fd )
4780 return cram_io_close(fd,0);
4781
4782 memset(fd,0,sizeof(cram_fd));
4783
4784 fd->fp_in_callback_allocate_function = callback_allocate_function;
4785 fd->fp_in_callback_deallocate_function = callback_deallocate_function;
4786
4787 fd->fp_in_callbacks =
4788 fd->fp_in_callback_allocate_function(filename,decompress);
4789
4790 if ( ! fd->fp_in_callbacks )
4791 return cram_io_close(fd,0);
4792
4793 fd->fp_in_buffer = cram_io_allocate_input_buffer(bufsize);
4794 if ( ! fd->fp_in_buffer ) {
4795 return cram_io_close(fd,0);
4796 }
4797
4798 return fd;
4799 }
4800
4801 // FIXME: make a shared interface with cram_io_open_by_callbacks above
cram_io_openw_by_callbacks(char const * filename,cram_io_allocate_write_output_t callback_allocate_function,cram_io_deallocate_write_output_t callback_deallocate_function,size_t const bufsize)4802 cram_fd * cram_io_openw_by_callbacks(
4803 char const * filename,
4804 cram_io_allocate_write_output_t callback_allocate_function,
4805 cram_io_deallocate_write_output_t callback_deallocate_function,
4806 size_t const bufsize
4807 )
4808 {
4809 cram_fd * fd = (cram_fd *)malloc(sizeof(cram_fd));
4810
4811 if ( ! fd )
4812 return cram_io_close(fd,0);
4813
4814 memset(fd,0,sizeof(cram_fd));
4815
4816 fd->fp_out_callback_allocate_function = callback_allocate_function;
4817 fd->fp_out_callback_deallocate_function = callback_deallocate_function;
4818
4819 fd->fp_out_callbacks =
4820 fd->fp_out_callback_allocate_function(filename);
4821
4822 if ( ! fd->fp_out_callbacks )
4823 return cram_io_close(fd,0);
4824
4825 fd->fp_out_buffer = cram_io_allocate_output_buffer(bufsize);
4826 if ( ! fd->fp_out_buffer ) {
4827 return cram_io_close(fd,0);
4828 }
4829
4830 return fd;
4831 }
4832 #endif // CRAM_IO_CUSTOM_BUFFERING
4833
cram_io_open(char const * filename,char const * mode,char const * fmode)4834 cram_fd * cram_io_open(
4835 char const * filename,
4836 char const * mode,
4837 char const * fmode
4838 )
4839 {
4840 cram_fd * fd = (cram_fd *)malloc(sizeof(cram_fd));
4841 if ( ! fd )
4842 return cram_io_close(fd,0);
4843
4844 memset(fd,0,sizeof(cram_fd));
4845
4846 #if defined(CRAM_IO_CUSTOM_BUFFERING)
4847 fd->fp_in_callback_allocate_function = NULL;
4848 fd->fp_in_callback_deallocate_function = cram_IO_deallocate_cram_io_input;
4849 fd->fp_out_callback_allocate_function = NULL;
4850 fd->fp_out_callback_deallocate_function = cram_IO_deallocate_cram_io_output;
4851 #endif
4852
4853 if ( *mode == 'r' ) {
4854 size_t bufsize = 0;
4855 #if defined(CRAM_IO_CUSTOM_BUFFERING) && defined(HAVE_STDIO_EXT_H)
4856 int isreg = 0;
4857 #endif
4858
4859 if ( strcmp(filename,"-") == 0 ) {
4860 fd->fp_in = stdin;
4861 }
4862 else {
4863 fd->fp_in = fopen(filename, fmode);
4864 }
4865
4866 if ( ! fd->fp_in )
4867 return cram_io_close(fd,0);
4868
4869 #if defined(CRAM_IO_CUSTOM_BUFFERING)
4870
4871 #if defined(HAVE_STDIO_EXT_H)
4872
4873 #if defined(HAVE_FILENO) && defined(HAVE_FSTAT)
4874 do {
4875 int const filedesc = fileno(fd->fp_in);
4876 struct stat sb;
4877 int const fdret = fstat(filedesc,&sb);
4878 isreg = (fdret == 0) && S_ISREG(sb.st_mode);
4879 } while ( 0 );
4880 #endif
4881
4882 /* get input buffer size */
4883 if ( isreg ) {
4884 /* read one character to force buffer to be set up */
4885 int const c = fgetc(fd->fp_in);
4886 /* EOF? */
4887 if ( c != EOF ) {
4888 /* get buffer size */
4889 int cc;
4890 bufsize = __fbufsize(fd->fp_in);
4891 /* put character back (C standard says this is guaranteed to
4892 * work for a single character)
4893 */
4894 cc = ungetc(c,fd->fp_in);
4895 /* check result anyway */
4896 if ( cc == EOF ) {
4897 return cram_io_close(fd,0);
4898 }
4899 }
4900 }
4901 #endif // HAVE_STDIO_EXT_H
4902
4903 fd->fp_in_callbacks = cram_IO_allocate_cram_io_input_from_C_FILE(fd->fp_in);
4904 if ( ! fd->fp_in_callbacks )
4905 return cram_io_close(fd,0);
4906
4907 if ( !bufsize )
4908 bufsize = 32*1024;
4909
4910 do {
4911 fd->fp_in_buffer = cram_io_allocate_input_buffer(bufsize);
4912 if ( ! fd->fp_in_buffer ) {
4913 return cram_io_close(fd,0);
4914 }
4915 else {
4916 setvbuf(fd->fp_in, NULL, _IONBF, 0);
4917 }
4918 } while ( 0 );
4919
4920 #endif // CRAM_IO_CUSTOM_BUFFERING
4921 } else {
4922 if (filename) {
4923 if ( strcmp(filename,"-") == 0 ) {
4924 fd->fp_out = stdout;
4925 } else {
4926 fd->fp_out = fopen(filename, fmode);
4927 }
4928
4929 if ( ! fd->fp_out )
4930 return cram_io_close(fd,0);
4931 } else {
4932 // E.g. opening a CRAM in-memory file.
4933 fd->fp_out = NULL;
4934 }
4935
4936 #if defined(CRAM_IO_CUSTOM_BUFFERING)
4937 fd->fp_out_callbacks
4938 = cram_IO_allocate_cram_io_output_from_C_FILE(fd->fp_out);
4939
4940 if ( ! fd->fp_out_callbacks )
4941 return cram_io_close(fd,0);
4942
4943 int bufsize = 32*1024; // FIXME: use same bufsize calc as above
4944 fd->fp_out_buffer = cram_io_allocate_output_buffer(bufsize);
4945 if ( ! fd->fp_out_buffer ) {
4946 return cram_io_close(fd,0);
4947 } else if (fd->fp_out) {
4948 setvbuf(fd->fp_out, NULL, _IONBF, 0);
4949 }
4950
4951 #endif // CRAM_IO_CUSTOM_BUFFERING
4952
4953 }
4954
4955 return fd;
4956 }
4957
4958 /*
4959 * Opens a CRAM file for read (mode "rb") or write ("wb").
4960 * The filename may be "-" to indicate stdin or stdout.
4961 *
4962 * Returns file handle on success
4963 * NULL on failure.
4964 */
cram_open(const char * filename,const char * mode)4965 cram_fd *cram_open(const char *filename, const char *mode) {
4966 int i;
4967 char *cp;
4968 char fmode[3]= { mode[0], '\0', '\0' };
4969 cram_fd *fd = NULL;
4970
4971 if (strlen(mode) > 1 && (mode[1] == 'b' || mode[1] == 'c')) {
4972 fmode[1] = 'b';
4973 }
4974
4975 fd = cram_io_open(filename,mode,fmode);
4976 if (!fd)
4977 return cram_io_close(fd,0);
4978
4979 fd->level = 5;
4980 if (strlen(mode) > 2 && mode[2] >= '0' && mode[2] <= '9')
4981 fd->level = mode[2] - '0';
4982
4983 fd->mode = *mode;
4984 fd->first_container = 0;
4985
4986 if (fd->mode == 'r') {
4987 /* Reader */
4988
4989 if (!(fd->file_def = cram_read_file_def(fd)))
4990 goto err;
4991
4992 fd->version = fd->file_def->major_version * 256 +
4993 fd->file_def->minor_version;
4994
4995 if (!(fd->header = cram_read_SAM_hdr(fd))) {
4996 cram_free_file_def(fd->file_def);
4997 goto err;
4998 }
4999
5000 } else {
5001 /* Writer */
5002 cram_file_def def;
5003
5004 if (major_version == 1) {
5005 fprintf(stderr, "Unable to write to version 1.0\n");
5006 goto err;
5007 }
5008
5009 def.magic[0] = 'C';
5010 def.magic[1] = 'R';
5011 def.magic[2] = 'A';
5012 def.magic[3] = 'M';
5013 def.major_version = major_version;
5014 def.minor_version = minor_version;
5015 memset(def.file_id, 0, 20);
5016 strncpy(def.file_id, filename, 20);
5017 if (0 != cram_write_file_def(fd, &def))
5018 goto err;
5019
5020 fd->version = def.major_version * 256 + def.minor_version;
5021
5022 /* SAM header written later */
5023 }
5024
5025 cram_init_tables(fd);
5026
5027 fd->prefix = strdup((cp = strrchr(filename, '/')) ? cp+1 : filename);
5028 if (!fd->prefix)
5029 goto err;
5030 fd->first_base = fd->last_base = -1;
5031 fd->record_counter = 0;
5032
5033 fd->ctr = NULL;
5034 fd->ctr_mt = NULL;
5035 fd->refs = refs_create();
5036 if (!fd->refs)
5037 goto err;
5038 fd->ref_id = -2;
5039 fd->ref = NULL;
5040
5041 fd->decode_md = 0;
5042 fd->verbose = 0;
5043 fd->seqs_per_slice = SEQS_PER_SLICE;
5044 fd->bases_per_slice = BASES_PER_SLICE;
5045 fd->slices_per_container = SLICE_PER_CNT;
5046 fd->embed_ref = 0;
5047 fd->no_ref = 0;
5048 fd->ignore_md5 = 0;
5049 fd->ignore_chksum = 1; // Some disagreement in the specification of these
5050 fd->lossy_read_names = 0;
5051 fd->use_bz2 = 0;
5052 fd->use_rans = IS_CRAM_3_VERS(fd);
5053 fd->use_bsc = 0;
5054 fd->use_lzma = 0;
5055 fd->multi_seq = 0;
5056 fd->unsorted = 0;
5057 fd->shared_ref = 0;
5058
5059 fd->index = NULL;
5060 fd->own_pool = 0;
5061 fd->pool = NULL;
5062 fd->rqueue = NULL;
5063 fd->job_pending = NULL;
5064 fd->ooc = 0;
5065 fd->binning = BINNING_NONE;
5066 fd->required_fields = INT_MAX;
5067
5068 for (i = 0; i < DS_END; i++)
5069 fd->m[i] = cram_new_metrics();
5070
5071 if (!(fd->tags_used = HashTableCreate(16, HASH_DYNAMIC_SIZE)))
5072 goto err;
5073
5074 fd->range.refid = -2; // no ref.
5075 fd->eof = 1;
5076 fd->ref_fn = NULL;
5077
5078 fd->bl = NULL;
5079
5080 /* Initialise dummy refs from the @SQ headers */
5081 if (-1 == refs_from_header(fd->refs, fd, fd->header))
5082 goto err;
5083
5084 return fd;
5085
5086 err:
5087 fd = cram_io_close(fd,0);
5088
5089 return NULL;
5090 }
5091
5092 #if defined(CRAM_IO_CUSTOM_BUFFERING)
5093 /*
5094 * Opens a CRAM file for input via callbacks
5095 *
5096 * Returns file handle on success
5097 * NULL on failure.
5098 */
cram_open_by_callbacks(char const * filename,cram_io_allocate_read_input_t callback_allocate_function,cram_io_deallocate_read_input_t callback_deallocate_function,size_t const bufsize)5099 cram_fd *cram_open_by_callbacks(
5100 char const * filename,
5101 cram_io_allocate_read_input_t callback_allocate_function,
5102 cram_io_deallocate_read_input_t callback_deallocate_function,
5103 size_t const bufsize
5104 ) {
5105 int i;
5106 char *cp;
5107 cram_fd *fd = NULL;
5108
5109 fd = cram_io_open_by_callbacks(filename,
5110 callback_allocate_function,
5111 callback_deallocate_function,
5112 bufsize,
5113 0);
5114 if (!fd)
5115 return cram_io_close(fd,0);
5116
5117 fd->level = 5;
5118
5119 fd->mode = 'r';
5120 fd->first_container = 0;
5121
5122 /* Reader */
5123 if (!(fd->file_def = cram_read_file_def(fd)))
5124 goto err;
5125
5126 fd->version = fd->file_def->major_version * 256 +
5127 fd->file_def->minor_version;
5128
5129 if (!(fd->header = cram_read_SAM_hdr(fd)))
5130 goto err;
5131
5132 cram_init_tables(fd);
5133
5134 fd->prefix = strdup((cp = strrchr(filename, '/')) ? cp+1 : filename);
5135 if (!fd->prefix)
5136 goto err;
5137 fd->first_base = fd->last_base = -1;
5138 fd->record_counter = 0;
5139
5140 fd->ctr = NULL;
5141 fd->ctr_mt = NULL;
5142 fd->refs = refs_create();
5143 if (!fd->refs)
5144 goto err;
5145 fd->ref_id = -2;
5146 fd->ref = NULL;
5147
5148 fd->decode_md = 0;
5149 fd->verbose = 0;
5150 fd->seqs_per_slice = SEQS_PER_SLICE;
5151 fd->bases_per_slice = BASES_PER_SLICE;
5152 fd->slices_per_container = SLICE_PER_CNT;
5153 fd->embed_ref = 0;
5154 fd->no_ref = 0;
5155 fd->ignore_md5 = 0;
5156 fd->ignore_chksum = 1; // Some disagreement in the specification of these
5157 fd->lossy_read_names = 0;
5158 fd->use_bz2 = 0;
5159 fd->use_rans = IS_CRAM_3_VERS(fd);
5160 fd->use_bsc = 0;
5161 fd->use_lzma = 0;
5162 fd->multi_seq = 0;
5163 fd->unsorted = 0;
5164 fd->shared_ref = 0;
5165
5166 fd->index = NULL;
5167 fd->own_pool = 0;
5168 fd->pool = NULL;
5169 fd->rqueue = NULL;
5170 fd->job_pending = NULL;
5171 fd->ooc = 0;
5172 fd->binning = BINNING_NONE;
5173 fd->required_fields = INT_MAX;
5174
5175 for (i = 0; i < DS_END; i++)
5176 fd->m[i] = cram_new_metrics();
5177
5178 if (!(fd->tags_used = HashTableCreate(16, HASH_DYNAMIC_SIZE)))
5179 goto err;
5180
5181 fd->range.refid = -2; // no ref.
5182 fd->eof = 1;
5183 fd->ref_fn = NULL;
5184
5185 fd->bl = NULL;
5186
5187 /* Initialise dummy refs from the @SQ headers */
5188 if (-1 == refs_from_header(fd->refs, fd, fd->header))
5189 goto err;
5190
5191 return fd;
5192
5193 err:
5194 fd = cram_io_close(fd,0);
5195
5196 return NULL;
5197 }
5198
5199 /*
5200 * FIXME: make shared interface with code above.
5201 *
5202 * Opens a CRAM file for write via callbacks
5203 *
5204 * Returns file handle on success
5205 * NULL on failure.
5206 */
cram_openw_by_callbacks(char const * filename,cram_io_allocate_write_output_t callback_allocate_function,cram_io_deallocate_write_output_t callback_deallocate_function,size_t const bufsize)5207 cram_fd *cram_openw_by_callbacks(
5208 char const * filename,
5209 cram_io_allocate_write_output_t callback_allocate_function,
5210 cram_io_deallocate_write_output_t callback_deallocate_function,
5211 size_t const bufsize
5212 ) {
5213 int i;
5214 char *cp;
5215 cram_fd *fd = NULL;
5216
5217 fd = cram_io_openw_by_callbacks(filename,
5218 callback_allocate_function,
5219 callback_deallocate_function,
5220 bufsize);
5221 if (!fd)
5222 return cram_io_close(fd,0);
5223
5224 fd->level = 5;
5225
5226 fd->mode = 'w';
5227 fd->first_container = 0;
5228
5229 {
5230 /* Writer */
5231 cram_file_def def;
5232
5233 if (major_version == 1) {
5234 fprintf(stderr, "Unable to write to version 1.0\n");
5235 goto err;
5236 }
5237
5238 def.magic[0] = 'C';
5239 def.magic[1] = 'R';
5240 def.magic[2] = 'A';
5241 def.magic[3] = 'M';
5242 def.major_version = major_version;
5243 def.minor_version = minor_version;
5244 memset(def.file_id, 0, 20);
5245 if (filename)
5246 strncpy(def.file_id, filename, 20);
5247 if (0 != cram_write_file_def(fd, &def))
5248 goto err;
5249
5250 fd->version = def.major_version * 256 + def.minor_version;
5251
5252 /* SAM header written later */
5253 }
5254
5255 cram_init_tables(fd);
5256
5257 if (filename) {
5258 fd->prefix = strdup((cp = strrchr(filename, '/')) ? cp+1 : filename);
5259 if (!fd->prefix)
5260 goto err;
5261 } else {
5262 fd->prefix = strdup("");
5263 }
5264 fd->first_base = fd->last_base = -1;
5265 fd->record_counter = 0;
5266
5267 fd->ctr = NULL;
5268 fd->refs = refs_create();
5269 if (!fd->refs)
5270 goto err;
5271 fd->ref_id = -2;
5272 fd->ref = NULL;
5273
5274 fd->decode_md = 0;
5275 fd->verbose = 0;
5276 fd->seqs_per_slice = SEQS_PER_SLICE;
5277 fd->bases_per_slice = BASES_PER_SLICE;
5278 fd->slices_per_container = SLICE_PER_CNT;
5279 fd->embed_ref = 0;
5280 fd->no_ref = 0;
5281 fd->ignore_md5 = 0;
5282 fd->use_bz2 = 0;
5283 fd->use_rans = IS_CRAM_3_VERS(fd);
5284 fd->use_bsc = 0;
5285 fd->use_lzma = 0;
5286 fd->multi_seq = 0;
5287 fd->unsorted = 0;
5288 fd->shared_ref = 0;
5289
5290 fd->index = NULL;
5291 fd->own_pool = 0;
5292 fd->pool = NULL;
5293 fd->rqueue = NULL;
5294 fd->job_pending = NULL;
5295 fd->ooc = 0;
5296 fd->binning = BINNING_NONE;
5297 fd->required_fields = INT_MAX;
5298
5299 for (i = 0; i < DS_END; i++)
5300 fd->m[i] = cram_new_metrics();
5301
5302 if (!(fd->tags_used = HashTableCreate(16, HASH_DYNAMIC_SIZE)))
5303 goto err;
5304
5305 fd->range.refid = -2; // no ref.
5306 fd->eof = 1;
5307 fd->ref_fn = NULL;
5308
5309 fd->bl = NULL;
5310
5311 /* Initialise dummy refs from the @SQ headers */
5312 if (-1 == refs_from_header(fd->refs, fd, fd->header))
5313 goto err;
5314
5315 return fd;
5316
5317 err:
5318 fd = cram_io_close(fd,0);
5319
5320 return NULL;
5321 }
5322 #endif
5323
5324 /*
5325 * Flushes a CRAM file.
5326 * Useful for when writing to stdout without wishing to close the stream.
5327 *
5328 * Returns 0 on success
5329 * -1 on failure
5330 */
cram_flush(cram_fd * fd)5331 int cram_flush(cram_fd *fd) {
5332 if (!fd)
5333 return -1;
5334
5335 if (fd->mode == 'w' && fd->ctr) {
5336 if(fd->ctr->slice)
5337 cram_update_curr_slice(fd->ctr);
5338
5339 if (-1 == cram_flush_container_mt(fd, fd->ctr))
5340 return -1;
5341 }
5342
5343 return 0;
5344 }
5345
5346 /*
5347 * Writes an EOF block to a CRAM file.
5348 *
5349 * Returns 0 on success
5350 * -1 on failure
5351 */
cram_write_eof_block(cram_fd * fd)5352 int cram_write_eof_block(cram_fd *fd) {
5353 // 7-bit encoding version:
5354 // CRAM_IO_WRITE(
5355 // "\x0f\x00\x00\x00\x8f\xff\xff\xff" // Cont HDR
5356 // "\x7f\x82\x95\x9e\x46\x00\x00\x00" // Cont HDR
5357 // "\x00\x01\x00" // Cont HDR
5358 // "\xac\xd6\x05\xbc" // CRC32
5359 // "\x00\x01\x00\x06\x06" // Comp.HDR blk
5360 // "\x01\x00\x01\x00\x01\x00" // Comp.HDR blk
5361 // "\xee\x63\x01\x4b", // CRC32
5362 // 38, 1, fd);
5363 // return cram_io_flush_output_buffer(fd);
5364
5365 // EOF block is a container with
5366 // ref_seq_id -1
5367 // start pos 0x454f46 ("EOF")
5368 // span 0
5369 // nrec 0
5370 // counter 0
5371 // nbases 0
5372 // 1 block (landmark 0)
5373 // (CRC32)
5374 // followed by an empty compression header block
5375 // method raw (0)
5376 // type comp header (1)
5377 // content id 0
5378 // block contents size 6
5379 // raw size 6
5380 // empty preservation map (01 00)
5381 // empty data series map (01 00)
5382 // empty tag map (01 00)
5383 // block CRC
5384 if (IS_CRAM_3_VERS(fd)) {
5385 if (1 != CRAM_IO_WRITE(
5386 "\x0f\x00\x00\x00\xff\xff\xff\xff" // Cont HDR
5387 "\x0f\xe0\x45\x4f\x46\x00\x00\x00" // Cont HDR
5388 "\x00\x01\x00" // Cont HDR
5389 "\x05\xbd\xd9\x4f" // CRC32
5390 //"\xa8\x2a\x1b\xb9" // CRC32C
5391 "\x00\x01\x00\x06\x06" // Comp.HDR blk
5392 "\x01\x00\x01\x00\x01\x00" // Comp.HDR blk
5393 "\xee\x63\x01\x4b", // CRC32
5394 //"\xe9\x70\xd3\x86", // CRC32C
5395 38, 1, fd)) {
5396 fd = cram_io_close(fd,0);
5397 return -1;
5398 }
5399 } else {
5400 if (1 != CRAM_IO_WRITE("\x0b\x00\x00\x00\xff\xff\xff\xff"
5401 "\x0f\xe0\x45\x4f\x46\x00\x00\x00"
5402 "\x00\x01\x00\x00\x01\x00\x06\x06"
5403 "\x01\x00\x01\x00\x01\x00", 30, 1, fd)) {
5404 fd = cram_io_close(fd,0);
5405 return -1;
5406 }
5407 }
5408
5409 #if defined(CRAM_IO_CUSTOM_BUFFERING)
5410 return cram_io_flush_output_buffer(fd);
5411 #else
5412 return 0;
5413 #endif
5414 }
5415
5416
5417 /*
5418 * Closes a CRAM file.
5419 * Returns 0 on success
5420 * -1 on failure
5421 */
cram_close(cram_fd * fd)5422 int cram_close(cram_fd *fd) {
5423 spare_bams *bl, *next;
5424 int i;
5425 int rclose = 0;
5426
5427 if (!fd) {
5428 fd = cram_io_close(fd,0);
5429 return -1;
5430 }
5431
5432 if (fd->mode == 'w' && fd->ctr) {
5433 if(fd->ctr->slice)
5434 cram_update_curr_slice(fd->ctr);
5435
5436 if (-1 == cram_flush_container_mt(fd, fd->ctr)) {
5437 fd = cram_io_close(fd,0);
5438 return -1;
5439 }
5440 }
5441
5442 if (fd->pool && fd->eof >= 0) {
5443 t_pool_flush(fd->pool);
5444
5445 if (0 != cram_flush_result(fd)) {
5446 fd = cram_io_close(fd,0);
5447 return -1;
5448 }
5449
5450 pthread_mutex_destroy(fd->metrics_lock);
5451 pthread_mutex_destroy(fd->ref_lock);
5452 pthread_mutex_destroy(fd->bam_list_lock);
5453 free(fd->metrics_lock);
5454 free(fd->ref_lock);
5455 free(fd->bam_list_lock);
5456
5457 if (fd->mode == 'w')
5458 fd->ctr = NULL; // prevent double freeing
5459
5460 //fprintf(stderr, "CRAM: destroy queue %p\n", fd->rqueue);
5461
5462 t_results_queue_destroy(fd->rqueue);
5463 }
5464
5465 if (fd->mode == 'w') {
5466 /* Write EOF block */
5467 if (0 != cram_write_eof_block(fd))
5468 return -1;
5469
5470 // if (1 != fwrite("\x00\x00\x00\x00\xff\xff\xff\xff"
5471 // "\xff\xe0\x45\x4f\x46\x00\x00\x00"
5472 // "\x00\x00\x00", 19, 1, fd->fp))
5473 // return -1;
5474 }
5475
5476 for (bl = fd->bl; bl; bl = next) {
5477 int i, max_rec = fd->seqs_per_slice * fd->slices_per_container;
5478
5479 next = bl->next;
5480 for (i = 0; i < max_rec; i++) {
5481 if (bl->bams[i])
5482 free(bl->bams[i]);
5483 }
5484 free(bl->bams);
5485 free(bl);
5486 }
5487
5488 if (fd->file_def)
5489 cram_free_file_def(fd->file_def);
5490
5491 if (fd->header)
5492 sam_hdr_free(fd->header);
5493
5494 free(fd->prefix);
5495
5496 if (fd->ctr)
5497 cram_free_container(fd->ctr);
5498
5499 if (fd->ctr_mt && fd->ctr_mt != fd->ctr)
5500 cram_free_container(fd->ctr_mt);
5501
5502 if (fd->refs)
5503 refs_free(fd->refs);
5504 if (fd->ref_free)
5505 free(fd->ref_free);
5506
5507 for (i = 0; i < DS_END; i++)
5508 if (fd->m[i])
5509 free(fd->m[i]);
5510
5511 if (fd->tags_used)
5512 HashTableDestroy(fd->tags_used, 1);
5513
5514 if (fd->index)
5515 cram_index_free(fd);
5516
5517 if (fd->own_pool && fd->pool)
5518 t_pool_destroy(fd->pool, 0);
5519
5520 /* rclose == return value for flush and close in case of CRAM output */
5521 fd = cram_io_close(fd, &rclose);
5522
5523 return rclose;
5524 }
5525
5526
5527 /*
5528 * Returns 1 if we hit an EOF while reading.
5529 */
cram_eof(cram_fd * fd)5530 int cram_eof(cram_fd *fd) {
5531 return fd->eof;
5532 }
5533
5534
5535 /*
5536 * Sets options on the cram_fd. See CRAM_OPT_* definitions in cram_structs.h.
5537 * Use this immediately after opening.
5538 *
5539 * Returns 0 on success
5540 * -1 on failure
5541 */
cram_set_option(cram_fd * fd,enum cram_option opt,...)5542 int cram_set_option(cram_fd *fd, enum cram_option opt, ...) {
5543 int r;
5544 va_list args;
5545
5546 va_start(args, opt);
5547 r = cram_set_voption(fd, opt, args);
5548 va_end(args);
5549
5550 return r;
5551 }
5552
5553 /*
5554 * Sets options on the cram_fd. See CRAM_OPT_* definitions in cram_structs.h.
5555 * Use this immediately after opening.
5556 *
5557 * Returns 0 on success
5558 * -1 on failure
5559 */
cram_set_voption(cram_fd * fd,enum cram_option opt,va_list args)5560 int cram_set_voption(cram_fd *fd, enum cram_option opt, va_list args) {
5561 refs_t *refs;
5562
5563 switch (opt) {
5564 case CRAM_OPT_DECODE_MD:
5565 fd->decode_md = va_arg(args, int);
5566 break;
5567
5568 case CRAM_OPT_PREFIX:
5569 if (fd->prefix)
5570 free(fd->prefix);
5571 if (!(fd->prefix = strdup(va_arg(args, char *))))
5572 return -1;
5573 break;
5574
5575 case CRAM_OPT_VERBOSITY:
5576 fd->verbose = va_arg(args, int);
5577 break;
5578
5579 case CRAM_OPT_SEQS_PER_SLICE:
5580 fd->seqs_per_slice = va_arg(args, int);
5581 break;
5582
5583 case CRAM_OPT_BASES_PER_SLICE:
5584 fd->bases_per_slice = va_arg(args, int);
5585 break;
5586
5587 case CRAM_OPT_SLICES_PER_CONTAINER:
5588 fd->slices_per_container = va_arg(args, int);
5589 break;
5590
5591 case CRAM_OPT_EMBED_REF:
5592 fd->embed_ref = va_arg(args, int);
5593 break;
5594
5595 case CRAM_OPT_NO_REF:
5596 fd->no_ref = va_arg(args, int);
5597 break;
5598
5599 case CRAM_OPT_IGNORE_MD5:
5600 fd->ignore_md5 = va_arg(args, int);
5601 break;
5602
5603 case CRAM_OPT_IGNORE_CHKSUM:
5604 fd->ignore_chksum = va_arg(args, int);
5605 break;
5606
5607 case CRAM_OPT_LOSSY_READ_NAMES:
5608 fd->lossy_read_names = va_arg(args, int);
5609 break;
5610
5611 case CRAM_OPT_USE_BZIP2:
5612 fd->use_bz2 = va_arg(args, int);
5613 break;
5614
5615 case CRAM_OPT_USE_ARITH:
5616 case CRAM_OPT_USE_RANS:
5617 fd->use_rans = va_arg(args, int);
5618 break;
5619
5620 case CRAM_OPT_USE_BSC:
5621 fd->use_bsc = va_arg(args, int);
5622 break;
5623
5624 case CRAM_OPT_USE_FQZ:
5625 fd->use_fqz = va_arg(args, int);
5626 break;
5627
5628 case CRAM_OPT_USE_LZMA:
5629 fd->use_lzma = va_arg(args, int);
5630 break;
5631
5632 case CRAM_OPT_SHARED_REF:
5633 fd->shared_ref = 1;
5634 refs = va_arg(args, refs_t *);
5635 if (refs != fd->refs) {
5636 if (fd->refs)
5637 refs_free(fd->refs);
5638 fd->refs = refs;
5639 fd->refs->count++;
5640 }
5641 break;
5642
5643 case CRAM_OPT_RANGE: {
5644 int r = cram_seek_to_refpos(fd, va_arg(args, cram_range *));
5645 if (fd->range.refid != -2)
5646 fd->required_fields |= SAM_POS;
5647 return r;
5648 }
5649
5650 case CRAM_OPT_REFERENCE:
5651 return cram_load_reference(fd, va_arg(args, char *));
5652
5653 case CRAM_OPT_VERSION: {
5654 int major, minor;
5655 char *s = va_arg(args, char *);
5656 if (2 != sscanf(s, "%d.%d", &major, &minor)) {
5657 fprintf(stderr, "Malformed version string %s\n", s);
5658 return -1;
5659 }
5660 if (!((major == 1 && minor == 0) ||
5661 (major == 2 && (minor == 0 || minor == 1)) ||
5662 (major == 3 && (minor == 0 || minor == 1)) ||
5663 (major == 4 && minor == 0))) {
5664 fprintf(stderr, "Unknown version string; "
5665 "use 1.0 (deprecated), 2.0, 2.1 or 3.0 "
5666 "(experimentally 3.1 or 4.0)\n");
5667 errno = EINVAL;
5668 return -1;
5669 }
5670 if (major == 1 && minor == 0 && fd && fd->mode != 'r') {
5671 fprintf(stderr, "Unable to write to version 1.0\n");
5672 return -1;
5673 }
5674 major_version = major;
5675 minor_version = minor;
5676 break;
5677 }
5678
5679 case CRAM_OPT_MULTI_SEQ_PER_SLICE:
5680 fd->multi_seq = va_arg(args, int);
5681 break;
5682
5683 case CRAM_OPT_NTHREADS: {
5684 int nthreads = va_arg(args, int);
5685 if (nthreads > 1) {
5686 if (!(fd->pool = t_pool_init(nthreads*2, nthreads)))
5687 return -1;
5688
5689 fd->rqueue = t_results_queue_init();
5690 fd->metrics_lock = malloc(sizeof(pthread_mutex_t));
5691 fd->ref_lock = malloc(sizeof(pthread_mutex_t));
5692 fd->bam_list_lock = malloc(sizeof(pthread_mutex_t));
5693 pthread_mutex_init(fd->metrics_lock, NULL);
5694 pthread_mutex_init(fd->ref_lock, NULL);
5695 pthread_mutex_init(fd->bam_list_lock, NULL);
5696 fd->shared_ref = 1;
5697 fd->own_pool = 1;
5698 }
5699 break;
5700 }
5701
5702 case CRAM_OPT_THREAD_POOL:
5703 fd->pool = va_arg(args, t_pool *);
5704 if (fd->pool) {
5705 fd->rqueue = t_results_queue_init();
5706 fd->metrics_lock = malloc(sizeof(pthread_mutex_t));
5707 fd->ref_lock = malloc(sizeof(pthread_mutex_t));
5708 fd->bam_list_lock = malloc(sizeof(pthread_mutex_t));
5709 pthread_mutex_init(fd->metrics_lock, NULL);
5710 pthread_mutex_init(fd->ref_lock, NULL);
5711 pthread_mutex_init(fd->bam_list_lock, NULL);
5712 }
5713 fd->shared_ref = 1; // Needed to avoid clobbering ref between threads
5714 fd->own_pool = 0;
5715
5716 //fd->qsize = 1;
5717 //fd->decoded = calloc(fd->qsize, sizeof(cram_container *));
5718 //t_pool_dispatch(fd->pool, cram_decoder_thread, fd);
5719 break;
5720
5721 case CRAM_OPT_BINNING:
5722 fd->binning = va_arg(args, int);
5723 break;
5724
5725 case CRAM_OPT_REQUIRED_FIELDS:
5726 fd->required_fields = va_arg(args, int);
5727 if (fd->range.refid != -2)
5728 fd->required_fields |= SAM_POS;
5729 break;
5730
5731 case CRAM_OPT_PRESERVE_AUX_ORDER:
5732 fd->preserve_aux_order = va_arg(args, int);
5733 break;
5734
5735 case CRAM_OPT_PRESERVE_AUX_SIZE:
5736 fd->preserve_aux_size = va_arg(args, int);
5737 break;
5738
5739 default:
5740 fprintf(stderr, "Unknown CRAM option code %d\n", opt);
5741 return -1;
5742 }
5743
5744 return 0;
5745 }
5746
5747