1 /*
2  * Copyright (c) 2013 Genome Research Ltd.
3  * Author(s): James Bonfield
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *
8  *    1. Redistributions of source code must retain the above copyright notice,
9  *       this list of conditions and the following disclaimer.
10  *
11  *    2. Redistributions in binary form must reproduce the above
12  *       copyright notice, this list of conditions and the following
13  *       disclaimer in the documentation and/or other materials provided
14  *       with the distribution.
15  *
16  *    3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
17  *    Institute nor the names of its contributors may be used to endorse
18  *    or promote products derived from this software without specific
19  *    prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS
22  * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
24  * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH
25  * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 /*
35  * Author: James Bonfield, Wellcome Trust Sanger Institute. 2013
36  *
37  * CRAM I/O primitives.
38  *
39  * - ITF8 encoding and decoding.
40  * - Block based I/O
41  * - Zlib inflating and deflating (memory)
42  * - CRAM basic data structure reading and writing
43  * - File opening / closing
44  * - Reference sequence handling
45  */
46 
47 /*
48  * TODO: BLOCK_GROW, BLOCK_RESIZE, BLOCK_APPEND and itf8_put_blk all need
49  * a way to return errors for when malloc fails.
50  */
51 
52 #ifdef HAVE_CONFIG_H
53 #include "io_lib_config.h"
54 #endif
55 
56 #include <stdio.h>
57 #include <errno.h>
58 #include <assert.h>
59 #include <stdlib.h>
60 #include <string.h>
61 #include <unistd.h>
62 #include <zlib.h>
63 #ifdef HAVE_LIBBZ2
64 #include <bzlib.h>
65 #endif
66 #ifdef HAVE_LIBLZMA
67 #include <lzma.h>
68 #endif
69 #ifdef HAVE_LIBBSC
70 #include <libbsc/libbsc.h>
71 #endif
72 #include <sys/types.h>
73 #include <sys/stat.h>
74 #include <math.h>
75 #include <ctype.h>
76 
77 #ifdef _MSC_VER
78 #include <direct.h>
79 #include <io.h>
80 
81 #define chmod _chmod
82 #define getcwd _getcwd
83 
84 //_mkdir does not take a mode argument on windows
85 //but all calls below are followed up by calls to chmod
86 //so this should not matter.
87 #define mkdir(path,mode) _mkdir(path)
88 #endif
89 
90 #include "io_lib/cram.h"
91 #include "io_lib/os.h"
92 #include "io_lib/md5.h"
93 #include "io_lib/crc32.h"
94 #include "io_lib/open_trace_file.h"
95 #include "io_lib/rANS_static.h"
96 #include "io_lib/rANS_static4x16.h"
97 #include "io_lib/tokenise_name3.h"
98 
99 // Enable if we want V3.1 support.  TODO: add a configure param for this
100 #define HAVE_FQZ
101 
102 #ifdef HAVE_FQZ
103 #include "fqzcomp_qual.h"
104 #endif
105 
106 #if defined(HAVE_STDIO_EXT_H)
107 #include <stdio_ext.h>
108 #endif
109 
110 //#define REF_DEBUG
111 
112 #ifdef REF_DEBUG
113 #define RP(...) fprintf (stderr, __VA_ARGS__)
114 #include <sys/syscall.h>
115 #define gettid() (int)syscall(SYS_gettid)
116 #else
117 #define RP(...)
118 #define gettid() 0
119 #endif
120 
121 #define TRIAL_SPAN 50
122 #define NTRIALS 3
123 
124 /* ----------------------------------------------------------------------
125  * custom buffering helper routines
126  */
127 #if defined(CRAM_IO_CUSTOM_BUFFERING)
cram_io_C_FILE_fread(void * ptr,size_t size,size_t nmemb,void * stream)128 static size_t cram_io_C_FILE_fread(void *ptr, size_t size, size_t nmemb, void *stream)
129 {
130     return fread(ptr,size,nmemb,(FILE *)stream);
131 }
132 
cram_io_C_FILE_fwrite(void * ptr,size_t size,size_t nmemb,void * stream)133 static size_t cram_io_C_FILE_fwrite(void *ptr, size_t size, size_t nmemb, void *stream)
134 {
135     return fwrite(ptr,size,nmemb,(FILE *)stream);
136 }
137 
cram_io_C_FILE_fseek(void * fd,off_t offset,int whence)138 static int cram_io_C_FILE_fseek(void * fd, off_t offset, int whence)
139 {
140     return fseeko((FILE *)fd,offset,whence);
141 }
142 
cram_io_C_FILE_ftell(void * fd)143 static off_t cram_io_C_FILE_ftell(void * fd)
144 {
145     return ftello((FILE *)fd);
146 }
147 
148 /* ----------------------------------------------------------------------
149  * Input buffering
150  */
151 
152 /* fill empty buffer */
cram_io_fill_input_buffer(cram_fd * fd)153 static void cram_io_fill_input_buffer(cram_fd * fd)
154 {
155     /* buffer need to be empty */
156     assert ( fd->fp_in_buffer->fp_in_buf_pc == fd->fp_in_buffer->fp_in_buf_pe );
157 
158     /* read up to buffer size bytes */
159     do {
160         /* C-IO fread */
161         size_t const r =
162 	    (fd->fp_in_callbacks->fread_callback)(fd->fp_in_buffer->fp_in_buf_pa,
163 						  1,
164 						  fd->fp_in_buffer->fp_in_buf_size,
165 						  fd->fp_in_callbacks->user_data);
166         /* move offset */
167         fd->fp_in_buffer->fp_in_buf_start += (fd->fp_in_buffer->fp_in_buf_pe -
168 					      fd->fp_in_buffer->fp_in_buf_pa);
169         /* set end of window */
170         fd->fp_in_buffer->fp_in_buf_pe = fd->fp_in_buffer->fp_in_buf_pa + r;
171         /* set current input */
172         fd->fp_in_buffer->fp_in_buf_pc = fd->fp_in_buffer->fp_in_buf_pa;
173     }
174     while ( 0 ) ;
175 }
176 
177 /* fill buffer and return next byte or EOF */
cram_io_input_buffer_underflow(cram_fd * fd)178 int cram_io_input_buffer_underflow(cram_fd * fd)
179 {
180     cram_io_fill_input_buffer(fd);
181 
182     if ( fd->fp_in_buffer->fp_in_buf_pc == fd->fp_in_buffer->fp_in_buf_pe )
183         return EOF;
184     else
185         return (int)((unsigned char )(*(fd->fp_in_buffer->fp_in_buf_pc++)));
186 }
187 
188 /* integer minimum */
imin(size_t const a,size_t const b)189 static inline size_t imin(size_t const a, size_t const b)
190 {
191     return (a<b)?a:b;
192 }
193 
194 /* fread simulation */
cram_io_input_buffer_read(void * vptr,size_t size,size_t nmemb,cram_fd * fd)195 size_t cram_io_input_buffer_read(void *vptr, size_t size, size_t nmemb, cram_fd * fd)
196 {
197     size_t toread = size * nmemb; /* number of bytes still to be read */
198     size_t r = 0; /* number of bytes copied to ptr */
199     size_t inbuf = fd->fp_in_buffer->fp_in_buf_pe -
200 	           fd->fp_in_buffer->fp_in_buf_pc; /* number of bytes in buffer */
201     size_t tocopy = imin(toread,inbuf); /* number of bytes to be copied from buffer */
202     size_t blockread = 0;
203     char *ptr = (char *)vptr;
204 
205     /* copy bytes still in buffer and update values */
206     memcpy(ptr,fd->fp_in_buffer->fp_in_buf_pc,tocopy);
207     toread -= tocopy;
208     r += tocopy;
209     ptr += tocopy;
210     fd->fp_in_buffer->fp_in_buf_pc += tocopy;
211 
212     /* read whole blocks without copying to buffer first, C-IO fread */
213     while ( (toread >= fd->fp_in_buffer->fp_in_buf_size) &&
214 	    ((blockread = ((fd->fp_in_callbacks->
215 			    fread_callback))(ptr,
216 					     1,
217 					     fd->fp_in_buffer->fp_in_buf_size,
218 					     fd->fp_in_callbacks->user_data))!=0) ) {
219         toread -= blockread;
220         r += blockread;
221         ptr += blockread;
222         fd->fp_in_buffer->fp_in_buf_start += blockread;
223     }
224 
225     /* read rest of bytes using buffer */
226     while ( toread ) {
227         /* buffer should be empty */
228         assert ( fd->fp_in_buffer->fp_in_buf_pc == fd->fp_in_buffer->fp_in_buf_pe );
229         /* fill buffer */
230         cram_io_fill_input_buffer(fd);
231         /* number of bytes in buffer after filling */
232         inbuf = fd->fp_in_buffer->fp_in_buf_pe-fd->fp_in_buffer->fp_in_buf_pc;
233         /* number of bytes to copy */
234         tocopy = imin(toread,inbuf);
235 
236         /* break if there is no more data */
237         if ( ! inbuf )
238             break;
239 
240         memcpy(ptr,fd->fp_in_buffer->fp_in_buf_pc,tocopy);
241         toread -= tocopy;
242         r += tocopy;
243         ptr += tocopy;
244         fd->fp_in_buffer->fp_in_buf_pc += tocopy;
245     }
246 
247     return size ? (r / size) : r;
248 }
249 
cram_io_input_buffer_seek(cram_fd * fd,off_t offset,int whence)250 int cram_io_input_buffer_seek(cram_fd * fd, off_t offset, int whence)
251 {
252     int r = -1;
253 
254     if ( whence == SEEK_CUR )
255     {
256         /* current absolute input position in buffer */
257         uint64_t const curpos = fd->fp_in_buffer->fp_in_buf_start +
258 	                       (fd->fp_in_buffer->fp_in_buf_pc -
259 				fd->fp_in_buffer->fp_in_buf_pa);
260         /* absolute buffer low */
261         uint64_t const bufferlow = fd->fp_in_buffer->fp_in_buf_start;
262         /* absolute buffer high */
263         uint64_t const bufferhigh = fd->fp_in_buffer->fp_in_buf_start +
264 	                           (fd->fp_in_buffer->fp_in_buf_pe -
265 				    fd->fp_in_buffer->fp_in_buf_pa);
266         /* absolute seek target position */
267         int64_t const abstarget = ((int64_t)curpos) + offset;
268 
269         /* if target is inside buffer, then just adjust the current pointer */
270         if ( abstarget >= bufferlow && abstarget <= bufferhigh ) {
271             /* update current pointer */
272             fd->fp_in_buffer->fp_in_buf_pc += offset;
273             assert ( fd->fp_in_buffer->fp_in_buf_pc >= fd->fp_in_buffer->fp_in_buf_pa );
274             assert ( fd->fp_in_buffer->fp_in_buf_pc <= fd->fp_in_buffer->fp_in_buf_pe );
275             /* seek successful */
276             return 0;
277         }
278         else {
279             /* current position of underlying input stream */
280             int64_t const filepos = fd->fp_in_buffer->fp_in_buf_start +
281 		                   (fd->fp_in_buffer->fp_in_buf_pe -
282 				    fd->fp_in_buffer->fp_in_buf_pa);
283             int64_t const seekoffset = abstarget - filepos;
284 
285             /* perform seek */
286             r = fd->fp_in_callbacks->fseek_callback(fd->fp_in_callbacks->user_data,
287 						    seekoffset, SEEK_CUR);
288 
289             /* seek successful */
290             if ( ! r ) {
291                 /* mark buffer as empty */
292                 fd->fp_in_buffer->fp_in_buf_pc = fd->fp_in_buffer->fp_in_buf_pa;
293                 fd->fp_in_buffer->fp_in_buf_pe = fd->fp_in_buffer->fp_in_buf_pa;
294                 /* set new buffer start offset */
295                 fd->fp_in_buffer->fp_in_buf_start = abstarget;
296                 return 0;
297             } else {
298                 return -1;
299             }
300         }
301     }
302 
303     /* mark buffer as empty */
304     fd->fp_in_buffer->fp_in_buf_pc = fd->fp_in_buffer->fp_in_buf_pa;
305     fd->fp_in_buffer->fp_in_buf_pe = fd->fp_in_buffer->fp_in_buf_pa;
306 
307     /* perform seek, C-IO fseek */
308     r = fd->fp_in_callbacks->fseek_callback(fd->fp_in_callbacks->user_data,
309 					    offset, whence);
310 
311     /* get new offset if seek was successful */
312     if ( !r )
313         /* C-IO ftell */
314         fd->fp_in_buffer->fp_in_buf_start =
315 	    fd->fp_in_callbacks->ftell_callback(fd->fp_in_callbacks->user_data);
316 
317     return r;
318 }
319 
320 static cram_io_input_t *
cram_IO_deallocate_cram_io_input(cram_io_input_t * obj)321 cram_IO_deallocate_cram_io_input(cram_io_input_t * obj)
322 {
323     if ( obj ) {
324         free(obj);
325         obj = NULL;
326     }
327 
328     return obj;
329 }
330 
331 static cram_io_input_t *
cram_IO_allocate_cram_io_input()332 cram_IO_allocate_cram_io_input()
333 {
334     cram_io_input_t * obj = (cram_io_input_t *)malloc(sizeof(cram_io_input_t));
335     if ( ! obj ) {
336         return cram_IO_deallocate_cram_io_input(obj);
337     }
338     obj->user_data = NULL;
339     obj->fread_callback = NULL;
340     obj->fseek_callback = NULL;
341     obj->ftell_callback = NULL;
342     return obj;
343 }
344 
345 static cram_io_input_t *
cram_IO_allocate_cram_io_input_from_C_FILE(FILE * file)346 cram_IO_allocate_cram_io_input_from_C_FILE(FILE * file)
347 {
348     cram_io_input_t * obj = cram_IO_allocate_cram_io_input();
349     if ( ! obj ) {
350         return cram_IO_deallocate_cram_io_input(obj);
351     }
352     obj->user_data = file;
353     obj->fread_callback = cram_io_C_FILE_fread;
354     obj->fseek_callback = cram_io_C_FILE_fseek;
355     obj->ftell_callback = cram_io_C_FILE_ftell;
356     return obj;
357 }
358 
359 static cram_fd_input_buffer *
cram_io_deallocate_input_buffer(cram_fd_input_buffer * buffer)360 cram_io_deallocate_input_buffer(cram_fd_input_buffer * buffer)
361 {
362     if ( buffer ) {
363         if ( buffer->fp_in_buffer ) {
364             free(buffer->fp_in_buffer);
365             buffer->fp_in_buffer = NULL;
366         }
367         free(buffer);
368         buffer = NULL;
369     }
370     return buffer;
371 }
372 
373 static cram_fd_input_buffer *
cram_io_allocate_input_buffer(size_t const bufsize)374 cram_io_allocate_input_buffer(size_t const bufsize)
375 {
376     cram_fd_input_buffer * buffer =
377 	(cram_fd_input_buffer *)malloc(sizeof(cram_fd_input_buffer));
378 
379     if ( ! buffer )
380         return cram_io_deallocate_input_buffer(buffer);
381 
382     memset(buffer,0,sizeof(cram_fd_input_buffer));
383 
384     buffer->fp_in_buf_size = bufsize;
385     buffer->fp_in_buffer   = (char *)malloc(buffer->fp_in_buf_size);
386 
387     if ( ! buffer->fp_in_buffer ) {
388         return cram_io_deallocate_input_buffer(buffer);
389     }
390 
391     buffer->fp_in_buf_pa   = buffer->fp_in_buffer;
392     buffer->fp_in_buf_pc   = buffer->fp_in_buffer;
393     buffer->fp_in_buf_pe   = buffer->fp_in_buffer;
394 
395     return buffer;
396 }
397 
cram_io_input_buffer_fgets(char * s,int size,cram_fd * fd)398 char * cram_io_input_buffer_fgets(char * s, int size, cram_fd * fd)
399 {
400      int linelen = 0;
401 
402      while ( linelen < size-1 ) {
403          int const c = CRAM_IO_GETC(fd);
404 
405          if ( c == EOF ) {
406              break;
407          }
408          else {
409              s[linelen++] = c;
410          }
411 
412          if ( c == '\n' )
413              break;
414      }
415 
416      if ( ! linelen )
417          return NULL;
418 
419      s[linelen++] = 0;
420 
421      return s;
422 }
423 
424 /* ----------------------------------------------------------------------
425  * Output buffering
426  */
427 
428 /*
429  * Flush buffer.
430  *
431  * Returns 0 on success,
432  *        -1 on failure.
433  */
cram_io_flush_output_buffer(cram_fd * fd)434 int cram_io_flush_output_buffer(cram_fd *fd)
435 {
436     size_t r;
437     char  *dat;
438     size_t olen;
439     size_t len;
440 
441     if (!fd->fp_out_buffer)
442 	return 0;
443 
444     dat  = fd->fp_out_buffer->fp_out_buf_pa;
445     olen = fd->fp_out_buffer->fp_out_buf_pc - dat;
446     len  = olen;
447 
448     /* write up to buffer size bytes */
449     /* C-IO fwrite */
450     if (len) {
451 	r = fd->fp_out_callbacks->fwrite_callback
452 	    (dat, 1, len, fd->fp_out_callbacks->user_data);
453 
454 	dat += r;
455 	len -= r;
456 	fd->fp_out_buffer->fp_out_buf_start += r; /* move offset */
457 
458 	if (r < olen) {
459 	    /* Write failed, possible partial */
460 	    if (r > 0) {
461 		memmove(fd->fp_out_buffer->fp_out_buf_pa, dat, len);
462 		fd->fp_out_buffer->fp_out_buf_pc
463 		    = fd->fp_out_buffer->fp_out_buf_pa + len;
464 	    }
465 
466 	    /* Output is probably unfixable now so return error */
467 	    return -1;
468 	}
469     }
470 
471     /* reset current output */
472     fd->fp_out_buffer->fp_out_buf_pc = fd->fp_out_buffer->fp_out_buf_pa;
473 
474     return 0;
475 }
476 
477 
478 /* fwrite simulation */
cram_io_output_buffer_write(void * vptr,size_t size,size_t nmemb,cram_fd * fd)479 size_t cram_io_output_buffer_write(void *vptr, size_t size, size_t nmemb,
480 				   cram_fd *fd)
481 {
482     size_t towrite = size * nmemb; /* number of bytes still to be written */
483     size_t r = 0;                  /* number of bytes copied to ptr */
484 
485     /* number of bytes in buffer */
486     size_t outbuf = fd->fp_out_buffer->fp_out_buf_pe -
487 	            fd->fp_out_buffer->fp_out_buf_pc;
488 
489     /* number of bytes to be copied from buffer */
490     size_t tocopy = imin(towrite, outbuf);
491     size_t blockwrite = 0;
492     char *ptr = (char *)vptr;
493 
494     /* place as many bytes in out_buffer as will fit */
495     memcpy(fd->fp_out_buffer->fp_out_buf_pc, ptr, tocopy);
496     towrite -= tocopy;
497     r += tocopy;
498     ptr += tocopy;
499     fd->fp_out_buffer->fp_out_buf_pc += tocopy;
500 
501     if (towrite) /* Still some left over */
502         if (cram_io_flush_output_buffer(fd) < 0)
503 	    goto partial_write;
504 
505     /* Write any remaining whole blocks without buffer copy, C-IO fwrite */
506     while (towrite >= fd->fp_out_buffer->fp_out_buf_size) {
507 	blockwrite = fd->fp_out_callbacks->fwrite_callback
508 	    (ptr,
509 	     1,
510 	     fd->fp_out_buffer->fp_out_buf_size,
511 	     fd->fp_out_callbacks->user_data);
512 
513 	towrite -= blockwrite;
514 	ptr += blockwrite;
515         r += blockwrite;
516 	fd->fp_out_buffer->fp_out_buf_start += blockwrite;
517 
518 	if (blockwrite < fd->fp_out_buffer->fp_out_buf_size)
519 	    goto partial_write;
520     }
521 
522     /* Push any remaining bytes into the output buffer */
523     if (towrite) {
524         /* buffer should be empty */
525         assert(fd->fp_out_buffer->fp_out_buf_pc ==
526 	       fd->fp_out_buffer->fp_out_buf_pa);
527 
528 	/* buffer should be large enough */
529 	assert(towrite <= fd->fp_out_buffer->fp_out_buf_size);
530 
531 	memcpy(fd->fp_out_buffer->fp_out_buf_pc, ptr, towrite);
532         r += towrite;
533         fd->fp_out_buffer->fp_out_buf_pc += towrite;
534     }
535 
536  partial_write:
537     return size ? (r / size) : r;
538 }
539 
540 static cram_io_output_t *
cram_IO_deallocate_cram_io_output(cram_io_output_t * obj)541 cram_IO_deallocate_cram_io_output(cram_io_output_t * obj)
542 {
543     if ( obj ) {
544         free(obj);
545         obj = NULL;
546     }
547 
548     return obj;
549 }
550 
551 static cram_io_output_t *
cram_IO_allocate_cram_io_output()552 cram_IO_allocate_cram_io_output()
553 {
554     cram_io_output_t *obj
555 	= (cram_io_output_t *)malloc(sizeof(cram_io_output_t));
556 
557     if ( ! obj ) {
558         return cram_IO_deallocate_cram_io_output(obj);
559     }
560     obj->user_data = NULL;
561     obj->fwrite_callback = NULL;
562     obj->ftell_callback = NULL;
563     return obj;
564 }
565 
566 static cram_io_output_t *
cram_IO_allocate_cram_io_output_from_C_FILE(FILE * file)567 cram_IO_allocate_cram_io_output_from_C_FILE(FILE * file)
568 {
569     cram_io_output_t *obj = cram_IO_allocate_cram_io_output();
570     if ( ! obj ) {
571         return cram_IO_deallocate_cram_io_output(obj);
572     }
573     obj->user_data = file;
574     obj->fwrite_callback = cram_io_C_FILE_fwrite;
575     obj->ftell_callback  = cram_io_C_FILE_ftell;
576     return obj;
577 }
578 
579 cram_fd_output_buffer *
cram_io_deallocate_output_buffer(cram_fd_output_buffer * buffer)580 cram_io_deallocate_output_buffer(cram_fd_output_buffer * buffer)
581 {
582     if ( buffer ) {
583         if ( buffer->fp_out_buffer ) {
584             free(buffer->fp_out_buffer);
585             buffer->fp_out_buffer = NULL;
586         }
587         free(buffer);
588         buffer = NULL;
589     }
590     return buffer;
591 }
592 
593 cram_fd_output_buffer *
cram_io_allocate_output_buffer(size_t const bufsize)594 cram_io_allocate_output_buffer(size_t const bufsize)
595 {
596     cram_fd_output_buffer * buffer =
597 	(cram_fd_output_buffer *)malloc(sizeof(cram_fd_output_buffer));
598 
599     if ( ! buffer )
600         return cram_io_deallocate_output_buffer(buffer);
601 
602     // FIXME: is memset really needed here? I suspect pa/pc is sufficient.
603     memset(buffer,0,sizeof(cram_fd_output_buffer));
604 
605     buffer->fp_out_buf_size = bufsize;
606     buffer->fp_out_buffer   = (char *)malloc(buffer->fp_out_buf_size);
607 
608     if ( ! buffer->fp_out_buffer ) {
609         return cram_io_deallocate_output_buffer(buffer);
610     }
611 
612     buffer->fp_out_buf_pa   = buffer->fp_out_buffer;
613     buffer->fp_out_buf_pc   = buffer->fp_out_buffer;
614     buffer->fp_out_buf_pe   = buffer->fp_out_buffer + bufsize;
615 
616     return buffer;
617 }
618 
619 // FIXME: Currently inefficient
cram_io_output_buffer_putc(int c,cram_fd * fd)620 int cram_io_output_buffer_putc(int c, cram_fd * fd)
621 {
622     char cc = c;
623 
624     if (cram_io_output_buffer_write(&cc, 1, 1, fd) == 1)
625 	return c;
626     else
627 	return EOF;
628 }
629 
630 #endif
631 
632 #ifndef USE_INT7_ENCODING
633 /* ----------------------------------------------------------------------
634  * ITF8 encoding and decoding.
635  *
636 * Also see the itf8_get and itf8_put macros in cram_io.h
637  */
638 
639 /*
640  * LEGACY: consider using itf8_decode_crc.
641  *
642  * Reads an integer in ITF-8 encoding from 'cp' and stores it in
643  * *val.
644  *
645  * Returns the number of bytes read on success
646  *        -1 on failure
647  */
itf8_decode(cram_fd * fd,int32_t * val_p)648 int itf8_decode(cram_fd *fd, int32_t *val_p) {
649     static int nbytes[16] = {
650 	0,0,0,0, 0,0,0,0,                               // 0000xxxx - 0111xxxx
651 	1,1,1,1,                                        // 1000xxxx - 1011xxxx
652 	2,2,                                            // 1100xxxx - 1101xxxx
653 	3,                                              // 1110xxxx
654 	4,                                              // 1111xxxx
655     };
656 
657     static int nbits[16] = {
658 	0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, // 0000xxxx - 0111xxxx
659 	0x3f, 0x3f, 0x3f, 0x3f,                         // 1000xxxx - 1011xxxx
660 	0x1f, 0x1f,                                     // 1100xxxx - 1101xxxx
661 	0x0f,                                           // 1110xxxx
662 	0x0f,                                           // 1111xxxx
663     };
664 
665     int32_t val = CRAM_IO_GETC(fd);
666     if (val == -1)
667 	return -1;
668 
669     int i = nbytes[val>>4];
670     val &= nbits[val>>4];
671 
672     switch(i) {
673     case 0:
674 	*val_p = val;
675 	return 1;
676 
677     case 1:
678 	val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
679 	*val_p = val;
680 	return 2;
681 
682     case 2:
683 	val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
684 	val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
685 	*val_p = val;
686 	return 3;
687 
688     case 3:
689 	val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
690 	val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
691 	val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
692 	*val_p = val;
693 	return 4;
694 
695     case 4: // really 3.5 more, why make it different?
696 	val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
697 	val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
698 	val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
699 	val = (val<<4) | (((unsigned char)CRAM_IO_GETC(fd)) & 0x0f);
700 	*val_p = val;
701     }
702 
703     return 5;
704 }
705 
itf8_decode_crc(cram_fd * fd,int32_t * val_p,uint32_t * crc)706 int itf8_decode_crc(cram_fd *fd, int32_t *val_p, uint32_t *crc) {
707     static int nbytes[16] = {
708 	0,0,0,0, 0,0,0,0,                               // 0000xxxx - 0111xxxx
709 	1,1,1,1,                                        // 1000xxxx - 1011xxxx
710 	2,2,                                            // 1100xxxx - 1101xxxx
711 	3,                                              // 1110xxxx
712 	4,                                              // 1111xxxx
713     };
714 
715     static int nbits[16] = {
716 	0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, // 0000xxxx - 0111xxxx
717 	0x3f, 0x3f, 0x3f, 0x3f,                         // 1000xxxx - 1011xxxx
718 	0x1f, 0x1f,                                     // 1100xxxx - 1101xxxx
719 	0x0f,                                           // 1110xxxx
720 	0x0f,                                           // 1111xxxx
721     };
722     unsigned char c[5];
723 
724     int32_t val = CRAM_IO_GETC(fd);
725     if (val == -1)
726 	return -1;
727 
728     c[0]=val;
729 
730     int i = nbytes[val>>4];
731     val &= nbits[val>>4];
732 
733     switch(i) {
734     case 0:
735 	*val_p = val;
736 	*crc = iolib_crc32(*crc, c, 1);
737 	return 1;
738 
739     case 1:
740 	val = (val<<8) | (c[1]=CRAM_IO_GETC(fd));
741 	*val_p = val;
742 	*crc = iolib_crc32(*crc, c, 2);
743 	return 2;
744 
745     case 2:
746 	val = (val<<8) | (c[1]=CRAM_IO_GETC(fd));
747 	val = (val<<8) | (c[2]=CRAM_IO_GETC(fd));
748 	*val_p = val;
749 	*crc = iolib_crc32(*crc, c, 3);
750 	return 3;
751 
752     case 3:
753 	val = (val<<8) | (c[1]=CRAM_IO_GETC(fd));
754 	val = (val<<8) | (c[2]=CRAM_IO_GETC(fd));
755 	val = (val<<8) | (c[3]=CRAM_IO_GETC(fd));
756 	*val_p = val;
757 	*crc = iolib_crc32(*crc, c, 4);
758 	return 4;
759 
760     case 4: // really 3.5 more, why make it different?
761 	val = (val<<8) |   (c[1]=CRAM_IO_GETC(fd));
762 	val = (val<<8) |   (c[2]=CRAM_IO_GETC(fd));
763 	val = (val<<8) |   (c[3]=CRAM_IO_GETC(fd));
764 	val = (val<<4) | (((c[4]=CRAM_IO_GETC(fd))) & 0x0f);
765 	*val_p = val;
766 	*crc = iolib_crc32(*crc, c, 5);
767     }
768 
769     return 5;
770 }
771 
772 /*
773  * Encodes and writes a single integer in ITF-8 format.
774  * Returns 0 on success
775  *        -1 on failure
776  */
itf8_encode(cram_fd * fd,int32_t val)777 int itf8_encode(cram_fd *fd, int32_t val) {
778     char buf[5];
779     int len = itf8_put(buf, val);
780     return CRAM_IO_WRITE(buf, 1, len, fd) == len ? 0 : -1;
781 }
782 
783 const int itf8_bytes[16] = {
784     1, 1, 1, 1,  1, 1, 1, 1,
785     2, 2, 2, 2,  3, 3, 4, 5
786 };
787 
788 const int ltf8_bytes[256] = {
789     1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
790     1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
791     1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
792     1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
793 
794     1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
795     1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
796     1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
797     1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
798 
799     2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,
800     2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,
801     2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,
802     2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,
803 
804     3, 3, 3, 3,  3, 3, 3, 3,  3, 3, 3, 3,  3, 3, 3, 3,
805     3, 3, 3, 3,  3, 3, 3, 3,  3, 3, 3, 3,  3, 3, 3, 3,
806 
807     4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
808 
809     5, 5, 5, 5,  5, 5, 5, 5,  6, 6, 6, 6,  7, 7, 8, 9
810 };
811 
812 #ifndef ITF8_MACROS
813 /*
814  * As above, but decoding from memory
815  */
itf8_get(char * cp,int32_t * val_p)816 int itf8_get(char *cp, int32_t *val_p) {
817     unsigned char *up = (unsigned char *)cp;
818 
819     if (up[0] < 0x80) {
820 	*val_p =   up[0];
821 	return 1;
822     } else if (up[0] < 0xc0) {
823 	*val_p = ((up[0] <<8) |  up[1])                           & 0x3fff;
824 	return 2;
825     } else if (up[0] < 0xe0) {
826 	*val_p = ((up[0]<<16) | (up[1]<< 8) |  up[2])             & 0x1fffff;
827 	return 3;
828     } else if (up[0] < 0xf0) {
829 	*val_p = ((up[0]<<24) | (up[1]<<16) | (up[2]<<8) | up[3]) & 0x0fffffff;
830 	return 4;
831     } else {
832 	*val_p = ((up[0] & 0x0f)<<28) | (up[1]<<20) | (up[2]<<12) | (up[3]<<4) | (up[4] & 0x0f);
833 	return 5;
834     }
835 }
836 
837 /*
838  * Stores a value to memory in ITF-8 format.
839  *
840  * Returns the number of bytes required to store the number.
841  * This is a maximum of 5 bytes.
842  */
itf8_put(char * cp,int32_t val)843 int itf8_put(char *cp, int32_t val) {
844     if        (!(val & ~0x00000007f)) { // 1 byte
845 	*cp = val;
846 	return 1;
847     } else if (!(val & ~0x00003fff)) { // 2 byte
848 	*cp++ = (val >> 8 ) | 0x80;
849 	*cp   = val & 0xff;
850 	return 2;
851     } else if (!(val & ~0x01fffff)) { // 3 byte
852 	*cp++ = (val >> 16) | 0xc0;
853 	*cp++ = (val >> 8 ) & 0xff;
854 	*cp   = val & 0xff;
855 	return 3;
856     } else if (!(val & ~0x0fffffff)) { // 4 byte
857 	*cp++ = (val >> 24) | 0xe0;
858 	*cp++ = (val >> 16) & 0xff;
859 	*cp++ = (val >> 8 ) & 0xff;
860 	*cp   = val & 0xff;
861 	return 4;
862     } else {                           // 5 byte
863 	*cp++ = 0xf0 | ((val>>28) & 0xff);
864 	*cp++ = (val >> 20) & 0xff;
865 	*cp++ = (val >> 12) & 0xff;
866 	*cp++ = (val >> 4 ) & 0xff;
867 	*cp = val & 0x0f;
868 	return 5;
869     }
870 }
871 #endif
872 
873 /* 64-bit itf8 variant */
ltf8_put(char * cp,int64_t val)874 int ltf8_put(char *cp, int64_t val) {
875     if        (!(val & ~((1LL<<7)-1))) {
876 	*cp = val;
877 	return 1;
878     } else if (!(val & ~((1LL<<(6+8))-1))) {
879 	*cp++ = (val >> 8 ) | 0x80;
880 	*cp   = val & 0xff;
881 	return 2;
882     } else if (!(val & ~((1LL<<(5+2*8))-1))) {
883 	*cp++ = (val >> 16) | 0xc0;
884 	*cp++ = (val >> 8 ) & 0xff;
885 	*cp   = val & 0xff;
886 	return 3;
887     } else if (!(val & ~((1LL<<(4+3*8))-1))) {
888 	*cp++ = (val >> 24) | 0xe0;
889 	*cp++ = (val >> 16) & 0xff;
890 	*cp++ = (val >> 8 ) & 0xff;
891 	*cp   = val & 0xff;
892 	return 4;
893     } else if (!(val & ~((1LL<<(3+4*8))-1))) {
894 	*cp++ = (val >> 32) | 0xf0;
895 	*cp++ = (val >> 24) & 0xff;
896 	*cp++ = (val >> 16) & 0xff;
897 	*cp++ = (val >> 8 ) & 0xff;
898 	*cp   = val & 0xff;
899 	return 5;
900     } else if (!(val & ~((1LL<<(2+5*8))-1))) {
901 	*cp++ = (val >> 40) | 0xf8;
902 	*cp++ = (val >> 32) & 0xff;
903 	*cp++ = (val >> 24) & 0xff;
904 	*cp++ = (val >> 16) & 0xff;
905 	*cp++ = (val >> 8 ) & 0xff;
906 	*cp   = val & 0xff;
907 	return 6;
908     } else if (!(val & ~((1LL<<(1+6*8))-1))) {
909 	*cp++ = (val >> 48) | 0xfc;
910 	*cp++ = (val >> 40) & 0xff;
911 	*cp++ = (val >> 32) & 0xff;
912 	*cp++ = (val >> 24) & 0xff;
913 	*cp++ = (val >> 16) & 0xff;
914 	*cp++ = (val >> 8 ) & 0xff;
915 	*cp   = val & 0xff;
916 	return 7;
917     } else if (!(val & ~((1LL<<(7*8))-1))) {
918 	*cp++ = (val >> 56) | 0xfe;
919 	*cp++ = (val >> 48) & 0xff;
920 	*cp++ = (val >> 40) & 0xff;
921 	*cp++ = (val >> 32) & 0xff;
922 	*cp++ = (val >> 24) & 0xff;
923 	*cp++ = (val >> 16) & 0xff;
924 	*cp++ = (val >> 8 ) & 0xff;
925 	*cp   = val & 0xff;
926 	return 8;
927     } else {
928 	*cp++ = 0xff;
929 	*cp++ = (val >> 56) & 0xff;
930 	*cp++ = (val >> 48) & 0xff;
931 	*cp++ = (val >> 40) & 0xff;
932 	*cp++ = (val >> 32) & 0xff;
933 	*cp++ = (val >> 24) & 0xff;
934 	*cp++ = (val >> 16) & 0xff;
935 	*cp++ = (val >> 8 ) & 0xff;
936 	*cp   = val & 0xff;
937 	return 9;
938     }
939 }
940 
ltf8_get(char * cp,int64_t * val_p)941 int ltf8_get(char *cp, int64_t *val_p) {
942     unsigned char *up = (unsigned char *)cp;
943 
944     if (up[0] < 0x80) {
945 	*val_p =   up[0];
946 	return 1;
947     } else if (up[0] < 0xc0) {
948 	*val_p = (((uint64_t)up[0]<< 8) |
949 		   (uint64_t)up[1]) & (((1LL<<(6+8)))-1);
950 	return 2;
951     } else if (up[0] < 0xe0) {
952 	*val_p = (((uint64_t)up[0]<<16) |
953 		  ((uint64_t)up[1]<< 8) |
954 		   (uint64_t)up[2]) & ((1LL<<(5+2*8))-1);
955 	return 3;
956     } else if (up[0] < 0xf0) {
957 	*val_p = (((uint64_t)up[0]<<24) |
958 		  ((uint64_t)up[1]<<16) |
959 		  ((uint64_t)up[2]<< 8) |
960 		   (uint64_t)up[3]) & ((1LL<<(4+3*8))-1);
961 	return 4;
962     } else if (up[0] < 0xf8) {
963 	*val_p = (((uint64_t)up[0]<<32) |
964 		  ((uint64_t)up[1]<<24) |
965 		  ((uint64_t)up[2]<<16) |
966 		  ((uint64_t)up[3]<< 8) |
967 		   (uint64_t)up[4]) & ((1LL<<(3+4*8))-1);
968 	return 5;
969     } else if (up[0] < 0xfc) {
970 	*val_p = (((uint64_t)up[0]<<40) |
971 		  ((uint64_t)up[1]<<32) |
972 		  ((uint64_t)up[2]<<24) |
973 		  ((uint64_t)up[3]<<16) |
974 		  ((uint64_t)up[4]<< 8) |
975 		   (uint64_t)up[5]) & ((1LL<<(2+5*8))-1);
976 	return 6;
977     } else if (up[0] < 0xfe) {
978 	*val_p = (((uint64_t)up[0]<<48) |
979 		  ((uint64_t)up[1]<<40) |
980 		  ((uint64_t)up[2]<<32) |
981 		  ((uint64_t)up[3]<<24) |
982 		  ((uint64_t)up[4]<<16) |
983 		  ((uint64_t)up[5]<< 8) |
984 		   (uint64_t)up[6]) & ((1LL<<(1+6*8))-1);
985 	return 7;
986     } else if (up[0] < 0xff) {
987 	*val_p = (((uint64_t)up[1]<<48) |
988 		  ((uint64_t)up[2]<<40) |
989 		  ((uint64_t)up[3]<<32) |
990 		  ((uint64_t)up[4]<<24) |
991 		  ((uint64_t)up[5]<<16) |
992 		  ((uint64_t)up[6]<< 8) |
993 		   (uint64_t)up[7]) & ((1LL<<(7*8))-1);
994 	return 8;
995     } else {
996 	*val_p = (((uint64_t)up[1]<<56) |
997 		  ((uint64_t)up[2]<<48) |
998 		  ((uint64_t)up[3]<<40) |
999 		  ((uint64_t)up[4]<<32) |
1000 		  ((uint64_t)up[5]<<24) |
1001 		  ((uint64_t)up[6]<<16) |
1002 		  ((uint64_t)up[7]<< 8) |
1003 		   (uint64_t)up[8]);
1004 	return 9;
1005     }
1006 }
1007 
1008 /*
1009  * LEGACY: consider using ltf8_decode_crc.
1010  */
ltf8_decode(cram_fd * fd,int64_t * val_p)1011 int ltf8_decode(cram_fd *fd, int64_t *val_p) {
1012     int c = CRAM_IO_GETC(fd);
1013     int64_t val = (unsigned char)c;
1014     if (c == -1)
1015 	return -1;
1016 
1017     if (val < 0x80) {
1018 	*val_p =   val;
1019 	return 1;
1020 
1021     } else if (val < 0xc0) {
1022 	val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1023 	*val_p = val & (((1LL<<(6+8)))-1);
1024 	return 2;
1025 
1026     } else if (val < 0xe0) {
1027 	val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1028 	val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1029 	*val_p = val & ((1LL<<(5+2*8))-1);
1030 	return 3;
1031 
1032     } else if (val < 0xf0) {
1033 	val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1034 	val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1035 	val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1036 	*val_p = val & ((1LL<<(4+3*8))-1);
1037 	return 4;
1038 
1039     } else if (val < 0xf8) {
1040 	val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1041 	val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1042 	val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1043 	val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1044 	*val_p = val & ((1LL<<(3+4*8))-1);
1045 	return 5;
1046 
1047     } else if (val < 0xfc) {
1048 	val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1049 	val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1050 	val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1051 	val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1052 	val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1053 	*val_p = val & ((1LL<<(2+5*8))-1);
1054 	return 6;
1055 
1056     } else if (val < 0xfe) {
1057 	val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1058 	val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1059 	val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1060 	val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1061 	val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1062 	val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1063 	*val_p = val & ((1LL<<(1+6*8))-1);
1064 	return 7;
1065 
1066     } else if (val < 0xff) {
1067 	val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1068 	val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1069 	val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1070 	val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1071 	val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1072 	val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1073 	val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1074 	*val_p = val & ((1LL<<(7*8))-1);
1075 	return 8;
1076 
1077     } else {
1078 	val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1079 	val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1080 	val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1081 	val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1082 	val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1083 	val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1084 	val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1085 	val = (val<<8) | (unsigned char)CRAM_IO_GETC(fd);
1086 	*val_p = val;
1087     }
1088 
1089     return 9;
1090 }
1091 
ltf8_decode_crc(cram_fd * fd,int64_t * val_p,uint32_t * crc)1092 int ltf8_decode_crc(cram_fd *fd, int64_t *val_p, uint32_t *crc) {
1093     unsigned char c[9];
1094     int64_t val = (unsigned char)CRAM_IO_GETC(fd);
1095     if (val == -1)
1096 	return -1;
1097 
1098     c[0] = val;
1099 
1100     if (val < 0x80) {
1101 	*val_p =   val;
1102 	*crc = iolib_crc32(*crc, c, 1);
1103 	return 1;
1104 
1105     } else if (val < 0xc0) {
1106 	val = (val<<8) | (c[1]=CRAM_IO_GETC(fd));;
1107 	*val_p = val & (((1LL<<(6+8)))-1);
1108 	*crc = iolib_crc32(*crc, c, 2);
1109 	return 2;
1110 
1111     } else if (val < 0xe0) {
1112 	val = (val<<8) | (c[1]=CRAM_IO_GETC(fd));;
1113 	val = (val<<8) | (c[2]=CRAM_IO_GETC(fd));;
1114 	*val_p = val & ((1LL<<(5+2*8))-1);
1115 	*crc = iolib_crc32(*crc, c, 3);
1116 	return 3;
1117 
1118     } else if (val < 0xf0) {
1119 	val = (val<<8) | (c[1]=CRAM_IO_GETC(fd));;
1120 	val = (val<<8) | (c[2]=CRAM_IO_GETC(fd));;
1121 	val = (val<<8) | (c[3]=CRAM_IO_GETC(fd));;
1122 	*val_p = val & ((1LL<<(4+3*8))-1);
1123 	*crc = iolib_crc32(*crc, c, 4);
1124 	return 4;
1125 
1126     } else if (val < 0xf8) {
1127 	val = (val<<8) | (c[1]=CRAM_IO_GETC(fd));;
1128 	val = (val<<8) | (c[2]=CRAM_IO_GETC(fd));;
1129 	val = (val<<8) | (c[3]=CRAM_IO_GETC(fd));;
1130 	val = (val<<8) | (c[4]=CRAM_IO_GETC(fd));;
1131 	*val_p = val & ((1LL<<(3+4*8))-1);
1132 	*crc = iolib_crc32(*crc, c, 5);
1133 	return 5;
1134 
1135     } else if (val < 0xfc) {
1136 	val = (val<<8) | (c[1]=CRAM_IO_GETC(fd));;
1137 	val = (val<<8) | (c[2]=CRAM_IO_GETC(fd));;
1138 	val = (val<<8) | (c[3]=CRAM_IO_GETC(fd));;
1139 	val = (val<<8) | (c[4]=CRAM_IO_GETC(fd));;
1140 	val = (val<<8) | (c[5]=CRAM_IO_GETC(fd));;
1141 	*val_p = val & ((1LL<<(2+5*8))-1);
1142 	*crc = iolib_crc32(*crc, c, 6);
1143 	return 6;
1144 
1145     } else if (val < 0xfe) {
1146 	val = (val<<8) | (c[1]=CRAM_IO_GETC(fd));;
1147 	val = (val<<8) | (c[2]=CRAM_IO_GETC(fd));;
1148 	val = (val<<8) | (c[3]=CRAM_IO_GETC(fd));;
1149 	val = (val<<8) | (c[4]=CRAM_IO_GETC(fd));;
1150 	val = (val<<8) | (c[5]=CRAM_IO_GETC(fd));;
1151 	val = (val<<8) | (c[6]=CRAM_IO_GETC(fd));;
1152 	*val_p = val & ((1LL<<(1+6*8))-1);
1153 	*crc = iolib_crc32(*crc, c, 7);
1154 	return 7;
1155 
1156     } else if (val < 0xff) {
1157 	val = (val<<8) | (c[1]=CRAM_IO_GETC(fd));;
1158 	val = (val<<8) | (c[2]=CRAM_IO_GETC(fd));;
1159 	val = (val<<8) | (c[3]=CRAM_IO_GETC(fd));;
1160 	val = (val<<8) | (c[4]=CRAM_IO_GETC(fd));;
1161 	val = (val<<8) | (c[5]=CRAM_IO_GETC(fd));;
1162 	val = (val<<8) | (c[6]=CRAM_IO_GETC(fd));;
1163 	val = (val<<8) | (c[7]=CRAM_IO_GETC(fd));;
1164 	*val_p = val & ((1LL<<(7*8))-1);
1165 	*crc = iolib_crc32(*crc, c, 8);
1166 	return 8;
1167 
1168     } else {
1169 	val = (val<<8) | (c[1]=CRAM_IO_GETC(fd));;
1170 	val = (val<<8) | (c[2]=CRAM_IO_GETC(fd));;
1171 	val = (val<<8) | (c[3]=CRAM_IO_GETC(fd));;
1172 	val = (val<<8) | (c[4]=CRAM_IO_GETC(fd));;
1173 	val = (val<<8) | (c[5]=CRAM_IO_GETC(fd));;
1174 	val = (val<<8) | (c[6]=CRAM_IO_GETC(fd));;
1175 	val = (val<<8) | (c[7]=CRAM_IO_GETC(fd));;
1176 	val = (val<<8) | (c[8]=CRAM_IO_GETC(fd));;
1177 	*crc = iolib_crc32(*crc, c, 9);
1178 	*val_p = val;
1179     }
1180 
1181     return 9;
1182 }
1183 
1184 /*
1185  * Pushes a value in ITF8 format onto the end of a block.
1186  * This shouldn't be used for high-volume data as it is not the fastest
1187  * method.
1188  *
1189  * Returns the number of bytes written
1190  */
itf8_put_blk(cram_block * blk,int32_t val)1191 int itf8_put_blk(cram_block *blk, int32_t val) {
1192     char buf[5];
1193     int sz;
1194 
1195     sz = itf8_put(buf, val);
1196     BLOCK_APPEND(blk, buf, sz);
1197     return sz;
1198 }
1199 
ltf8_put_blk(cram_block * blk,int64_t val)1200 int ltf8_put_blk(cram_block *blk, int64_t val) {
1201     char buf[9];
1202     int sz;
1203 
1204     sz = ltf8_put(buf, val);
1205     BLOCK_APPEND(blk, buf, sz);
1206     return sz;
1207 }
1208 
1209 #else
1210 
uint7_encode(cram_fd * fd,int64_t val)1211 int uint7_encode(cram_fd *fd, int64_t val) {
1212     char buf[10];
1213     int len = uint7_put(buf, NULL, val);
1214     return CRAM_IO_WRITE(buf, 1, len, fd) == len ? 0 : -1;
1215 }
1216 
uint7_put_blk(cram_block * blk,uint64_t v)1217 int uint7_put_blk(cram_block *blk, uint64_t v) {
1218     uint8_t buf[10];
1219     int sz = uint7_put(buf, buf+10, v);
1220     BLOCK_APPEND(blk, buf, sz);
1221     return sz;
1222 }
1223 
sint7_put_blk(cram_block * blk,int64_t v)1224 int sint7_put_blk(cram_block *blk, int64_t v) {
1225     uint8_t buf[10];
1226     int sz = sint7_put(buf, buf+10, v);
1227     BLOCK_APPEND(blk, buf, sz);
1228     return sz;
1229 }
1230 
uint7_decode_crc32(cram_fd * fd,int32_t * val_p,uint32_t * crc)1231 int uint7_decode_crc32(cram_fd *fd, int32_t *val_p, uint32_t *crc) {
1232     uint8_t b[5], i = 0;
1233     int c;
1234     uint32_t v = 0;
1235 
1236     do {
1237 	b[i++] = c = CRAM_IO_GETC(fd);
1238 	if (c < 0)
1239 	    return -1;
1240 	v = (v<<7) | (c & 0x7f);
1241     } while (i < 5 && (c & 0x80));
1242     *crc = iolib_crc32(*crc, b, i);
1243 
1244     *val_p = v;
1245     return i;
1246 }
1247 
uint7_decode_crc(cram_fd * fd,int64_t * val_p,uint32_t * crc)1248 int uint7_decode_crc(cram_fd *fd, int64_t *val_p, uint32_t *crc) {
1249     uint8_t b[10], i = 0;
1250     int c;
1251     uint64_t v = 0;
1252 
1253     do {
1254 	b[i++] = c = CRAM_IO_GETC(fd);
1255 	if (c < 0)
1256 	    return -1;
1257 	v = (v<<7) | (c & 0x7f);
1258     } while (i < 10 && (c & 0x80));
1259     *crc = iolib_crc32(*crc, b, i);
1260 
1261     *val_p = v;
1262     return i;
1263 }
1264 #endif
1265 
1266 /*
1267  * Decodes a 32-bit little endian value from fd and stores in val.
1268  *
1269  * Returns the number of bytes read on success
1270  *         -1 on failure
1271  */
int32_decode(cram_fd * fd,int32_t * val)1272 int int32_decode(cram_fd *fd, int32_t *val) {
1273     int32_t i;
1274     if (1 != CRAM_IO_READ(&i, 4, 1, fd))
1275 	return -1;
1276 
1277     *val = le_int4(i);
1278     return 4;
1279 }
1280 
1281 /*
1282  * Encodes a 32-bit little endian value 'val' and writes to fd.
1283  *
1284  * Returns the number of bytes written on success
1285  *         -1 on failure
1286  */
int32_encode(cram_fd * fd,int32_t val)1287 int int32_encode(cram_fd *fd, int32_t val) {
1288     val = le_int4(val);
1289     if (1 != CRAM_IO_WRITE(&val, 4, 1, fd))
1290 	return -1;
1291 
1292     return 4;
1293 }
1294 
1295 /* As int32_decoded/encode, but from/to blocks instead of cram_fd */
int32_get_blk(cram_block * b,int32_t * val)1296 int int32_get_blk(cram_block *b, int32_t *val) {
1297     if (b->uncomp_size - BLOCK_SIZE(b) < 4)
1298 	return -1;
1299 
1300     *val =
1301 	 b->data[b->byte  ]        |
1302 	(b->data[b->byte+1] <<  8) |
1303 	(b->data[b->byte+2] << 16) |
1304 	(b->data[b->byte+3] << 24);
1305     BLOCK_SIZE(b) += 4;
1306     return 4;
1307 }
1308 
1309 /* As int32_decoded/encode, but from/to blocks instead of cram_fd */
int32_put(cram_block * b,int32_t val)1310 int int32_put(cram_block *b, int32_t val) {
1311     unsigned char cp[4];
1312     cp[0] = ( val      & 0xff);
1313     cp[1] = ((val>>8)  & 0xff);
1314     cp[2] = ((val>>16) & 0xff);
1315     cp[3] = ((val>>24) & 0xff);
1316 
1317     BLOCK_APPEND(b, cp, 4);
1318     return b->data ? 0 : -1;
1319 }
1320 
1321 /* ----------------------------------------------------------------------
1322  * zlib compression code - from Gap5's tg_iface_g.c
1323  * They're static here as they're only used within the cram_compress_block
1324  * and cram_uncompress_block functions, which are the external interface.
1325  */
zlib_mem_inflate(char * cdata,size_t csize,size_t * size)1326 char *zlib_mem_inflate(char *cdata, size_t csize, size_t *size) {
1327     z_stream s;
1328     unsigned char *data = NULL; /* Uncompressed output */
1329     int data_alloc = 0;
1330     int err;
1331 
1332     /* Starting point at uncompressed size, and scale after that */
1333     data = malloc(data_alloc = csize*1.2+100);
1334     if (!data)
1335 	return NULL;
1336 
1337     /* Initialise zlib stream */
1338     s.zalloc = Z_NULL; /* use default allocation functions */
1339     s.zfree  = Z_NULL;
1340     s.opaque = Z_NULL;
1341     s.next_in  = (unsigned char *)cdata;
1342     s.avail_in = csize;
1343     s.total_in = 0;
1344     s.next_out  = data;
1345     s.avail_out = data_alloc;
1346     s.total_out = 0;
1347 
1348     //err = inflateInit(&s);
1349     err = inflateInit2(&s, 15 + 32);
1350     if (err != Z_OK) {
1351 	fprintf(stderr, "zlib inflateInit error: %s\n", s.msg);
1352 	free(data);
1353 	return NULL;
1354     }
1355 
1356     /* Decode to 'data' array */
1357     for (;s.avail_in;) {
1358 	unsigned char *data_tmp;
1359 	int alloc_inc;
1360 
1361 	s.next_out = &data[s.total_out];
1362 	err = inflate(&s, Z_NO_FLUSH);
1363 	if (err == Z_STREAM_END)
1364 	    break;
1365 
1366 	if (err != Z_OK) {
1367 	    fprintf(stderr, "zlib inflate error: %s\n", s.msg);
1368 	    if (data)
1369 		free(data);
1370 	    return NULL;
1371 	}
1372 
1373 	/* More to come, so realloc based on growth so far */
1374 	alloc_inc = (double)s.avail_in/s.total_in * s.total_out + 100;
1375 	data = realloc((data_tmp = data), data_alloc += alloc_inc);
1376 	if (!data) {
1377 	    free(data_tmp);
1378 	    return NULL;
1379 	}
1380 	s.avail_out += alloc_inc;
1381     }
1382     inflateEnd(&s);
1383 
1384     *size = s.total_out;
1385     return (char *)data;
1386 }
1387 
zlib_mem_deflate(char * data,size_t size,size_t * cdata_size,int level,int strat)1388 static char *zlib_mem_deflate(char *data, size_t size, size_t *cdata_size,
1389 			      int level, int strat) {
1390     z_stream s;
1391     unsigned char *cdata = NULL; /* Compressed output */
1392     int cdata_alloc = 0;
1393     int cdata_pos = 0;
1394     int err;
1395 
1396     cdata = malloc(cdata_alloc = size*1.05+100);
1397     if (!cdata)
1398 	return NULL;
1399     cdata_pos = 0;
1400 
1401     /* Initialise zlib stream */
1402     s.zalloc = Z_NULL; /* use default allocation functions */
1403     s.zfree  = Z_NULL;
1404     s.opaque = Z_NULL;
1405     s.next_in  = (unsigned char *)data;
1406     s.avail_in = size;
1407     s.total_in = 0;
1408     s.next_out  = cdata;
1409     s.avail_out = cdata_alloc;
1410     s.total_out = 0;
1411     s.data_type = Z_BINARY;
1412 
1413     err = deflateInit2(&s, level, Z_DEFLATED, 15|16, 9, strat);
1414     if (err != Z_OK) {
1415 	fprintf(stderr, "zlib deflateInit2 error: %s\n", s.msg);
1416 	return NULL;
1417     }
1418 
1419     /* Encode to 'cdata' array */
1420     for (;s.avail_in;) {
1421 	s.next_out = &cdata[cdata_pos];
1422 	s.avail_out = cdata_alloc - cdata_pos;
1423 	if (cdata_alloc - cdata_pos <= 0) {
1424 	    fprintf(stderr, "Deflate produced larger output than expected. Abort\n");
1425 	    return NULL;
1426 	}
1427 	err = deflate(&s, Z_NO_FLUSH);
1428 	cdata_pos = cdata_alloc - s.avail_out;
1429 	if (err != Z_OK) {
1430 	    fprintf(stderr, "zlib deflate error: %s\n", s.msg);
1431 	    break;
1432 	}
1433     }
1434     if (deflate(&s, Z_FINISH) != Z_STREAM_END) {
1435 	fprintf(stderr, "zlib deflate error: %s\n", s.msg);
1436     }
1437     *cdata_size = s.total_out;
1438 
1439     if (deflateEnd(&s) != Z_OK) {
1440 	fprintf(stderr, "zlib deflate error: %s\n", s.msg);
1441     }
1442     return (char *)cdata;
1443 }
1444 
1445 #ifdef HAVE_LIBLZMA
1446 /* ------------------------------------------------------------------------ */
1447 /*
1448  * Data compression routines using liblzma (xz)
1449  *
1450  * On a test set this shrunk the main db from 136157104 bytes to 114796168, but
1451  * caused tg_index to grow from 2m43.707s to 15m3.961s. Exporting as bfastq
1452  * went from 18.3s to 36.3s. So decompression suffers too, but not as bad
1453  * as compression times.
1454  *
1455  * For now we disable this functionality. If it's to be reenabled make sure you
1456  * improve the mem_inflate implementation as it's just a test hack at the
1457  * moment.
1458  */
1459 
lzma_mem_deflate(char * data,size_t size,size_t * cdata_size,int level)1460 static char *lzma_mem_deflate(char *data, size_t size, size_t *cdata_size,
1461 			      int level) {
1462     char *out;
1463     size_t out_size = lzma_stream_buffer_bound(size);
1464     *cdata_size = 0;
1465 
1466     out = malloc(out_size);
1467 
1468     /* Single call compression */
1469     if (LZMA_OK != lzma_easy_buffer_encode(level, LZMA_CHECK_CRC32, NULL,
1470 					   (uint8_t *)data, size,
1471 					   (uint8_t *)out, cdata_size,
1472 					   out_size))
1473     	return NULL;
1474 
1475     return out;
1476 }
1477 
lzma_mem_inflate(char * cdata,size_t csize,size_t * size)1478 static char *lzma_mem_inflate(char *cdata, size_t csize, size_t *size) {
1479     lzma_stream strm = LZMA_STREAM_INIT;
1480     size_t out_size = 0, out_pos = 0;
1481     char *out = NULL;
1482     int r;
1483 
1484     /* Initiate the decoder */
1485     if (LZMA_OK != lzma_stream_decoder(&strm, lzma_easy_decoder_memusage(9), 0))
1486 	return NULL;
1487 
1488     /* Decode loop */
1489     strm.avail_in = csize;
1490     strm.next_in = (uint8_t *)cdata;
1491 
1492     for (;strm.avail_in;) {
1493 	if (strm.avail_in > out_size - out_pos) {
1494 	    out_size += strm.avail_in * 4 + 32768;
1495 	    out = realloc(out, out_size);
1496 	}
1497 	strm.avail_out = out_size - out_pos;
1498 	strm.next_out = (uint8_t *)&out[out_pos];
1499 
1500 	r = lzma_code(&strm, LZMA_RUN);
1501 	if (LZMA_OK != r && LZMA_STREAM_END != r) {
1502 	    fprintf(stderr, "r=%d\n", r);
1503 	    fprintf(stderr, "mem=%"PRId64"\n", (int64_t)lzma_memusage(&strm));
1504 	    return NULL;
1505 	}
1506 
1507 	out_pos = strm.total_out;
1508 
1509 	if (r == LZMA_STREAM_END)
1510 	    break;
1511     }
1512 
1513     /* finish up any unflushed data; necessary? */
1514     r = lzma_code(&strm, LZMA_FINISH);
1515     if (r != LZMA_OK && r != LZMA_STREAM_END) {
1516 	fprintf(stderr, "r=%d\n", r);
1517 	return NULL;
1518     }
1519 
1520     out = realloc(out, strm.total_out);
1521     *size = strm.total_out;
1522 
1523     lzma_end(&strm);
1524 
1525     return out;
1526 }
1527 #endif
1528 
1529 /* ----------------------------------------------------------------------
1530  * CRAM blocks - the dynamically growable data block. We have code to
1531  * create, update, (un)compress and read/write.
1532  *
1533  * These are derived from the deflate_interlaced.c blocks, but with the
1534  * CRAM extension of content types and IDs.
1535  */
1536 
1537 /*
1538  * Allocates a new cram_block structure with a specified content_type and
1539  * id.
1540  *
1541  * Returns block pointer on success
1542  *         NULL on failure
1543  */
cram_new_block(enum cram_content_type content_type,int content_id)1544 cram_block *cram_new_block(enum cram_content_type content_type,
1545 			   int content_id) {
1546     cram_block *b = malloc(sizeof(*b));
1547     if (!b)
1548 	return NULL;
1549     b->method = b->orig_method = RAW;
1550     b->content_type = content_type;
1551     b->content_id = content_id;
1552     b->comp_size = 0;
1553     b->uncomp_size = 0;
1554     b->data = NULL;
1555     b->alloc = 0;
1556     b->byte = 0;
1557     b->bit = 7; // MSB
1558     b->crc32 = 0;
1559 
1560     return b;
1561 }
1562 
1563 /*
1564  * Reads a block from a cram file.
1565  * Returns cram_block pointer on success.
1566  *         NULL on failure
1567  */
cram_read_block(cram_fd * fd)1568 cram_block *cram_read_block(cram_fd *fd) {
1569     cram_block *b = malloc(sizeof(*b));
1570     unsigned char c;
1571     uint32_t crc = 0;
1572     if (!b)
1573 	return NULL;
1574 
1575     //fprintf(stderr, "Block at %d\n", (int)ftell(fd->fp));
1576 
1577     if (-1 == (b->method       = (c=CRAM_IO_GETC(fd)))) { free(b); return NULL; }
1578     crc = iolib_crc32(crc, &c, 1);
1579     if (-1 == (b->content_type = (c=CRAM_IO_GETC(fd)))) { free(b); return NULL; }
1580     crc = iolib_crc32(crc, &c, 1);
1581     if (-1 == itf8_decode_crc(fd, &b->content_id, &crc))  { free(b); return NULL; }
1582     if (-1 == itf8_decode_crc(fd, &b->comp_size, &crc))   { free(b); return NULL; }
1583     if (-1 == itf8_decode_crc(fd, &b->uncomp_size, &crc)) { free(b); return NULL; }
1584 
1585     //    fprintf(stderr, "  method %d, ctype %d, cid %d, csize %d, ucsize %d\n",
1586     //	    b->method, b->content_type, b->content_id, b->comp_size, b->uncomp_size);
1587 
1588     if (b->method == RAW) {
1589 	b->alloc = b->uncomp_size;
1590 	if (!(b->data = malloc(b->uncomp_size))){ free(b); return NULL; }
1591 	if (b->uncomp_size != CRAM_IO_READ(b->data, 1, b->uncomp_size, fd)) {
1592 	    free(b->data);
1593 	    free(b);
1594 	    return NULL;
1595 	}
1596     } else {
1597 	b->alloc = b->comp_size;
1598 	if (!(b->data = malloc(b->comp_size)))  { free(b); return NULL; }
1599 	if (b->comp_size != CRAM_IO_READ(b->data, 1, b->comp_size, fd)) {
1600 	    free(b->data);
1601 	    free(b);
1602 	    return NULL;
1603 	}
1604     }
1605 
1606     if (IS_CRAM_3_VERS(fd)) {
1607 	if (-1 == int32_decode(fd, (int32_t *)&b->crc32)) {
1608 	    free(b);
1609 	    return NULL;
1610 	}
1611 
1612 	// Check later, if and only if we do decompression of this block
1613 	b->crc32_checked = fd->ignore_md5;
1614 	b->crc_part = crc;
1615     } else {
1616 	b->crc32_checked = 1; // CRC not present
1617     }
1618 
1619     b->orig_method = b->method;
1620     b->idx = 0;
1621     b->byte = 0;
1622     b->bit = 7; // MSB
1623 
1624     return b;
1625 }
1626 
1627 /*
1628  * Writes a CRAM block.
1629  * Returns 0 on success
1630  *        -1 on failure
1631  */
cram_write_block(cram_fd * fd,cram_block * b)1632 int cram_write_block(cram_fd *fd, cram_block *b) {
1633     assert(b->method != RAW || (b->comp_size == b->uncomp_size));
1634 
1635     if (CRAM_IO_PUTC(b->method,       fd)   == EOF) return -1;
1636     if (CRAM_IO_PUTC(b->content_type, fd)   == EOF) return -1;
1637     if (itf8_encode(fd, b->content_id)  ==  -1) return -1;
1638     if (itf8_encode(fd, b->comp_size)   ==  -1) return -1;
1639     if (itf8_encode(fd, b->uncomp_size) ==  -1) return -1;
1640 
1641     if (b->method == RAW) {
1642 	if (b->uncomp_size != CRAM_IO_WRITE(b->data, 1, b->uncomp_size, fd))
1643 	    return -1;
1644     } else {
1645 	if (b->comp_size != CRAM_IO_WRITE(b->data, 1, b->comp_size, fd))
1646 	    return -1;
1647     }
1648 
1649     if (IS_CRAM_3_VERS(fd)) {
1650 	unsigned char dat[100], *cp = dat;;
1651 	uint32_t crc;
1652 
1653 	*cp++ = b->method;
1654 	*cp++ = b->content_type;
1655 	cp += itf8_put(cp, b->content_id);
1656 	cp += itf8_put(cp, b->comp_size);
1657 	cp += itf8_put(cp, b->uncomp_size);
1658 	crc = iolib_crc32(0L, dat, cp-dat);
1659 
1660 	if (!b->crc32) {
1661 	    if (b->method == RAW) {
1662 		b->crc32 = iolib_crc32(crc, b->data ? b->data : (uc*)"", b->uncomp_size);
1663 	    } else {
1664 		b->crc32 = iolib_crc32(crc, b->data ? b->data : (uc*)"", b->comp_size);
1665 	    }
1666 	}
1667 
1668 	if (-1 == int32_encode(fd, b->crc32))
1669 	    return -1;
1670     }
1671 
1672     return 0;
1673 }
1674 
1675 /*
1676  * Frees a CRAM block, deallocating internal data too.
1677  */
cram_free_block(cram_block * b)1678 void cram_free_block(cram_block *b) {
1679     if (!b)
1680 	return;
1681     if (b->data)
1682 	free(b->data);
1683     free(b);
1684 }
1685 
1686 #ifdef HAVE_LIBBSC
1687 #define BSC_FEATURES LIBBSC_FEATURE_FASTMODE
1688 pthread_once_t bsc_once = PTHREAD_ONCE_INIT;
bsc_init_once(void)1689 static void bsc_init_once(void) {
1690     bsc_init(BSC_FEATURES);
1691 }
1692 #endif
1693 
1694 /*
1695  * Uncompresses a CRAM block, if compressed.
1696  */
cram_uncompress_block(cram_block * b)1697 int cram_uncompress_block(cram_block *b) {
1698     char *uncomp;
1699     size_t uncomp_size = 0;
1700 
1701     if (b->crc32_checked == 0) {
1702 	uint32_t crc = iolib_crc32(b->crc_part, b->data ? b->data : (uc *)"", b->alloc);
1703 	b->crc32_checked = 1;
1704 	if (crc != b->crc32) {
1705 	    fprintf(stderr, "Block CRC32 failure\n");
1706 	    return -1;
1707 	}
1708     }
1709 
1710     if (b->uncomp_size == 0) {
1711 	// blank block
1712 	b->method = RAW;
1713 	return 0;
1714     }
1715 
1716     switch (b->method) {
1717     case RAW:
1718 	//b->uncomp_size = b->comp_size;
1719 	return 0;
1720 
1721     case GZIP:
1722 	uncomp = zlib_mem_inflate((char *)b->data, b->comp_size, &uncomp_size);
1723 	if (!uncomp)
1724 	    return -1;
1725 	if ((int)uncomp_size != b->uncomp_size) {
1726 	    free(uncomp);
1727 	    return -1;
1728 	}
1729 	free(b->data);
1730 	b->data = (unsigned char *)uncomp;
1731 	b->alloc = uncomp_size;
1732 	b->method = RAW;
1733 	break;
1734 
1735 #ifdef HAVE_LIBBZ2
1736     case BZIP2: {
1737 	unsigned int usize = b->uncomp_size;
1738 	if (!(uncomp = malloc(usize)))
1739 	    return -1;
1740 	if (BZ_OK != BZ2_bzBuffToBuffDecompress(uncomp, &usize,
1741 						(char *)b->data, b->comp_size,
1742 						0, 0)) {
1743 	    free(uncomp);
1744 	    return -1;
1745 	}
1746 	free(b->data);
1747 	b->data = (unsigned char *)uncomp;
1748 	b->alloc = usize;
1749 	b->method = RAW;
1750 	b->uncomp_size = usize; // Just incase it differs
1751 	break;
1752     }
1753 #else
1754     case BZIP2:
1755 	fprintf(stderr, "Bzip2 compression is not compiled into this "
1756 		"version.\nPlease rebuild and try again.\n");
1757 	return -1;
1758 #endif
1759 
1760 #ifdef HAVE_LIBBSC
1761     case BSC: {
1762 	int block_size, data_size;
1763 
1764 	pthread_once(&bsc_once, bsc_init_once);
1765 
1766 	if (bsc_block_info(b->data, LIBBSC_HEADER_SIZE,
1767 			   &block_size, &data_size,
1768 			   BSC_FEATURES) != LIBBSC_NO_ERROR)
1769 	    return -1;
1770 
1771 	uncomp_size = data_size;
1772 	if (!(uncomp = malloc(data_size)))
1773 	    return -1;
1774 
1775 	assert(block_size == b->comp_size);
1776 	if (bsc_decompress(b->data, block_size,
1777 			   (unsigned char *)uncomp, data_size,
1778 			   BSC_FEATURES) != LIBBSC_NO_ERROR) {
1779 	    free(uncomp);
1780 	    return -1;
1781 	}
1782 
1783 	free(b->data);
1784 	b->data = (unsigned char *)uncomp;
1785 	b->alloc = data_size;
1786 	b->method = RAW;
1787 	b->uncomp_size = data_size; // Just incase it differs
1788 	break;
1789     }
1790 #else
1791     case BSC:
1792 	fprintf(stderr, "Libbsc compression is not compiled into this "
1793 		"version.\nPlease rebuild and try again.\n");
1794 	return -1;
1795 #endif
1796 
1797 #ifdef HAVE_FQZ
1798     case FQZ: {
1799 	uncomp_size = b->uncomp_size;
1800 	uncomp = fqz_decompress((char *)b->data, b->comp_size, &uncomp_size);
1801 	if (!uncomp)
1802 	    return -1;
1803 	free(b->data);
1804 	b->data = (unsigned char *)uncomp;
1805 	b->alloc = uncomp_size;
1806 	b->method = RAW;
1807 	break;
1808     }
1809 #else
1810     case FQZ:
1811 	fprintf(stderr, "Fqzcomp compression is not compiled into this "
1812 		"version.\nPlease rebuild and try again.\n");
1813 	return -1;
1814 #endif
1815 
1816 #ifdef HAVE_LIBLZMA
1817     case LZMA:
1818 	uncomp = lzma_mem_inflate((char *)b->data, b->comp_size, &uncomp_size);
1819 	if (!uncomp)
1820 	    return -1;
1821 	if ((int)uncomp_size != b->uncomp_size)
1822 	    return -1;
1823 	free(b->data);
1824 	b->data = (unsigned char *)uncomp;
1825 	b->alloc = uncomp_size;
1826 	b->method = RAW;
1827 	break;
1828 #else
1829     case LZMA:
1830 	fprintf(stderr, "Lzma compression is not compiled into this "
1831 		"version.\nPlease rebuild and try again.\n");
1832 	return -1;
1833 	break;
1834 #endif
1835 
1836     case RANS0: {
1837 	unsigned int usize = b->uncomp_size, usize2;
1838 	if (*b->data == 1) b->orig_method = RANS1; // useful in debugging
1839 	uncomp = (char *)rans_uncompress(b->data, b->comp_size, &usize2, 0);
1840 	if (!uncomp || usize != usize2)
1841 	    return -1;
1842 	b->orig_method = b->data[0]&1 ? RANS1 : RANS0;
1843 	free(b->data);
1844 	b->data = (unsigned char *)uncomp;
1845 	b->alloc = usize2;
1846 	b->method = RAW;
1847 	b->uncomp_size = usize2; // Just incase it differs
1848 	//fprintf(stderr, "Expanded %d to %d\n", b->comp_size, b->uncomp_size);
1849 	break;
1850     }
1851 
1852     case RANS_PR0: {
1853 	unsigned int usize = b->uncomp_size, usize2;
1854 	uncomp = (char *)rans_uncompress_4x16(b->data, b->comp_size, &usize2, 0);
1855 	if (!uncomp || usize != usize2)
1856 	    return -1;
1857 	b->orig_method = RANS_PR0 + (b->data[0]&1)
1858 	    + 2*((b->data[0]&0x40)>0) + 4*((b->data[0]&0x80)>0);
1859 	free(b->data);
1860 	b->data = (unsigned char *)uncomp;
1861 	b->alloc = usize2;
1862 	b->method = RAW;
1863 	b->uncomp_size = usize2; // Just incase it differs
1864 	//fprintf(stderr, "Expanded %d to %d\n", b->comp_size, b->uncomp_size);
1865 	break;
1866     }
1867 
1868     case NAME_TOK3: {
1869 	int out_len;
1870 	uint8_t *cp = decode_names(b->data, b->comp_size, &out_len);
1871 	b->orig_method = NAME_TOK3;
1872 	b->method = RAW;
1873 	free(b->data);
1874 	b->data = cp;
1875 	b->alloc = out_len;
1876 	b->uncomp_size = out_len;
1877 	break;
1878     }
1879 
1880 
1881 
1882     default:
1883 	return -1;
1884     }
1885 
1886     return 0;
1887 }
1888 
1889 #define EBASE 65536
1890 //static double entropy16(unsigned short *data, int len) {
1891 //    double E[EBASE];
1892 //    double P[EBASE];
1893 //    double e = 0;
1894 //    int i;
1895 //
1896 //    for (i = 0; i < EBASE; i++)
1897 //        P[i] = 0;
1898 //
1899 //    for (i = 0; i < len; i++)
1900 //        P[data[i]]++;
1901 //
1902 //    for (i = 0; i < EBASE; i++) {
1903 //        if (P[i]) {
1904 //            P[i] /= len;
1905 //            E[i] = -(log(P[i])/log(EBASE));
1906 //        } else {
1907 //            E[i] = 0;
1908 //        }
1909 //    }
1910 //
1911 //    for (e = i = 0; i < len; i++)
1912 //        e += E[data[i]];
1913 //
1914 //    return e * log(EBASE)/log(256);
1915 //}
1916 //
1917 //#define EBASE2 256
1918 //static double entropy8(unsigned char *data, int len) {
1919 //    int F[EBASE2];
1920 //    double e = 0;
1921 //    int i;
1922 //
1923 //    for (i = 0; i < EBASE2; i++)
1924 //        F[i] = 0;
1925 //
1926 //    for (i = 0; i < len; i++)
1927 //        F[data[i]]++;
1928 //
1929 //    for (i = 0; i < EBASE2; i++) {
1930 //        if (F[i]) {
1931 //	    e += -log((double)F[i]/len) * F[i];
1932 //        }
1933 //    }
1934 //
1935 //    return e / log(EBASE2);
1936 //}
1937 
cram_compress_by_method(cram_slice * s,char * in,size_t in_size,size_t * out_size,enum cram_block_method method,int level,int strat)1938 static char *cram_compress_by_method(cram_slice *s, char *in, size_t in_size,
1939 				     size_t *out_size,
1940 				     enum cram_block_method method,
1941 				     int level, int strat) {
1942     switch (method) {
1943     case GZIP:
1944     case GZIP_RLE:
1945     case GZIP_1:
1946 	return zlib_mem_deflate(in, in_size, out_size, level, strat);
1947 
1948     case BZIP2: {
1949 #ifdef HAVE_LIBBZ2
1950 	unsigned int comp_size = in_size*1.01 + 600;
1951 	char *comp = malloc(comp_size);
1952 	if (!comp)
1953 	    return NULL;
1954 
1955 	if (BZ_OK != BZ2_bzBuffToBuffCompress(comp, &comp_size,
1956 					      in, in_size,
1957 					      level, 0, 30)) {
1958 	    free(comp);
1959 	    return NULL;
1960 	}
1961 	*out_size = comp_size;
1962 	return comp;
1963 #else
1964 	return NULL;
1965 #endif
1966     }
1967 
1968     case BSC: {
1969 #ifdef HAVE_LIBBSC
1970 	unsigned int comp_size = in_size*1.01 + LIBBSC_HEADER_SIZE;
1971 	char *comp = malloc(comp_size);
1972 	if (!comp)
1973 	    return NULL;
1974 
1975 	pthread_once(&bsc_once, bsc_init_once);
1976 
1977 	*out_size = bsc_compress((unsigned char *)in,
1978 				 (unsigned char *)comp,
1979 				 in_size,
1980 				 LIBBSC_DEFAULT_LZPHASHSIZE,
1981 				 LIBBSC_DEFAULT_LZPMINLEN,
1982 				 LIBBSC_BLOCKSORTER_BWT,
1983 				 level >= 5
1984 				   ? LIBBSC_CODER_QLFC_ADAPTIVE // maybe 50% slower?
1985 				   : LIBBSC_CODER_QLFC_STATIC,
1986 				 BSC_FEATURES);
1987 
1988 	assert(comp_size >= *out_size);
1989 
1990 	return comp;
1991 #else
1992 	return NULL;
1993 #endif
1994     }
1995 
1996     case FQZ:
1997 #ifdef HAVE_FQZ
1998 	return fqz_compress(strat, s, in, in_size, out_size, level);
1999 #else
2000 	return NULL;
2001 #endif
2002 
2003     case LZMA:
2004 #ifdef HAVE_LIBLZMA
2005 	return lzma_mem_deflate(in, in_size, out_size, level);
2006 #else
2007 	return NULL;
2008 #endif
2009 
2010     case RANS0:
2011     case RANS1: {
2012 	unsigned int out_size_i;
2013 	unsigned char *cp;
2014 
2015 	cp = rans_compress((unsigned char *)in, in_size, &out_size_i,
2016 			   method == RANS0 ? 0 : 1);
2017 	*out_size = out_size_i;
2018 	return (char *)cp;
2019     }
2020 
2021     case RANS_PR0:
2022     case RANS_PR1:
2023     case RANS_PR64:
2024     case RANS_PR65:
2025     case RANS_PR128:
2026     case RANS_PR129:
2027     case RANS_PR192:
2028     case RANS_PR193: {
2029 	unsigned int out_size_i;
2030 	unsigned char *cp;
2031 
2032 	// see enum cram_block. We map RANS_* methods to order bit-fields
2033 	static int methmap[] = {
2034 	    0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, // non-ranspr
2035 	    0,1, 64,65, 128,129, 192,193
2036 	};
2037 
2038 	cp = rans_compress_4x16((unsigned char *)in, in_size, &out_size_i, methmap[method]);
2039 	*out_size = out_size_i;
2040 	return (char *)cp;
2041     }
2042 
2043     case NAME_TOK3: {
2044 	int out_len;
2045 	uint8_t *cp = encode_names(in, in_size, &out_len, NULL);
2046 	*out_size = out_len;
2047 	return (char *)cp;
2048     }
2049 
2050     case RAW:
2051 	break;
2052 
2053     default:
2054         return NULL;
2055     }
2056 
2057     return NULL;
2058 }
2059 
2060 /*
2061  * Compresses a block using one of two different zlib strategies. If we only
2062  * want one choice set strat2 to be -1.
2063  *
2064  * The logic here is that sometimes Z_RLE does a better job than Z_FILTERED
2065  * or Z_DEFAULT_STRATEGY on quality data. If so, we'd rather use it as it is
2066  * significantly faster.
2067  */
cram_compress_block(cram_fd * fd,cram_slice * s,cram_block * b,cram_metrics * metrics,int method,int level)2068 int cram_compress_block(cram_fd *fd, cram_slice *s,
2069 			cram_block *b, cram_metrics *metrics,
2070 			int method, int level) {
2071 
2072     char *comp = NULL;
2073     size_t comp_size = 0;
2074     int strat = 0;
2075 
2076     // Internally we have parameterised methods that externally map
2077     // to the same CRAM method value.
2078     // See enum_cram_block_method.
2079     int methmap[] = {
2080 	RAW, GZIP, BZIP2, LZMA, RANS0, BSC, FQZ,
2081 	BM_ERROR, BM_ERROR, BM_ERROR,
2082 	RANS0, // RANS1
2083 	GZIP, GZIP, // GZIP_RLE and GZIP_1
2084 	RANS_PR0, RANS_PR0, RANS_PR0, RANS_PR0, // RANS_PR1-193
2085 	RANS_PR0, RANS_PR0, RANS_PR0, RANS_PR0,
2086 	NAME_TOK3,
2087     };
2088 
2089     if (b->method != RAW) {
2090 	// Maybe already compressed if s->block[0] was compressed and
2091 	// we have e.g. s->block[DS_BA] set to s->block[0] due to only
2092 	// one base type present and hence using E_HUFFMAN on block 0.
2093 	// A second explicit attempt to compress the same block then
2094 	// occurs.
2095 	return 0;
2096     }
2097 
2098     //fprintf(stderr, "IN: block %d, sz %d\n", b->content_id, b->uncomp_size);
2099 
2100     if (method == RAW || level == 0 || b->uncomp_size == 0) {
2101 	b->method = RAW;
2102 	b->comp_size = b->uncomp_size;
2103 	//fprintf(stderr, "Skip block id %d\n", b->content_id);
2104 	return 0;
2105     }
2106 
2107     if (metrics) {
2108 	if (fd->metrics_lock) pthread_mutex_lock(fd->metrics_lock);
2109 	if (fd->unsorted == 2)
2110 	    metrics->next_trial = 0; // force recheck on mode switch.
2111 	if (metrics->trial > 0 || --metrics->next_trial <= 0) {
2112 	    int m;
2113 	    size_t sz_best = INT_MAX;
2114 	    size_t sz[CRAM_MAX_METHOD] = {0};
2115 	    int method_best = 0;
2116 	    char *c_best = NULL, *c = NULL;
2117 
2118 	    if (metrics->revised_method)
2119 		method = metrics->revised_method;
2120 	    else
2121 		metrics->revised_method = method;
2122 
2123 	    if (metrics->next_trial <= 0) {
2124 		metrics->next_trial = TRIAL_SPAN;
2125 		metrics->trial = NTRIALS;
2126 		for (m = 0; m < CRAM_MAX_METHOD; m++)
2127 		    metrics->sz[m] /= 2;
2128 	    }
2129 
2130 	    if (fd->metrics_lock) pthread_mutex_unlock(fd->metrics_lock);
2131 
2132             // Compress this block using the best method
2133 	    if (metrics->stats && metrics->stats->nvals > 16) {
2134 		// No point trying bit-pack if 17+ symbols.
2135 		if (method & (1<<RANS_PR128))
2136 		    method = (method|(1<<RANS_PR0))&~(1<<RANS_PR128);
2137 		if (method & (1<<RANS_PR129))
2138 		    method = (method|(1<<RANS_PR1))&~(1<<RANS_PR129);
2139 		if (method & (1<<RANS_PR192))
2140 		    method = (method|(1<<RANS_PR64))&~(1<<RANS_PR192);
2141 		if (method & (1<<RANS_PR193))
2142 		    method = (method|(1<<RANS_PR65))&~(1<<RANS_PR193);
2143 	    }
2144             for (m = 0; m < CRAM_MAX_METHOD; m++) {
2145 		if (method & (1<<m)) {
2146 		    int lvl = level;
2147 		    switch (m) {
2148 		    case GZIP:     strat = Z_FILTERED; break;
2149 		    case GZIP_1:   strat = Z_DEFAULT_STRATEGY; lvl = 1; break;
2150 		    case GZIP_RLE: strat = Z_RLE; break;
2151 		    case FQZ:      strat = CRAM_MAJOR_VERS(fd->version); break;
2152 		    default:       strat = 0;
2153 		    }
2154 
2155 		    c = cram_compress_by_method(s, (char *)b->data, b->uncomp_size,
2156 						&sz[m], m, lvl, strat);
2157                     if (fd->verbose > 1)
2158                         fprintf(stderr, "Try compression of block ID %d from %d to %d by method %s, strat %d\n",
2159                                 b->content_id, b->uncomp_size, (int)sz[m], cram_block_method2str(m), strat);
2160 
2161 		    if (c && sz_best > sz[m]) {
2162 			sz_best = sz[m];
2163 			method_best = m;
2164 			if (c_best)
2165 			    free(c_best);
2166 			c_best = c;
2167 		    } else if (c) {
2168 			free(c);
2169 		    } else {
2170 			sz[m] = b->uncomp_size*2+1000; // arbitrarily worse than raw
2171 		    }
2172 		}
2173 	    }
2174 
2175 	    //fprintf(stderr, "sz_best = %d\n", sz_best);
2176 
2177 	    free(b->data);
2178 	    b->data = (unsigned char *)c_best;
2179 	    //printf("method_best = %s\n", cram_block_method2str(method_best));
2180 
2181 	    b->method = method_best;
2182 	    b->comp_size = sz_best;
2183 	    assert(sz_best != INT_MAX);
2184 
2185 	    // Accumulate stats for all methods tried
2186 	    if (fd->metrics_lock) pthread_mutex_lock(fd->metrics_lock);
2187             for (m = 0; m < CRAM_MAX_METHOD; m++)
2188                 metrics->sz[m] += sz[m];
2189 
2190 	    // When enough trials performed, find the best on average
2191 	    if (--metrics->trial == 0) {
2192 		int best_method = RAW;
2193 		int best_sz = INT_MAX;
2194 
2195 		// Relative costs of methods. See enum_cram_block_method.
2196 		double meth_cost[32] = {
2197 		    1,    // raw
2198 		    1.04, // gzip
2199 		    1.08, // bzip2
2200 		    1.10, // lzma
2201 		    1.00, // rans0
2202 		    1.09, // bsc
2203 		    1.05, // fqz
2204 		    1,1,1,               // unused
2205 		    1.02, // rans1
2206 		    1.00, // gzip rle
2207 		    1.02, // gzip -1
2208 		    1.00, // rans_pr0
2209 		    1.03, // rans_pr1
2210 		    1.00, // rans_pr64; if smaller, usually fast
2211 		    1.00, // rans_pr65
2212 		    1.00, // rans_pr128
2213 		    1.00, // rans_pr129
2214 		    1.00, // rans_pr192
2215 		    1.00, // rans_pr193
2216 		    1,1,1,1,1,1,1,1,1,1,1    // unused
2217 		};
2218 
2219 		// Scale methods by cost based on compression level
2220 		if (fd->level <= 1) {
2221 		    for (m = 0; m < CRAM_MAX_METHOD; m++)
2222 			metrics->sz[m] *= 1+(meth_cost[m]-1)*4;
2223 		} else if (fd->level <= 3) {
2224 		    for (m = 0; m < CRAM_MAX_METHOD; m++)
2225 			metrics->sz[m] *= 1+(meth_cost[m]-1);
2226 		} else if (fd->level <= 6) {
2227 		    for (m = 0; m < CRAM_MAX_METHOD; m++)
2228 			metrics->sz[m] *= 1+(meth_cost[m]-1)/2;
2229 		} // else cost is ignored
2230 
2231                 for (m = 0; m < CRAM_MAX_METHOD; m++) {
2232 		    if ((!metrics->sz[m]) || (!(method & (1<<m))))
2233 			continue;
2234 
2235 		    if (best_sz > metrics->sz[m])
2236 			best_sz = metrics->sz[m], best_method = m;
2237 		}
2238 
2239 		if (fd->verbose > 1)
2240 		    fprintf(stderr, "Choosing method %s (was %s), strat %d for block ID %d\n",
2241 			    cram_block_method2str(best_method),
2242 			    cram_block_method2str(metrics->method),
2243 			    strat, b->content_id);
2244 
2245 		if (best_method != metrics->method) {
2246 		    metrics->trial = (NTRIALS+1)/2; // be sure
2247 		    //metrics->next_trial /= 1.5;
2248 		    metrics->consistency = 0;
2249 		} else {
2250 		    metrics->next_trial *= MIN(2, 1+metrics->consistency/4.0);
2251 		    metrics->consistency++;
2252 		}
2253 
2254 		metrics->method = best_method;
2255 		switch (best_method) {
2256 		case GZIP:     strat = Z_FILTERED; break;
2257 		case GZIP_1:   strat = Z_DEFAULT_STRATEGY; break;
2258 		case GZIP_RLE: strat = Z_RLE; break;
2259 		case FQZ:      strat = CRAM_MAJOR_VERS(fd->version); break;
2260 		default:       strat = 0;
2261 		}
2262 		metrics->strat  = strat;
2263 
2264 		// If we see at least MAXFAIL trials in a row for a specific
2265 		// compression method with more than MAXDELTA aggregate
2266 		// size then we drop this from the list of methods used
2267 		// for this block type.
2268 #define MAXDELTA 0.20
2269 #define MAXFAILS 4
2270                 for (m = 0; m < CRAM_MAX_METHOD; m++) {
2271                     if (best_method == m) {
2272                         metrics->cnt[m] = 0;
2273                         metrics->extra[m] = 0;
2274                     } else if (best_sz < metrics->sz[m]) {
2275                         double r = (double)metrics->sz[m] / best_sz - 1;
2276                         if (++metrics->cnt[m] >= MAXFAILS &&
2277                             (metrics->extra[m] += r) >= MAXDELTA)
2278                             method &= ~(1<<m);
2279                     }
2280                 }
2281 
2282 		if (fd->verbose > 1 && method != metrics->revised_method)
2283 		    fprintf(stderr, "%d: revising method from %x to %x\n",
2284 			    b->content_id, metrics->revised_method, method);
2285 		metrics->revised_method = method;
2286 	    }
2287 	    if (fd->metrics_lock) pthread_mutex_unlock(fd->metrics_lock);
2288 	} else {
2289 	    strat = metrics->strat;
2290 	    method = metrics->method;
2291 
2292 	    if (fd->metrics_lock) pthread_mutex_unlock(fd->metrics_lock);
2293 	    comp = cram_compress_by_method(s, (char *)b->data, b->uncomp_size,
2294 					   &comp_size, method,
2295 					   method == GZIP_1 ? 1 : level,
2296 					   strat);
2297 	    if (!comp)
2298 		return -1;
2299 
2300 	    if (comp_size < b->uncomp_size) {
2301 		free(b->data);
2302 		b->data = (unsigned char *)comp;
2303 		b->comp_size = comp_size;
2304 		b->method = method;
2305 	    } else {
2306 		free(comp);
2307 	    }
2308 	}
2309 
2310     } else {
2311 	// no cached metrics, so just do zlib?
2312 	comp = cram_compress_by_method(s, (char *)b->data, b->uncomp_size,
2313 				       &comp_size, GZIP, level, Z_FILTERED);
2314 	if (!comp) {
2315 	    fprintf(stderr, "Compression failed!\n");
2316 	    return -1;
2317 	}
2318 
2319 	if (comp_size < b->uncomp_size) {
2320 	    free(b->data);
2321 	    b->data = (unsigned char *)comp;
2322 	    b->comp_size = comp_size;
2323 	    b->method = GZIP;
2324 	} else {
2325 	    free(comp);
2326 	}
2327 	strat = Z_FILTERED;
2328     }
2329 
2330     if (fd->verbose)
2331 	fprintf(stderr, "Compressed block ID %d from %d to %d by method %s strat %d\n",
2332 		b->content_id, b->uncomp_size, b->comp_size,
2333 		cram_block_method2str(b->method), strat);
2334 
2335     b->method = methmap[b->method];
2336 
2337     return 0;
2338 }
2339 
cram_new_metrics(void)2340 cram_metrics *cram_new_metrics(void) {
2341     cram_metrics *m = calloc(1, sizeof(*m));
2342     if (!m)
2343 	return NULL;
2344     m->trial = NTRIALS;
2345     m->next_trial = TRIAL_SPAN/2; // learn quicker at start
2346     m->method = RAW;
2347     m->strat = 0;
2348     m->revised_method = 0;
2349     m->consistency = 0;
2350     m->stats = NULL;
2351 
2352     return m;
2353 }
2354 
cram_block_method2str(enum cram_block_method m)2355 char *cram_block_method2str(enum cram_block_method m) {
2356     switch(m) {
2357     case RAW:	     return "RAW";
2358     case GZIP:	     return "GZIP";
2359     case BZIP2:	     return "BZIP2";
2360     case BSC:	     return "BSC";
2361     case FQZ:	     return "FQZ";
2362     case LZMA:       return "LZMA";
2363     case RANS0:      return "RANS0";
2364     case RANS1:      return "RANS1";
2365     case GZIP_RLE:   return "GZIP_RLE";
2366     case GZIP_1:     return "GZIP-1";
2367     case RANS_PR0:   return "RANS_PR0";
2368     case RANS_PR1:   return "RANS_PR1";
2369     case RANS_PR64:  return "RANS_PR64";
2370     case RANS_PR65:  return "RANS_PR65";
2371     case RANS_PR128: return "RANS_PR128";
2372     case RANS_PR129: return "RANS_PR129";
2373     case RANS_PR192: return "RANS_PR192";
2374     case RANS_PR193: return "RANS_PR193";
2375     default:         break;
2376     }
2377     return "?";
2378 }
2379 
cram_content_type2str(enum cram_content_type t)2380 char *cram_content_type2str(enum cram_content_type t) {
2381     switch (t) {
2382     case FILE_HEADER:         return "FILE_HEADER";
2383     case COMPRESSION_HEADER:  return "COMPRESSION_HEADER";
2384     case MAPPED_SLICE:        return "MAPPED_SLICE";
2385     case UNMAPPED_SLICE:      return "UNMAPPED_SLICE";
2386     case EXTERNAL:            return "EXTERNAL";
2387     case CORE:                return "CORE";
2388     case CT_ERROR:            break;
2389     }
2390     return "?";
2391 }
2392 
2393 /*
2394  * Extra error checking on fclose to really ensure data is written.
2395  * Care needs to be taken to handle pipes vs real files.
2396  *
2397  * Returns 0 on success
2398  *        -1 on failure.
2399  */
paranoid_fclose(FILE * fp)2400 int paranoid_fclose(FILE *fp) {
2401     if (-1 == fflush(fp) && errno != EBADF) {
2402 	fclose(fp);
2403 	return -1;
2404     }
2405 
2406     errno = 0;
2407 #ifdef HAVE_FSYNC
2408     if (-1 == fsync(fileno(fp))) {
2409 	if (errno != EINVAL) { // eg pipe
2410 	    fclose(fp);
2411 	    return -1;
2412 	}
2413     }
2414 #endif
2415 
2416     return fclose(fp);
2417 }
2418 
2419 /* ----------------------------------------------------------------------
2420  * Reference sequence handling
2421  *
2422  * These revolve around the refs_t structure, which may potentially be
2423  * shared between multiple cram_fd.
2424  *
2425  * We start with refs_create() to allocate an empty refs_t and then
2426  * populate it with @SQ line data using refs_from_header(). This is done on
2427  * cram_open().  Also at start up we can call cram_load_reference() which
2428  * is used with "scramble -r foo.fa". This replaces the fd->refs with the
2429  * new one specified. In either case refs2id() is then called which
2430  * maps ref_entry names to @SQ ids (refs_t->ref_id[]).
2431  *
2432  * Later, possibly within a thread, we will want to know the actual ref
2433  * seq itself, obtained by calling cram_get_ref().  This may use the
2434  * UR: or M5: fields or the filename specified in the original
2435  * cram_load_reference() call.
2436  *
2437  * Given the potential for multi-threaded reference usage, we have
2438  * reference counting (sorry for the confusing double use of "ref") to
2439  * track the number of callers interested in any specific reference.
2440  */
2441 
2442 /*
2443  * Frees/unmaps a reference sequence and associated file handles.
2444  */
ref_entry_free_seq(ref_entry * e)2445 static void ref_entry_free_seq(ref_entry *e) {
2446     if (e->mf)
2447 	mfclose(e->mf);
2448     if (e->seq && !e->mf)
2449 	free(e->seq);
2450 
2451     e->seq = NULL;
2452     e->mf = NULL;
2453 }
2454 
refs_free(refs_t * r)2455 void refs_free(refs_t *r) {
2456     RP("refs_free()\n");
2457 
2458     if (--r->count > 0)
2459 	return;
2460 
2461     if (!r)
2462 	return;
2463 
2464     if (r->pool)
2465 	string_pool_destroy(r->pool);
2466 
2467     if (r->h_meta) {
2468 	HashIter *iter = HashTableIterCreate();
2469 	HashItem *hi;
2470 
2471         while ((hi = HashTableIterNext(r->h_meta, iter))) {
2472 	    ref_entry *e = (ref_entry *)hi->data.p;
2473 	    if (!e)
2474 		continue;
2475 	    ref_entry_free_seq(e);
2476 	    free(e);
2477 	}
2478 
2479 	HashTableIterDestroy(iter);
2480 	HashTableDestroy(r->h_meta, 0);
2481     }
2482 
2483     if (r->ref_id)
2484 	free(r->ref_id);
2485 
2486     if (r->fp)
2487 	bzi_close(r->fp);
2488 
2489     pthread_mutex_destroy(&r->lock);
2490 
2491     free(r);
2492 }
2493 
refs_create(void)2494 static refs_t *refs_create(void) {
2495     refs_t *r = calloc(1, sizeof(*r));
2496 
2497     RP("refs_create()\n");
2498 
2499     if (!r)
2500 	return NULL;
2501 
2502     if (!(r->pool = string_pool_create(8192)))
2503 	goto err;
2504 
2505     r->ref_id = NULL; // see refs2id() to populate.
2506     r->count = 1;
2507     r->last = NULL;
2508     r->last_id = -1;
2509 
2510     r->h_meta = HashTableCreate(16, HASH_DYNAMIC_SIZE | HASH_NONVOLATILE_KEYS);
2511     if (!r->h_meta)
2512 	goto err;
2513 
2514     pthread_mutex_init(&r->lock, NULL);
2515 
2516     return r;
2517 
2518  err:
2519     refs_free(r);
2520     return NULL;
2521 }
2522 
2523 /*
2524  * Loads a FAI file for a reference.fasta.
2525  * "is_err" indicates whether failure to load is worthy of emitting an
2526  * error message. In some cases (eg with embedded references) we
2527  * speculatively load, just incase, and silently ignore errors.
2528  *
2529  * Returns the refs_t struct on success (maybe newly allocated);
2530  *         NULL on failure
2531  */
refs_load_fai(refs_t * r_orig,char * fn,int is_err)2532 refs_t *refs_load_fai(refs_t *r_orig, char *fn, int is_err) {
2533     struct stat sb;
2534     FILE *fp = NULL;
2535     HashData hd;
2536     char fai_fn[PATH_MAX];
2537     char line[8192];
2538     refs_t *r = r_orig;
2539     int id = 0, id_alloc = 0;
2540 
2541     RP("refs_load_fai %s\n", fn);
2542 
2543     if (!r)
2544 	if (!(r = refs_create()))
2545 	    goto err;
2546 
2547     /* Open reference, for later use */
2548     if (stat(fn, &sb) != 0) {
2549 	if (is_err)
2550 	    perror(fn);
2551 	goto err;
2552     }
2553 
2554     if (r->fp)
2555 	bzi_close(r->fp);
2556     r->fp = NULL;
2557 
2558     if (!(r->fn = string_dup(r->pool, fn)))
2559 	goto err;
2560 
2561     if (!(r->fp = bzi_open(fn, "r"))) {
2562 	if (is_err)
2563 	    perror(fn);
2564 	goto err;
2565     }
2566 
2567     /* Parse .fai file and load meta-data */
2568     sprintf(fai_fn, "%.*s.fai", PATH_MAX-5, fn);
2569     if (stat(fai_fn, &sb) != 0) {
2570 	if (is_err)
2571 	    perror(fai_fn);
2572 	goto err;
2573     }
2574     if (!(fp = fopen(fai_fn, "r"))) {
2575 	if (is_err)
2576 	    perror(fai_fn);
2577 	goto err;
2578     }
2579     while (fgets(line, 8192, fp) != NULL) {
2580 	ref_entry *e = malloc(sizeof(*e));
2581 	char *cp;
2582 	int n;
2583 	HashItem *hi;
2584 
2585 	if (!e)
2586 	    return NULL;
2587 
2588 	// id
2589 	for (cp = line; *cp && !isspace(*cp); cp++)
2590 	    ;
2591 	*cp++ = 0;
2592 	e->name = string_dup(r->pool, line);
2593 
2594 	// length
2595 	while (*cp && isspace(*cp))
2596 	    cp++;
2597 	e->length = strtoll(cp, &cp, 10);
2598 
2599 	// offset
2600 	while (*cp && isspace(*cp))
2601 	    cp++;
2602 	e->offset = strtoll(cp, &cp, 10);
2603 
2604 	// bases per line
2605 	while (*cp && isspace(*cp))
2606 	    cp++;
2607 	e->bases_per_line = strtol(cp, &cp, 10);
2608 
2609 	// line length
2610 	while (*cp && isspace(*cp))
2611 	    cp++;
2612 	e->line_length = strtol(cp, &cp, 10);
2613 
2614 	// filename
2615 	e->fn = r->fn;
2616 
2617 	e->count = 0;
2618 	e->seq = NULL;
2619 	e->mf = NULL;
2620 
2621 	hd.p = e;
2622 	if (!(hi = HashTableAdd(r->h_meta, e->name, strlen(e->name), hd, &n))){
2623 	    free(e);
2624 	    return NULL;
2625 	}
2626 
2627 	if (!n) {
2628 	    /* Replace old one if needed. */
2629 	    ref_entry *r = (ref_entry *)hi->data.p;
2630 
2631 	    if (r && (r->count != 0 || r->length != 0)) {
2632 		/* Keep old one */
2633 		free(e);
2634 	    } else {
2635 		if (r)
2636 		    free(r);
2637 		hi->data.p = e;
2638 	    }
2639 	}
2640 
2641 	if (id >= id_alloc) {
2642 	    int x;
2643 
2644 	    id_alloc = id_alloc ?id_alloc*2 : 16;
2645 	    r->ref_id = realloc(r->ref_id, id_alloc * sizeof(*r->ref_id));
2646 
2647 	    for (x = id; x < id_alloc; x++)
2648 		r->ref_id[x] = NULL;
2649 	}
2650 	r->ref_id[id] = e;
2651 	r->nref = ++id;
2652     }
2653 
2654     RP("refs_load_fai %s END (success)\n", fn);
2655 
2656     fclose(fp);
2657     return r;
2658 
2659  err:
2660     RP("refs_load_fai %s END (fail)\n", fn);
2661 
2662     if (fp)
2663 	fclose(fp);
2664 
2665     if (!r_orig)
2666 	refs_free(r);
2667 
2668     return NULL;
2669 }
2670 
2671 /*
2672  * Verifies that the CRAM @SQ lines and .fai files match.
2673  */
sanitise_SQ_lines(cram_fd * fd)2674 static void sanitise_SQ_lines(cram_fd *fd) {
2675     int i;
2676 
2677     if (!fd->header)
2678 	return;
2679 
2680     if (!fd->refs || !fd->refs->h_meta)
2681 	return;
2682 
2683     for (i = 0; i < fd->header->nref; i++) {
2684 	char *name = fd->header->ref[i].name;
2685 	HashItem *hi = HashTableSearch(fd->refs->h_meta, name, 0);
2686 	ref_entry *r;
2687 
2688 	// We may have @SQ lines which have no known .fai, but do not
2689 	// in themselves pose a problem because they are unused in the file.
2690 	if (!hi)
2691 	    continue;
2692 
2693 	if (!(r = (ref_entry *)hi->data.p))
2694 	    continue;
2695 
2696 	if (r->length && r->length != fd->header->ref[i].len) {
2697 	    assert(strcmp(r->name, fd->header->ref[i].name) == 0);
2698 
2699 	    // Should we also check MD5sums here to ensure the correct
2700 	    // reference was given?
2701 	    fprintf(stderr, "WARNING: Header @SQ length mismatch for "
2702 		    "ref %s, %d vs %d\n",
2703 		    r->name, fd->header->ref[i].len, (int)r->length);
2704 
2705 	    // Fixing the parsed @SQ header will make MD:Z: strings work
2706 	    // and also stop it producing N for the sequence.
2707 	    fd->header->ref[i].len = r->length;
2708 	}
2709     }
2710 }
2711 
2712 /*
2713  * Indexes references by the order they appear in a BAM file. This may not
2714  * necessarily be the same order they appear in the fasta reference file.
2715  *
2716  * Returns 0 on success
2717  *        -1 on failure
2718  */
refs2id(refs_t * r,SAM_hdr * h)2719 int refs2id(refs_t *r, SAM_hdr *h) {
2720     int i;
2721 
2722     if (r->ref_id)
2723 	free(r->ref_id);
2724     if (r->last)
2725 	r->last = NULL;
2726 
2727     r->ref_id = calloc(h->nref, sizeof(*r->ref_id));
2728     if (!r->ref_id)
2729 	return -1;
2730 
2731     r->nref = h->nref;
2732     for (i = 0; i < h->nref; i++) {
2733 	HashItem *hi;
2734 	if ((hi = HashTableSearch(r->h_meta, h->ref[i].name, 0))) {
2735 	    r->ref_id[i] = hi->data.p;
2736 	} else {
2737 	    fprintf(stderr, "Unable to find ref name '%s'\n",
2738 		    h->ref[i].name);
2739 	}
2740     }
2741 
2742     return 0;
2743 }
2744 
2745 /*
2746  * Generates refs_t entries based on @SQ lines in the header.
2747  * Returns 0 on success
2748  *         -1 on failure
2749  */
refs_from_header(refs_t * r,cram_fd * fd,SAM_hdr * h)2750 static int refs_from_header(refs_t *r, cram_fd *fd, SAM_hdr *h) {
2751     int i, j;
2752 
2753     if (!r)
2754 	return -1;
2755 
2756     if (!h || h->nref == 0)
2757 	return 0;
2758 
2759     //fprintf(stderr, "refs_from_header for %p mode %c\n", fd, fd->mode);
2760 
2761     /* Existing refs are fine, as long as they're compatible with the hdr. */
2762     if (!(r->ref_id = realloc(r->ref_id, (r->nref + h->nref) * sizeof(*r->ref_id))))
2763 	return -1;
2764 
2765     /* Copy info from h->ref[i] over to r */
2766     for (i = 0, j = r->nref; i < h->nref; i++) {
2767 	SAM_hdr_type *ty;
2768 	SAM_hdr_tag *tag;
2769 	HashData hd;
2770 	int n;
2771 
2772 	if (!h->ref[i].name) {
2773 	    fprintf(stderr, "refs_from_header: no sequence name found for reference\n");
2774 	    return -1;
2775 	}
2776 
2777 	if (HashTableSearch(r->h_meta, h->ref[i].name, strlen(h->ref[i].name)))
2778 	    // Ref already known about
2779 	    continue;
2780 
2781 	if (!(r->ref_id[j] = calloc(1, sizeof(ref_entry))))
2782 	    return -1;
2783 
2784 	if (!h->ref[i].name)
2785 	    return -1;
2786 
2787 	r->ref_id[j]->name = string_dup(r->pool, h->ref[i].name);
2788 	r->ref_id[j]->length = 0; // marker for not yet loaded
2789 
2790 	/* Initialise likely filename if known */
2791 	if ((ty = sam_hdr_find(h, "SQ", "SN", h->ref[i].name))) {
2792 	    if ((tag = sam_hdr_find_key(h, ty, "M5", NULL))) {
2793 		r->ref_id[j]->fn = string_dup(r->pool, tag->str+3);
2794 		//fprintf(stderr, "Tagging @SQ %s / %s\n", r->ref_id[j]->name, r->ref_id[j]->fn);
2795 	    }
2796 	}
2797 
2798 	hd.p = r->ref_id[j];
2799 	if (!HashTableAdd(r->h_meta, r->ref_id[j]->name,
2800 			  strlen(r->ref_id[j]->name), hd, &n))
2801 	    return -1;
2802 	if (!n)
2803 	    return -1;
2804 
2805 	j++;
2806     }
2807     r->nref = j;
2808 
2809     return 0;
2810 }
2811 
2812 /*
2813  * Converts a directory and a filename into an expanded path, replacing %s
2814  * in directory with the filename and %[0-9]+s with portions of the filename
2815  * Any remaining parts of filename are added to the end with /%s.
2816  */
expand_cache_path(char * path,char * dir,char * fn)2817 void expand_cache_path(char *path, char *dir, char *fn) {
2818     char *cp;
2819 
2820     while ((cp = strchr(dir, '%'))) {
2821 	strncpy(path, dir, cp-dir);
2822 	path += cp-dir;
2823 
2824 	if (*++cp == 's') {
2825 	    strcpy(path, fn);
2826 	    path += strlen(fn);
2827 	    fn += strlen(fn);
2828 	    cp++;
2829 	} else if (*cp >= '0' && *cp <= '9') {
2830 	    char *endp;
2831 	    long l;
2832 
2833 	    l = strtol(cp, &endp, 10);
2834 	    l = MIN(l, strlen(fn));
2835 	    if (*endp == 's') {
2836 		strncpy(path, fn, l);
2837 		path += l;
2838 		fn += l;
2839 		*path = 0;
2840 		cp = endp+1;
2841 	    } else {
2842 		*path++ = '%';
2843 		*path++ = *cp++;
2844 	    }
2845 	} else {
2846 	    *path++ = '%';
2847 	    *path++ = *cp++;
2848 	}
2849 	dir = cp;
2850     }
2851     strcpy(path, dir);
2852     path += strlen(dir);
2853     if (*fn && path[-1] != '/')
2854 	*path++ = '/';
2855     strcpy(path, fn);
2856 }
2857 
2858 /*
2859  * Make the directory containing path and any prefix directories.
2860  */
mkdir_prefix(char * path,int mode)2861 void mkdir_prefix(char *path, int mode) {
2862     char *cp = strrchr(path, '/');
2863     if (!cp)
2864 	return;
2865 
2866     *cp = 0;
2867     if (is_directory(path)) {
2868 	*cp = '/';
2869 	return;
2870     }
2871 
2872     if (mkdir(path, mode) == 0) {
2873 	chmod(path, mode);
2874 	*cp = '/';
2875 	return;
2876     }
2877 
2878     mkdir_prefix(path, mode);
2879     mkdir(path, mode);
2880     chmod(path, mode);
2881     *cp = '/';
2882 }
2883 
2884 /*
2885  * Queries the M5 string from the header and attempts to populate the
2886  * reference from this using the REF_PATH environment.
2887  *
2888  * Returns 0 on sucess
2889  *        -1 on failure
2890  */
cram_populate_ref(cram_fd * fd,int id,ref_entry * r)2891 static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) {
2892     char *ref_path = getenv("REF_PATH");
2893     SAM_hdr_type *ty;
2894     SAM_hdr_tag *tag;
2895     char path[PATH_MAX], path_tmp[PATH_MAX+20];
2896     char *local_cache = getenv("REF_CACHE");
2897     mFILE *mf;
2898 
2899     if (fd->verbose)
2900 	fprintf(stderr, "cram_populate_ref on fd %p, id %d\n", fd, id);
2901 
2902     if (!ref_path)
2903 	ref_path = ".";
2904 
2905     if (!r->name)
2906 	return -1;
2907 
2908     if (!(ty = sam_hdr_find(fd->header, "SQ", "SN", r->name)))
2909 	return -1;
2910 
2911     if (!(tag = sam_hdr_find_key(fd->header, ty, "M5", NULL)))
2912 	goto no_M5;
2913 
2914     if (fd->verbose)
2915 	fprintf(stderr, "Querying ref %s\n", tag->str+3);
2916 
2917     /* Use cache if available */
2918     if (local_cache && *local_cache) {
2919 	struct stat sb;
2920 	bzi_FILE *fp;
2921 
2922 	expand_cache_path(path, local_cache, tag->str+3);
2923 
2924 	if (0 == stat(path, &sb) && (fp = bzi_open(path, "r"))) {
2925 	    r->length = sb.st_size;
2926 	    r->offset = r->line_length = r->bases_per_line = 0;
2927 
2928 	    r->fn = string_dup(fd->refs->pool, path);
2929 
2930 	    if (fd->refs->fp)
2931 		bzi_close(fd->refs->fp);
2932 	    fd->refs->fp = fp;
2933 	    fd->refs->fn = r->fn;
2934 
2935 	    // Fall back to cram_get_ref() where it'll do the actual
2936 	    // reading of the file.
2937 	    return 0;
2938 	}
2939     }
2940 
2941     /* Otherwise search */
2942     if ((mf = open_path_mfile(tag->str+3, ref_path, NULL))) {
2943 	size_t sz;
2944 	r->seq = mfsteal(mf, &sz);
2945 	if (r->seq) {
2946 	    r->mf = NULL;
2947 	} else {
2948 	    // keep mf around as we couldn't detach
2949 	    r->seq = mf->data;
2950 	    r->mf = mf;
2951 	}
2952 	r->length = sz;
2953     } else {
2954 	refs_t *refs;
2955 	char *fn;
2956 
2957     no_M5:
2958 	/* Failed to find in search path or M5 cache, see if @SQ UR: tag? */
2959 	if (!(tag = sam_hdr_find_key(fd->header, ty, "UR", NULL)))
2960 	    return -1;
2961 
2962 	fn = (strncmp(tag->str+3, "file:", 5) == 0)
2963 	    ? tag->str+8
2964 	    : tag->str+3;
2965 
2966 	if (fd->refs->fp) {
2967 	    bzi_close(fd->refs->fp);
2968 	    fd->refs->fp = NULL;
2969 	}
2970 	if (!(refs = refs_load_fai(fd->refs, fn, 0)))
2971 	    return -1;
2972 	sanitise_SQ_lines(fd);
2973 
2974 	fd->refs = refs;
2975 	if (fd->refs->fp) {
2976 	    bzi_close(fd->refs->fp);
2977 	    fd->refs->fp = NULL;
2978 	}
2979 
2980 	if (!fd->refs->fn)
2981 	    return -1;
2982 
2983 	if (-1 == refs2id(fd->refs, fd->header))
2984 	    return -1;
2985 	if (!fd->refs->ref_id || !fd->refs->ref_id[id])
2986 	    return -1;
2987 
2988 	// Local copy already, so fall back to cram_get_ref().
2989 	return 0;
2990     }
2991 
2992     /* Populate the local disk cache if required */
2993     if (local_cache && *local_cache) {
2994 	FILE *fp;
2995 	int i;
2996 
2997 	expand_cache_path(path, local_cache, tag->str+3);
2998 	if (fd->verbose)
2999 	    fprintf(stderr, "Path='%s'\n", path);
3000 	mkdir_prefix(path, 01777);
3001 
3002 	i = 0;
3003 	do {
3004 	    sprintf(path_tmp, "%s.tmp_%d", path, /*getpid(),*/ i);
3005 	    i++;
3006 	    fp = fopen(path_tmp, "wx");
3007 	} while (fp == NULL && errno == EEXIST);
3008 	if (!fp) {
3009 	    perror(path_tmp);
3010 
3011 	    // Not fatal - we have the data already so keep going.
3012 	    return 0;
3013 	}
3014 
3015 	if (r->length != fwrite(r->seq, 1, r->length, fp)) {
3016 	    perror(path);
3017 	}
3018 	if (-1 == paranoid_fclose(fp)) {
3019 	    unlink(path_tmp);
3020 	} else {
3021 	    if (0 == chmod(path_tmp, 0444))
3022 		rename(path_tmp, path);
3023 	    else
3024 		unlink(path_tmp);
3025 	}
3026     }
3027 
3028     return 0;
3029 }
3030 
cram_ref_incr_locked(refs_t * r,int id)3031 static void cram_ref_incr_locked(refs_t *r, int id) {
3032     RP("%d INC REF %d, %d %p\n", gettid(), id, (int)(id>=0?r->ref_id[id]->count+1:-999), id>=0?r->ref_id[id]->seq:(char *)1);
3033 
3034     if (id >= 0 && id >= r->nref)
3035 	return;
3036 
3037     if (id < 0 || !r->ref_id[id] || !r->ref_id[id]->seq)
3038 	return;
3039 
3040     if (r->last_id == id)
3041 	r->last_id = -1;
3042 
3043     ++r->ref_id[id]->count;
3044 }
3045 
cram_ref_incr(refs_t * r,int id)3046 void cram_ref_incr(refs_t *r, int id) {
3047     pthread_mutex_lock(&r->lock);
3048     cram_ref_incr_locked(r, id);
3049     pthread_mutex_unlock(&r->lock);
3050 }
3051 
cram_ref_decr_locked(refs_t * r,int id)3052 static void cram_ref_decr_locked(refs_t *r, int id) {
3053     RP("%d DEC REF %d, %d %p\n", gettid(), id, (int)(id>=0?r->ref_id[id]->count-1:-999), id>=0?r->ref_id[id]->seq:(char *)1);
3054 
3055     if (id >= 0 && id >= r->nref)
3056 	return;
3057 
3058     if (id < 0 || !r->ref_id[id] || !r->ref_id[id]->seq) {
3059 	assert(id < 0 || !r->ref_id[id] || r->ref_id[id]->count >= 0);
3060 	return;
3061     }
3062 
3063     if (--r->ref_id[id]->count <= 0) {
3064 	assert(r->ref_id[id]->count == 0);
3065 	if (r->last_id >= 0) {
3066 	    if (r->ref_id[r->last_id]->count <= 0 &&
3067 		r->ref_id[r->last_id]->seq) {
3068 		RP("%d FREE REF %d (%p)\n", gettid(),
3069 		   r->last_id, r->ref_id[r->last_id]->seq);
3070 		ref_entry_free_seq(r->ref_id[r->last_id]);
3071 		r->ref_id[r->last_id]->length = 0;
3072 	    }
3073 	}
3074 	r->last_id = id;
3075     }
3076 }
3077 
cram_ref_decr(refs_t * r,int id)3078 void cram_ref_decr(refs_t *r, int id) {
3079     pthread_mutex_lock(&r->lock);
3080     cram_ref_decr_locked(r, id);
3081     pthread_mutex_unlock(&r->lock);
3082 }
3083 
3084 /*
3085  * Used by cram_ref_load and cram_ref_get. The file handle will have
3086  * already been opened, so we can catch it. The ref_entry *e informs us
3087  * of whether this is a multi-line fasta file or a raw MD5 style file.
3088  * Either way we create a single contiguous sequence.
3089  *
3090  * Returns all or part of a reference sequence on success (malloced);
3091  *         NULL on failure.
3092  */
load_ref_portion(bzi_FILE * fp,ref_entry * e,int start,int end)3093 char *load_ref_portion(bzi_FILE *fp, ref_entry *e, int start, int end) {
3094     off_t offset, len;
3095     char *seq;
3096 
3097     if (end < start)
3098 	end = start;
3099 
3100     /*
3101      * Compute locations in file. This is trivial for the MD5 files, but
3102      * is still necessary for the fasta variants.
3103      */
3104     offset = e->line_length
3105 	? e->offset + (start-1)/e->bases_per_line * e->line_length +
3106 	  (start-1) % e->bases_per_line
3107 	: start-1;
3108 
3109     len = (e->line_length
3110 	   ? e->offset + (end-1)/e->bases_per_line * e->line_length +
3111 	     (end-1) % e->bases_per_line
3112 	   : end-1) - offset + 1;
3113 
3114     if (0 != bzi_seek(fp, offset, SEEK_SET)) {
3115 	perror("fseeko() on reference file");
3116 	return NULL;
3117     }
3118 
3119     if (len == 0 || !(seq = malloc(len))) {
3120 	return NULL;
3121     }
3122 
3123     if (len != bzi_read(seq, 1, len, fp)) {
3124 	perror("fread() on reference file");
3125 	free(seq);
3126 	return NULL;
3127     }
3128 
3129     /* Strip white-space if required. */
3130     if (len != end-start+1) {
3131 	int i, j;
3132 	char *cp = seq;
3133 	char *cp_to;
3134 
3135 	for (i = j = 0; i < len; i++) {
3136 	    if (cp[i] >= '!' && cp[i] <= '~')
3137 		cp[j++] = toupper(cp[i]);
3138 	}
3139 	cp_to = cp+j;
3140 
3141 	if (cp_to - seq != end-start+1) {
3142 	    fprintf(stderr, "Malformed reference file?\n");
3143 	    free(seq);
3144 	    return NULL;
3145 	}
3146     } else {
3147 	int i;
3148 	for (i = 0; i < len; i++) {
3149 	    seq[i] = toupper(seq[i]);
3150 	}
3151     }
3152 
3153     return seq;
3154 }
3155 
3156 /*
3157  * Load the entire reference 'id'.
3158  * This also increments the reference count by 1.
3159  *
3160  * Returns ref_entry on success;
3161  *         NULL on failure
3162  */
cram_ref_load(refs_t * r,int id)3163 ref_entry *cram_ref_load(refs_t *r, int id) {
3164     ref_entry *e = r->ref_id[id];
3165     int start = 1, end = e->length;
3166     char *seq;
3167 
3168     if (e->seq) {
3169 	return e;
3170     }
3171 
3172     assert(e->count == 0);
3173 
3174     if (r->last) {
3175 #ifdef REF_DEBUG
3176 	int idx = 0;
3177 	for (idx = 0; idx < r->nref; idx++)
3178 	    if (r->last == r->ref_id[idx])
3179 		break;
3180 	RP("%d cram_ref_load DECR %d => %d\n", gettid(), idx, r->last->count-1);
3181 #endif
3182 
3183 	assert(r->last->count > 0);
3184 	if (--r->last->count <= 0) {
3185 	    RP("%d FREE REF %d (%p)\n", gettid(), id, r->last->seq);
3186 	    if (r->last->seq)
3187 		ref_entry_free_seq(r->last);
3188 	}
3189     }
3190 
3191     /* Open file if it's not already the current open reference */
3192     if (strcmp(r->fn, e->fn) || r->fp == NULL) {
3193 	if (r->fp)
3194 	    bzi_close(r->fp);
3195 	r->fn = e->fn;
3196 	if (!(r->fp = bzi_open(r->fn, "r"))) {
3197 	    perror(r->fn);
3198 	    return NULL;
3199 	}
3200     }
3201 
3202     RP("%d Loading ref %d (%d..%d)\n", gettid(), id, start, end);
3203 
3204     if (!(seq = load_ref_portion(r->fp, e, start, end))) {
3205 	return NULL;
3206     }
3207 
3208     RP("%d Loaded ref %d (%d..%d) = %p\n", gettid(), id, start, end, seq);
3209 
3210     RP("%d INC REF %d, %d\n", gettid(), id, (int)(e->count+1));
3211     e->seq = seq;
3212     e->mf = NULL;
3213     e->count++;
3214 
3215     /*
3216      * Also keep track of last used ref so incr/decr loops on the same
3217      * sequence don't cause load/free loops.
3218      */
3219     RP("%d cram_ref_load INCR %d => %d\n", gettid(), id, e->count+1);
3220     r->last = e;
3221     e->count++;
3222 
3223     return e;
3224 }
3225 
3226 /*
3227  * Returns a portion of a reference sequence from start to end inclusive.
3228  * The returned pointer is owned by either the cram_file fd or by the
3229  * internal refs_t structure and should not be freed  by the caller.
3230  *
3231  * The difference is whether or not this refs_t is in use by just the one
3232  * cram_fd or by multiples, or whether we have multiple threads accessing
3233  * references. In either case fd->shared will be true and we start using
3234  * reference counting to track the number of users of a specific reference
3235  * sequence.
3236  *
3237  * Otherwise the ref seq returned is allocated as part of cram_fd itself
3238  * and will be freed up on the next call to cram_get_ref or cram_close.
3239  *
3240  * To return the entire reference sequence, specify start as 1 and end
3241  * as 0.
3242  *
3243  * To cease using a reference, call cram_ref_decr().
3244  *
3245  * Returns reference on success,
3246  *         NULL on failure
3247  */
cram_get_ref(cram_fd * fd,int id,int start,int end)3248 char *cram_get_ref(cram_fd *fd, int id, int start, int end) {
3249     ref_entry *r;
3250     char *seq;
3251     int ostart = start;
3252 
3253     if (id == -1)
3254 	return NULL;
3255 
3256     /* FIXME: axiomatic query of r->seq being true?
3257      * Or shortcut for unsorted data where we load once and never free?
3258      */
3259 
3260     //fd->shared_ref = 1; // hard code for now to simplify things
3261 
3262     if (fd->ref_lock) pthread_mutex_lock(fd->ref_lock);
3263 
3264     RP("%d cram_get_ref on fd %p, id %d, range %d..%d\n", gettid(), fd, id, start, end);
3265 
3266     /*
3267      * Unsorted data implies we want to fetch an entire reference at a time.
3268      * We just deal with this at the moment by claiming we're sharing
3269      * references instead, which has the same requirement.
3270      */
3271     if (fd->unsorted)
3272 	fd->shared_ref = 1;
3273 
3274 
3275     /* Sanity checking: does this ID exist? */
3276     if (id >= fd->refs->nref) {
3277 	fprintf(stderr, "No reference found for id %d\n", id);
3278 	if (fd->ref_lock) pthread_mutex_unlock(fd->ref_lock);
3279 	return NULL;
3280     }
3281 
3282     if (!fd->refs || !fd->refs->ref_id[id]) {
3283 	fprintf(stderr, "No reference found for id %d\n", id);
3284 	if (fd->ref_lock) pthread_mutex_unlock(fd->ref_lock);
3285 	return NULL;
3286     }
3287 
3288     if (!(r = fd->refs->ref_id[id])) {
3289 	fprintf(stderr, "No reference found for id %d\n", id);
3290 	if (fd->ref_lock) pthread_mutex_unlock(fd->ref_lock);
3291 	return NULL;
3292     }
3293 
3294 
3295     /*
3296      * It has an entry, but may not have been populated yet.
3297      * Any manually loaded .fai files have their lengths known.
3298      * A ref entry computed from @SQ lines (M5 or UR field) will have
3299      * r->length == 0 unless it's been loaded once and verified that we have
3300      * an on-disk filename for it.
3301      *
3302      * 19 Sep 2013: Moved the lock here as the cram_populate_ref code calls
3303      * open_path_mfile and libcurl, which isn't multi-thread safe unless I
3304      * rewrite my code to have one curl handle per thread.
3305      */
3306     pthread_mutex_lock(&fd->refs->lock);
3307     if (r->length == 0) {
3308 	if (cram_populate_ref(fd, id, r) == -1) {
3309 	    fprintf(stderr, "Failed to populate reference for id %d\n", id);
3310 	    pthread_mutex_unlock(&fd->refs->lock);
3311 	    if (fd->ref_lock) pthread_mutex_unlock(fd->ref_lock);
3312 	    return NULL;
3313 	}
3314 	r = fd->refs->ref_id[id];
3315 	if (fd->unsorted)
3316 	    cram_ref_incr_locked(fd->refs, id);
3317     }
3318 
3319 
3320     /*
3321      * We now know that we the filename containing the reference, so check
3322      * for limits. If it's over half the reference we'll load all of it in
3323      * memory as this will speed up subsequent calls.
3324      */
3325     if (end < 1)
3326 	end = r->length;
3327     if (end >= r->length)
3328 	end  = r->length;
3329     if (start < 1)
3330 	return NULL;
3331 
3332     if (end - start >= 0.5*r->length || fd->shared_ref) {
3333 	start = 1;
3334 	end = r->length;
3335     }
3336 
3337     /*
3338      * Maybe we have it cached already? If so use it.
3339      *
3340      * Alternatively if we don't have the sequence but we're sharing
3341      * references and/or are asking for the entire length of it, then
3342      * load the full reference into the refs structure and return
3343      * a pointer to that one instead.
3344      */
3345     if (fd->shared_ref || r->seq || (start == 1 && end == r->length)) {
3346 	char *cp;
3347 
3348 	if (id >= 0) {
3349 	    if (r->seq) {
3350 		cram_ref_incr_locked(fd->refs, id);
3351 	    } else {
3352 		ref_entry *e;
3353 		if (!(e = cram_ref_load(fd->refs, id))) {
3354 		    pthread_mutex_unlock(&fd->refs->lock);
3355 		    if (fd->ref_lock) pthread_mutex_unlock(fd->ref_lock);
3356 		    return NULL;
3357 		}
3358 
3359 		/* unsorted data implies cache ref indefinitely, to avoid
3360 		 * continually loading and unloading.
3361 		 */
3362 		if (fd->unsorted)
3363 		    cram_ref_incr_locked(fd->refs, id);
3364 	    }
3365 
3366 	    fd->ref = NULL; /* We never access it directly */
3367 	    fd->ref_start = 1;
3368 	    fd->ref_end   = r->length;
3369 	    fd->ref_id    = id;
3370 
3371 	    cp = fd->refs->ref_id[id]->seq + ostart-1;
3372 	} else {
3373 	    fd->ref = NULL;
3374 	    cp = NULL;
3375 	}
3376 
3377 	RP("%d cram_get_ref returning for id %d, count %d\n", gettid(), id, (int)r->count);
3378 
3379 	pthread_mutex_unlock(&fd->refs->lock);
3380 	if (fd->ref_lock) pthread_mutex_unlock(fd->ref_lock);
3381 	return cp;
3382     }
3383 
3384     /*
3385      * Otherwise we're not sharing, we don't have a copy of it already and
3386      * we're only asking for a small portion of it.
3387      *
3388      * In this case load up just that segment ourselves, freeing any old
3389      * small segments in the process.
3390      */
3391 
3392     /* Unmapped ref ID */
3393     if (id < 0) {
3394 	if (fd->ref_free) {
3395 	    free(fd->ref_free);
3396 	    fd->ref_free = NULL;
3397 	}
3398 	fd->ref = NULL;
3399 	fd->ref_id = id;
3400 	pthread_mutex_unlock(&fd->refs->lock);
3401 	if (fd->ref_lock) pthread_mutex_unlock(fd->ref_lock);
3402 	return NULL;
3403     }
3404 
3405     /* Open file if it's not already the current open reference */
3406     if (strcmp(fd->refs->fn, r->fn) || fd->refs->fp == NULL) {
3407 	if (fd->refs->fp)
3408 	    bzi_close(fd->refs->fp);
3409 	fd->refs->fn = r->fn;
3410 	if (!(fd->refs->fp = bzi_open(fd->refs->fn, "r"))) {
3411 	    perror(fd->refs->fn);
3412 	    pthread_mutex_unlock(&fd->refs->lock);
3413 	    if (fd->ref_lock) pthread_mutex_unlock(fd->ref_lock);
3414 	    return NULL;
3415 	}
3416     }
3417 
3418     if (!(fd->ref = load_ref_portion(fd->refs->fp, r, start, end))) {
3419 	pthread_mutex_unlock(&fd->refs->lock);
3420 	if (fd->ref_lock) pthread_mutex_unlock(fd->ref_lock);
3421 	return NULL;
3422     }
3423 
3424     if (fd->ref_free)
3425 	free(fd->ref_free);
3426 
3427     fd->ref_id    = id;
3428     fd->ref_start = start;
3429     fd->ref_end   = end;
3430     fd->ref_free = fd->ref;
3431     seq = fd->ref;
3432 
3433     pthread_mutex_unlock(&fd->refs->lock);
3434     if (fd->ref_lock) pthread_mutex_unlock(fd->ref_lock);
3435 
3436     return seq + ostart - start;
3437 }
3438 
3439 /*
3440  * If fd has been opened for reading, it may be permitted to specify 'fn'
3441  * as NULL and let the code auto-detect the reference by parsing the
3442  * SAM header @SQ lines.
3443  */
cram_load_reference(cram_fd * fd,char * fn)3444 int cram_load_reference(cram_fd *fd, char *fn) {
3445     int ret = 0;
3446 
3447     if (fn) {
3448 	fd->refs = refs_load_fai(fd->refs, fn,
3449 				 !(fd->embed_ref && fd->mode == 'r'));
3450 	fn = fd->refs ? fd->refs->fn : NULL;
3451         if (!fn)
3452             ret = -1;
3453 	sanitise_SQ_lines(fd);
3454     }
3455     fd->ref_fn = fn;
3456 
3457     if (!fd->refs && fd->header) {
3458 	if (!(fd->refs = refs_create()))
3459 	    return -1;
3460 	if (-1 == refs_from_header(fd->refs, fd, fd->header))
3461 	    return -1;
3462     }
3463 
3464     if (-1 == refs2id(fd->refs, fd->header))
3465 	return -1;
3466 
3467     return ret;
3468 }
3469 
3470 /* ----------------------------------------------------------------------
3471  * Containers
3472  */
3473 
3474 /*
3475  * Creates a new container, specifying the maximum number of slices
3476  * and records permitted.
3477  *
3478  * Returns cram_container ptr on success
3479  *         NULL on failure
3480  */
cram_new_container(int nrec,int nslice)3481 cram_container *cram_new_container(int nrec, int nslice) {
3482     cram_container *c = calloc(1, sizeof(*c));
3483     enum cram_DS_ID id;
3484 
3485     if (!c)
3486 	return NULL;
3487 
3488     c->curr_ref = -2;
3489 
3490     c->max_c_rec = nrec * nslice;
3491     c->curr_c_rec = 0;
3492 
3493     c->max_rec = nrec;
3494     c->record_counter = 0;
3495     c->num_bases = 0;
3496     c->s_num_bases = 0;
3497 
3498     c->max_slice = nslice;
3499     c->curr_slice = 0;
3500 
3501     c->pos_sorted = 1;
3502     c->max_apos   = 0;
3503     c->multi_seq  = 0;
3504 
3505     c->bams = NULL;
3506 
3507     if (!(c->slices = (cram_slice **)calloc(nslice, sizeof(cram_slice *))))
3508 	goto err;
3509     c->slice = NULL;
3510 
3511     if (!(c->comp_hdr = cram_new_compression_header()))
3512 	goto err;
3513     c->comp_hdr_block = NULL;
3514 
3515     for (id = DS_RN; id < DS_TN; id++)
3516 	if (!(c->stats[id] = cram_stats_create())) goto err;
3517 
3518     //c->aux_B_stats = cram_stats_create();
3519 
3520     if (!(c->tags_used = HashTableCreate(16, HASH_DYNAMIC_SIZE)))
3521 	goto err;
3522     c->refs_used = 0;
3523 
3524     c->last_name = "";
3525 
3526     c->crc32 = 0;
3527 
3528     return c;
3529 
3530  err:
3531     if (c) {
3532 	if (c->slices)
3533 	    free(c->slices);
3534 	free(c);
3535     }
3536     return NULL;
3537 }
3538 
cram_free_container(cram_container * c)3539 void cram_free_container(cram_container *c) {
3540     enum cram_DS_ID id;
3541     int i;
3542 
3543     if (!c)
3544 	return;
3545 
3546     if (c->refs_used)
3547 	free(c->refs_used);
3548 
3549     if (c->landmark)
3550 	free(c->landmark);
3551 
3552     if (c->comp_hdr)
3553 	cram_free_compression_header(c->comp_hdr);
3554 
3555     if (c->comp_hdr_block)
3556 	cram_free_block(c->comp_hdr_block);
3557 
3558     if (c->slices) {
3559 	for (i = 0; i < c->max_slice; i++)
3560 	    if (c->slices[i])
3561 		cram_free_slice(c->slices[i]);
3562 	free(c->slices);
3563     }
3564 
3565     for (id = DS_RN; id < DS_TN; id++)
3566 	if (c->stats[id]) cram_stats_free(c->stats[id]);
3567 
3568     //if (c->aux_B_stats) cram_stats_free(c->aux_B_stats);
3569 
3570     if (c->tags_used) {
3571         HashItem *hi;
3572         HashIter *iter = HashTableIterCreate();
3573 
3574 	while (iter && (hi = HashTableIterNext(c->tags_used, iter))) {
3575 	    cram_tag_map *tm = (cram_tag_map *)hi->data.p;
3576 	    cram_codec *c = tm->codec;
3577 
3578 	    if (c && c->free) c->free(c);
3579 	    free(tm);
3580 	}
3581 
3582 	HashTableDestroy(c->tags_used, 0);
3583 	HashTableIterDestroy(iter);
3584     }
3585 
3586     free(c);
3587 }
3588 
3589 /*
3590  * Reads a container header.
3591  *
3592  * Returns cram_container on success
3593  *         NULL on failure or no container left (fd->err == 0).
3594  */
cram_read_container(cram_fd * fd)3595 cram_container *cram_read_container(cram_fd *fd) {
3596     cram_container c2, *c;
3597     int i, s;
3598     size_t rd = 0;
3599     uint32_t crc = 0;
3600 
3601     fd->err = 0;
3602     fd->eof = 0;
3603 
3604     memset(&c2, 0, sizeof(c2));
3605     if (IS_CRAM_1_VERS(fd)) {
3606 	if ((s = itf8_decode_crc(fd, &c2.length, &crc)) == -1) {
3607 	    fd->eof = 1;
3608 	    return NULL;
3609 	} else {
3610 	    rd+=s;
3611 	}
3612     } else {
3613 	uint32_t len;
3614 	if ((s = int32_decode(fd, &c2.length)) == -1) {
3615 	    if (CRAM_MAJOR_VERS(fd->version) == 2 &&
3616 		CRAM_MINOR_VERS(fd->version) == 0)
3617 		fd->eof = 1; // EOF blocks arrived in v2.1
3618 	    else
3619 		fd->eof = fd->empty_container ? 1 : 2;
3620 	    return NULL;
3621 	} else {
3622 	    rd+=s;
3623 	}
3624 	len = le_int4(c2.length);
3625 	crc = iolib_crc32(0L, (unsigned char *)&len, 4);
3626     }
3627     if ((s = itf8_decode_crc(fd, &c2.ref_seq_id, &crc))   == -1) return NULL; else rd+=s;
3628     if (CRAM_MAJOR_VERS(fd->version) >= 4) {
3629 	int64_t i64;
3630 	if ((s = ltf8_decode_crc(fd, &i64, &crc))== -1) return NULL; else rd+=s;
3631 	c2.ref_seq_start = i64;
3632 	if ((s = ltf8_decode_crc(fd, &i64, &crc)) == -1) return NULL; else rd+=s;
3633 	c2.ref_seq_span = i64;
3634     } else {
3635 	int32_t i32;
3636 	if ((s = itf8_decode_crc(fd, &i32, &crc))== -1) return NULL; else rd+=s;
3637 	c2.ref_seq_start = i32;
3638 	if ((s = itf8_decode_crc(fd, &i32, &crc)) == -1) return NULL; else rd+=s;
3639 	c2.ref_seq_span = i32;
3640     }
3641     if ((s = itf8_decode_crc(fd, &c2.num_records, &crc))  == -1) return NULL; else rd+=s;
3642 
3643     if (IS_CRAM_1_VERS(fd)) {
3644 	c2.record_counter = 0;
3645 	c2.num_bases = 0;
3646     } else {
3647 	if (IS_CRAM_3_VERS(fd)) {
3648 	    if ((s = ltf8_decode_crc(fd, &c2.record_counter, &crc)) == -1)
3649 		return NULL;
3650 	    else
3651 		rd += s;
3652 	} else {
3653 	    int32_t i32;
3654 	    if ((s = itf8_decode_crc(fd, &i32, &crc)) == -1)
3655 		return NULL;
3656 	    else
3657 		rd += s;
3658 	    c2.record_counter = i32;
3659 	}
3660 
3661 	if ((s = ltf8_decode_crc(fd, &c2.num_bases, &crc))== -1)
3662 	    return NULL;
3663 	else
3664 	    rd += s;
3665     }
3666     if ((s = itf8_decode_crc(fd, &c2.num_blocks, &crc))   == -1) return NULL; else rd+=s;
3667     if ((s = itf8_decode_crc(fd, &c2.num_landmarks, &crc))== -1) return NULL; else rd+=s;
3668 
3669     if (!(c = calloc(1, sizeof(*c))))
3670 	return NULL;
3671 
3672     *c = c2;
3673 
3674     if (!(c->landmark = malloc(c->num_landmarks * sizeof(int32_t))) &&
3675 	c->num_landmarks) {
3676 	fd->err = errno;
3677 	cram_free_container(c);
3678 	return NULL;
3679     }
3680     for (i = 0; i < c->num_landmarks; i++) {
3681 	if ((s = itf8_decode_crc(fd, &c->landmark[i], &crc)) == -1) {
3682 	    cram_free_container(c);
3683 	    return NULL;
3684 	} else {
3685 	    rd += s;
3686 	}
3687     }
3688 
3689     if (IS_CRAM_3_VERS(fd)) {
3690 	if (-1 == int32_decode(fd, (int32_t *)&c->crc32))
3691 	    return NULL;
3692 	else
3693 	    rd+=4;
3694 
3695 	if (!fd->ignore_md5 && crc != c->crc32) {
3696 	    fprintf(stderr, "Container header CRC32 failure\n");
3697 	    cram_free_container(c);
3698 	    return NULL;
3699 	}
3700     }
3701 
3702     c->offset = rd;
3703     c->slices = NULL;
3704     c->curr_slice = 0;
3705     c->max_slice = c->num_landmarks;
3706     c->slice_rec = 0;
3707     c->curr_rec = 0;
3708     c->max_rec = 0;
3709 
3710     if (c->ref_seq_id == -2) {
3711 	c->multi_seq = 1;
3712 	fd->multi_seq = 1;
3713     }
3714 
3715     fd->empty_container =
3716 	(c->num_records == 0 &&
3717 	 c->ref_seq_id == -1 &&
3718 	 c->ref_seq_start == 0x454f46 /* EOF */) ? 1 : 0;
3719 
3720     return c;
3721 }
3722 
3723 /*
3724  * Writes a container structure.
3725  *
3726  * Returns 0 on success
3727  *        -1 on failure
3728  */
cram_write_container(cram_fd * fd,cram_container * c)3729 int cram_write_container(cram_fd *fd, cram_container *c) {
3730     char buf_a[1024], *buf = buf_a, *cp;
3731     int i;
3732 
3733     // worse case sizes given 32-bit & 64-bit quantities.
3734     if (61 + c->num_landmarks * 10 >= 1024)
3735 	buf = malloc(61 + c->num_landmarks * 10);
3736     cp = buf;
3737 
3738     *(int32_t *)cp = le_int4(c->length);
3739     cp += 4;
3740 
3741     if (c->multi_seq) {
3742 	cp += itf8_put(cp, -2);
3743 	cp += itf8_put(cp, 0);
3744 	cp += itf8_put(cp, 0);
3745     } else {
3746 	cp += itf8_put(cp, c->ref_seq_id);
3747 	if (CRAM_MAJOR_VERS(fd->version) >= 4) {
3748 	    cp += ltf8_put(cp, c->ref_seq_start);
3749 	    cp += ltf8_put(cp, c->ref_seq_span);
3750 	} else {
3751 	    cp += itf8_put(cp, c->ref_seq_start);
3752 	    cp += itf8_put(cp, c->ref_seq_span);
3753 	}
3754     }
3755     cp += itf8_put(cp, c->num_records);
3756     if (IS_CRAM_3_VERS(fd))
3757 	cp += ltf8_put(cp, c->record_counter);
3758     else
3759 	cp += itf8_put(cp, c->record_counter);
3760     cp += ltf8_put(cp, c->num_bases);
3761     cp += itf8_put(cp, c->num_blocks);
3762     cp += itf8_put(cp, c->num_landmarks);
3763     for (i = 0; i < c->num_landmarks; i++)
3764 	cp += itf8_put(cp, c->landmark[i]);
3765 
3766     if (IS_CRAM_3_VERS(fd)) {
3767 	c->crc32 = iolib_crc32(0L, (uc *)buf, cp-buf);
3768 	cp[0] =  c->crc32        & 0xff;
3769 	cp[1] = (c->crc32 >>  8) & 0xff;
3770 	cp[2] = (c->crc32 >> 16) & 0xff;
3771 	cp[3] = (c->crc32 >> 24) & 0xff;
3772 	cp += 4;
3773     }
3774 
3775     if (cp-buf != CRAM_IO_WRITE(buf, 1, cp-buf, fd)) {
3776 	if (buf != buf_a)
3777 	    free(buf);
3778 	return -1;
3779     }
3780 
3781     if (buf != buf_a)
3782 	free(buf);
3783 
3784     return 0;
3785 }
3786 
3787 // common component shared by cram_flush_container{,_mt}
cram_flush_container2(cram_fd * fd,cram_container * c)3788 static int cram_flush_container2(cram_fd *fd, cram_container *c) {
3789     int i, j;
3790 
3791     if (c->curr_slice > 0 && !c->slices)
3792 	return -1;
3793 
3794     //fprintf(stderr, "Writing container %d, sum %u\n", c->record_counter, sum);
3795 
3796     /* Write the container struct itself */
3797     if (0 != cram_write_container(fd, c))
3798 	return -1;
3799 
3800     /* And the compression header */
3801     if (0 != cram_write_block(fd, c->comp_hdr_block))
3802 	return -1;
3803 
3804     /* Followed by the slice blocks */
3805     for (i = 0; i < c->curr_slice; i++) {
3806 	cram_slice *s = c->slices[i];
3807 
3808 	if (0 != cram_write_block(fd, s->hdr_block))
3809 	    return -1;
3810 
3811 	for (j = 0; j < s->hdr->num_blocks; j++) {
3812 	    if (0 != cram_write_block(fd, s->block[j]))
3813 		return -1;
3814 	}
3815     }
3816 
3817     return CRAM_IO_FLUSH(fd) == 0 ? 0 : -1;
3818 }
3819 
3820 /*
3821  * Flushes a completely or partially full container to disk, writing
3822  * container structure, header and blocks. This also calls the encoder
3823  * functions.
3824  *
3825  * Returns 0 on success
3826  *        -1 on failure
3827  */
cram_flush_container(cram_fd * fd,cram_container * c)3828 int cram_flush_container(cram_fd *fd, cram_container *c) {
3829     /* Encode the container blocks and generate compression header */
3830     if (0 != cram_encode_container(fd, c))
3831 	return -1;
3832 
3833     return cram_flush_container2(fd, c);
3834 }
3835 
3836 typedef struct {
3837     cram_fd *fd;
3838     cram_container *c;
3839 } cram_job;
3840 
cram_flush_thread(void * arg)3841 void *cram_flush_thread(void *arg) {
3842     cram_job *j = (cram_job *)arg;
3843 
3844     /* Encode the container blocks and generate compression header */
3845     if (0 != cram_encode_container(j->fd, j->c)) {
3846 	fprintf(stderr, "cram_encode_container failed\n");
3847 	return NULL;
3848     }
3849 
3850     return arg;
3851 }
3852 
cram_flush_result(cram_fd * fd)3853 static int cram_flush_result(cram_fd *fd) {
3854     int i, ret = 0;
3855     t_pool_result *r;
3856     cram_container *lc = NULL;
3857 
3858     // NB: we can have one result per slice, not per container,
3859     // so we need to free the container only after all slices
3860     // within it have been freed.  (Automatic via reference counting.)
3861     while ((r = t_pool_next_result(fd->rqueue))) {
3862 	cram_job *j = (cram_job *)r->data;
3863 	cram_container *c;
3864 
3865 	if (!j) {
3866 	    t_pool_delete_result(r, 0);
3867 	    return -1;
3868 	}
3869 
3870 	fd = j->fd;
3871 	c = j->c;
3872 
3873 	if (fd->mode == 'w')
3874 	    if (0 != cram_flush_container2(fd, c))
3875 		return -1;
3876 
3877 	/* Free the container */
3878 	for (i = 0; i < c->max_slice; i++) {
3879 	    if (c->slices && c->slices[i]) {
3880 		cram_free_slice(c->slices[i]);
3881 		c->slices[i] = NULL;
3882 	    }
3883 	}
3884 
3885 	c->slice = NULL;
3886 	c->curr_slice = 0;
3887 
3888 	// Our jobs will be in order, so we free the last
3889 	// container when our job has switched to a new one.
3890 	if (c != lc) {
3891 	    if (lc) {
3892 		cram_free_container(lc);
3893 		if (fd->ctr == lc)
3894 		    fd->ctr = NULL;
3895 		if (fd->ctr_mt == lc)
3896 		    fd->ctr_mt = NULL;
3897 	    }
3898 	    lc = c;
3899 	}
3900 
3901 	ret |= CRAM_IO_FLUSH(fd) == 0 ? 0 : -1;
3902 
3903 	t_pool_delete_result(r, 1);
3904     }
3905     if (lc) {
3906 	cram_free_container(lc);
3907 	if (fd->ctr == lc)
3908 	    fd->ctr = NULL;
3909 	if (fd->ctr_mt == lc)
3910 	    fd->ctr_mt = NULL;
3911     }
3912 
3913     return ret;
3914 }
3915 
cram_flush_container_mt(cram_fd * fd,cram_container * c)3916 int cram_flush_container_mt(cram_fd *fd, cram_container *c) {
3917     cram_job *j;
3918 
3919     if (!fd->pool)
3920 	return cram_flush_container(fd, c);
3921 
3922     if (!(j = malloc(sizeof(*j))))
3923 	return -1;
3924     j->fd = fd;
3925     j->c = c;
3926 
3927     t_pool_dispatch(fd->pool, fd->rqueue, cram_flush_thread, j);
3928 
3929     return cram_flush_result(fd);
3930 }
3931 
3932 /* ----------------------------------------------------------------------
3933  * Compression headers; the first part of the container
3934  */
3935 
3936 /*
3937  * Creates a new blank container compression header
3938  *
3939  * Returns header ptr on success
3940  *         NULL on failure
3941  */
cram_new_compression_header(void)3942 cram_block_compression_hdr *cram_new_compression_header(void) {
3943     cram_block_compression_hdr *hdr = calloc(1, sizeof(*hdr));
3944     if (!hdr)
3945 	return NULL;
3946 
3947     if (!(hdr->TD_blk = cram_new_block(CORE, 0))) {
3948 	free(hdr);
3949 	return NULL;
3950     }
3951 
3952     if (!(hdr->TD = HashTableCreate(16, HASH_DYNAMIC_SIZE))) {
3953 	cram_free_block(hdr->TD_blk);
3954 	free(hdr);
3955 	return NULL;
3956     }
3957 
3958     return hdr;
3959 }
3960 
cram_free_compression_header(cram_block_compression_hdr * hdr)3961 void cram_free_compression_header(cram_block_compression_hdr *hdr) {
3962     int i;
3963 
3964     if (hdr->landmark)
3965 	free(hdr->landmark);
3966 
3967     if (hdr->preservation_map)
3968 	HashTableDestroy(hdr->preservation_map, 0);
3969 
3970     for (i = 0; i < CRAM_MAP_HASH; i++) {
3971 	cram_map *m, *m2;
3972 	for (m = hdr->rec_encoding_map[i]; m; m = m2) {
3973 	    m2 = m->next;
3974 	    if (m->codec)
3975 		m->codec->free(m->codec);
3976 	    free(m);
3977 	}
3978     }
3979 
3980     for (i = 0; i < CRAM_MAP_HASH; i++) {
3981 	cram_map *m, *m2;
3982 	for (m = hdr->tag_encoding_map[i]; m; m = m2) {
3983 	    m2 = m->next;
3984 	    if (m->codec)
3985 		m->codec->free(m->codec);
3986 	    free(m);
3987 	}
3988     }
3989 
3990     for (i = 0; i < DS_END; i++) {
3991 	if (hdr->codecs[i])
3992 	    hdr->codecs[i]->free(hdr->codecs[i]);
3993     }
3994 
3995     if (hdr->TL)
3996 	free(hdr->TL);
3997     if (hdr->TD_blk)
3998 	cram_free_block(hdr->TD_blk);
3999     if (hdr->TD)
4000 	HashTableDestroy(hdr->TD, 0);
4001 
4002     free(hdr);
4003 }
4004 
4005 
4006 /* ----------------------------------------------------------------------
4007  * Slices and slice headers
4008  */
4009 
cram_free_slice_header(cram_block_slice_hdr * hdr)4010 void cram_free_slice_header(cram_block_slice_hdr *hdr) {
4011     if (!hdr)
4012 	return;
4013 
4014     if (hdr->block_content_ids)
4015 	free(hdr->block_content_ids);
4016 
4017     if (hdr->tags)
4018 	HashTableDestroy(hdr->tags, 0);
4019 
4020     free(hdr);
4021 
4022     return;
4023 }
4024 
cram_free_slice(cram_slice * s)4025 void cram_free_slice(cram_slice *s) {
4026     if (!s)
4027 	return;
4028 
4029     if (s->bl)
4030 	free(s->bl);
4031 
4032     if (s->hdr_block)
4033 	cram_free_block(s->hdr_block);
4034 
4035     if (s->block) {
4036 	int i;
4037 
4038 	if (s->hdr) {
4039 	    for (i = 0; i < s->hdr->num_blocks; i++) {
4040 		cram_free_block(s->block[i]);
4041 	    }
4042 	}
4043 	free(s->block);
4044     }
4045 
4046     if (s->block_by_id)
4047 	free(s->block_by_id);
4048 
4049     if (s->hdr)
4050 	cram_free_slice_header(s->hdr);
4051 
4052     if (s->seqs_blk)
4053 	cram_free_block(s->seqs_blk);
4054 
4055     if (s->qual_blk)
4056 	cram_free_block(s->qual_blk);
4057 
4058     if (s->name_blk)
4059 	cram_free_block(s->name_blk);
4060 
4061     if (s->aux_blk)
4062 	cram_free_block(s->aux_blk);
4063 
4064     if (s->base_blk)
4065 	cram_free_block(s->base_blk);
4066 
4067     if (s->soft_blk)
4068 	cram_free_block(s->soft_blk);
4069 
4070 #ifdef TN_external
4071     if (s->tn_blk)
4072 	cram_free_block(s->tn_blk);
4073 #endif
4074 
4075     if (s->cigar)
4076 	free(s->cigar);
4077 
4078     if (s->crecs)
4079 	free(s->crecs);
4080 
4081     if (s->features)
4082 	free(s->features);
4083 
4084 #ifndef TN_external
4085     if (s->TN)
4086 	free(s->TN);
4087 #endif
4088 
4089     if (s->pair[0])
4090 	HashTableDestroy(s->pair[0], 0);
4091     if (s->pair[1])
4092 	HashTableDestroy(s->pair[1], 0);
4093 
4094     if (s->aux_block)
4095 	free(s->aux_block);
4096 
4097     free(s);
4098 }
4099 
4100 /*
4101  * Creates a new empty slice in memory, for subsequent writing to
4102  * disk.
4103  *
4104  * Returns cram_slice ptr on success
4105  *         NULL on failure
4106  */
cram_new_slice(enum cram_content_type type,int nrecs)4107 cram_slice *cram_new_slice(enum cram_content_type type, int nrecs) {
4108     cram_slice *s = calloc(1, sizeof(*s));
4109     if (!s)
4110 	return NULL;
4111 
4112     if (!(s->hdr = (cram_block_slice_hdr *)calloc(1, sizeof(*s->hdr))))
4113 	goto err;
4114     s->hdr->content_type = type;
4115 
4116     s->hdr_block = NULL;
4117     s->block = NULL;
4118     s->block_by_id = NULL;
4119     s->last_apos = 0;
4120     if (!(s->crecs = malloc(nrecs * sizeof(cram_record))))  goto err;
4121     s->cigar = NULL;
4122     s->cigar_alloc = 0;
4123     s->ncigar = 0;
4124 
4125     if (!(s->seqs_blk = cram_new_block(EXTERNAL, 0)))       goto err;
4126     if (!(s->qual_blk = cram_new_block(EXTERNAL, DS_QS)))   goto err;
4127     if (!(s->name_blk = cram_new_block(EXTERNAL, DS_RN)))   goto err;
4128     if (!(s->aux_blk  = cram_new_block(EXTERNAL, DS_aux)))  goto err;
4129     if (!(s->base_blk = cram_new_block(EXTERNAL, DS_IN)))   goto err;
4130     if (!(s->soft_blk = cram_new_block(EXTERNAL, DS_SC)))   goto err;
4131 
4132     s->features = NULL;
4133     s->nfeatures = s->afeatures = 0;
4134 
4135 #ifndef TN_external
4136     s->TN = NULL;
4137     s->nTN = s->aTN = 0;
4138 #endif
4139 
4140     // Volatile keys as we do realloc in dstring
4141     if (!(s->pair[0] = HashTableCreate(10000, HASH_DYNAMIC_SIZE)))   goto err;
4142     if (!(s->pair[1] = HashTableCreate(10000, HASH_DYNAMIC_SIZE)))   goto err;
4143 
4144 #ifdef BA_external
4145     s->BA_len = 0;
4146 #endif
4147 
4148     //memset(&s->blocks[0], 0, 1024*sizeof(s->blocks[0]));
4149     //if (!(s->blocks[ID("QS")] = cram_new_block(EXTERNAL, ID("QS")))) goto err;
4150     //if (!(s->blocks[ID("RN")] = cram_new_block(EXTERNAL, ID("RN")))) goto err;
4151     //if (!(s->blocks[ID("IN")] = cram_new_block(EXTERNAL, ID("IN")))) goto err;
4152     //if (!(s->blocks[ID("SC")] = cram_new_block(EXTERNAL, ID("SC")))) goto err;
4153 
4154     s->BD_crc = 0;
4155     s->SD_crc = 0;
4156 
4157     return s;
4158 
4159  err:
4160     if (s)
4161 	cram_free_slice(s);
4162 
4163     return NULL;
4164 }
4165 
4166 /*
4167  * Loads an entire slice.
4168  * FIXME: In 1.0 the native unit of slices within CRAM is broken
4169  * as slices contain references to objects in other slices.
4170  * To work around this while keeping the slice oriented outer loop
4171  * we read all slices and stitch them together into a fake large
4172  * slice instead.
4173  *
4174  * Returns cram_slice ptr on success
4175  *         NULL on failure
4176  */
cram_read_slice(cram_fd * fd)4177 cram_slice *cram_read_slice(cram_fd *fd) {
4178     cram_block *b = cram_read_block(fd);
4179     cram_slice *s = calloc(1, sizeof(*s));
4180     int i, n, max_id, min_id;
4181 
4182     if (!b || !s)
4183 	goto err;
4184 
4185     s->hdr_block = b;
4186     switch (b->content_type) {
4187     case MAPPED_SLICE:
4188     case UNMAPPED_SLICE:
4189 	if (!(s->hdr = cram_decode_slice_header(fd, b)))
4190 	    goto err;
4191 	break;
4192 
4193     default:
4194 	fprintf(stderr, "Unexpected block of type %s\n",
4195 		cram_content_type2str(b->content_type));
4196 	goto err;
4197     }
4198 
4199     if (s->hdr->num_blocks < 1) {
4200         fprintf(stderr, "Slice does not include any data blocks.\n");
4201 	goto err;
4202     }
4203 
4204     s->block = calloc(n = s->hdr->num_blocks, sizeof(*s->block));
4205     if (!s->block)
4206 	goto err;
4207 
4208     for (max_id = i = 0, min_id = INT_MAX; i < n; i++) {
4209 	if (!(s->block[i] = cram_read_block(fd)))
4210 	    goto err;
4211 
4212 	if (s->block[i]->content_type == EXTERNAL) {
4213 	    if (max_id < s->block[i]->content_id)
4214 		max_id = s->block[i]->content_id;
4215 	    if (min_id > s->block[i]->content_id)
4216 		min_id = s->block[i]->content_id;
4217 	}
4218     }
4219     if (!(s->block_by_id = calloc(512, sizeof(s->block[0]))))
4220 	goto err;
4221 
4222     for (i = 0; i < n; i++) {
4223 	if (s->block[i]->content_type != EXTERNAL)
4224 	    continue;
4225 	int v = s->block[i]->content_id;
4226 	if (v < 0 || v >= 512)
4227 	    v = 256 + (v > 0 ? v % 251 : (-v) % 251);
4228 	s->block_by_id[v] = s->block[i];
4229     }
4230 
4231     /* Initialise encoding/decoding tables */
4232     s->cigar = NULL;
4233     s->cigar_alloc = 0;
4234     s->ncigar = 0;
4235 
4236     if (!(s->seqs_blk = cram_new_block(EXTERNAL, 0)))      goto err;
4237     if (!(s->qual_blk = cram_new_block(EXTERNAL, DS_QS)))  goto err;
4238     if (!(s->name_blk = cram_new_block(EXTERNAL, DS_RN)))  goto err;
4239     if (!(s->aux_blk  = cram_new_block(EXTERNAL, DS_aux))) goto err;
4240     if (!(s->base_blk = cram_new_block(EXTERNAL, DS_IN)))  goto err;
4241     if (!(s->soft_blk = cram_new_block(EXTERNAL, DS_SC)))  goto err;
4242 
4243     s->crecs = NULL;
4244 
4245     s->last_apos = s->hdr->ref_seq_start;
4246     s->decode_md = fd->decode_md;
4247 
4248     return s;
4249 
4250  err:
4251     if (b)
4252 	cram_free_block(b);
4253     if (s) {
4254 	s->hdr_block = NULL;
4255 	cram_free_slice(s);
4256     }
4257     return NULL;
4258 }
4259 
4260 
4261 /* ----------------------------------------------------------------------
4262  * CRAM file definition (header)
4263  */
4264 
4265 /*
4266  * Reads a CRAM file definition structure.
4267  * Returns file_def ptr on success
4268  *         NULL on failure
4269  */
cram_read_file_def(cram_fd * fd)4270 cram_file_def *cram_read_file_def(cram_fd *fd) {
4271     cram_file_def *def = malloc(sizeof(*def));
4272     if (!def)
4273 	return NULL;
4274 
4275     if (26 != CRAM_IO_READ(&def->magic[0], 1, 26, fd)) {
4276 	free(def);
4277 	return NULL;
4278     }
4279 
4280     if (memcmp(def->magic, "CRAM", 4) != 0) {
4281 	free(def);
4282 	return NULL;
4283     }
4284 
4285     if (def->major_version > 4) {
4286 	fprintf(stderr, "CRAM version number mismatch\n"
4287 		"Expected 1.x, 2.x, 3.x or 4.x, got %d.%d\n",
4288 		def->major_version, def->minor_version);
4289 	free(def);
4290 	return NULL;
4291     }
4292 
4293     fd->first_container += 26;
4294     fd->last_slice = 0;
4295 
4296     return def;
4297 }
4298 
4299 /*
4300  * Writes a cram_file_def structure to cram_fd.
4301  * Returns 0 on success
4302  *        -1 on failure
4303  */
cram_write_file_def(cram_fd * fd,cram_file_def * def)4304 int cram_write_file_def(cram_fd *fd, cram_file_def *def) {
4305     return (CRAM_IO_WRITE(&def->magic[0], 1, 26, fd) == 26) ? 0 : -1;
4306 }
4307 
cram_free_file_def(cram_file_def * def)4308 void cram_free_file_def(cram_file_def *def) {
4309     if (def) free(def);
4310 }
4311 
4312 /* ----------------------------------------------------------------------
4313  * SAM header I/O
4314  */
4315 
4316 
4317 /*
4318  * Reads the SAM header from the first CRAM data block.
4319  * Also performs minimal parsing to extract read-group
4320  * and sample information.
4321 
4322  * Returns SAM hdr ptr on success
4323  *         NULL on failure
4324  */
cram_read_SAM_hdr(cram_fd * fd)4325 SAM_hdr *cram_read_SAM_hdr(cram_fd *fd) {
4326     int32_t header_len;
4327     char *header;
4328     SAM_hdr *hdr;
4329 
4330     /* 1.1 onwards stores the header in the first block of a container */
4331     if (IS_CRAM_1_VERS(fd)) {
4332 	/* Length */
4333 	if (-1 == int32_decode(fd, &header_len))
4334 	    return NULL;
4335 
4336 	/* Alloc and read */
4337 	if (header_len < 0 || NULL == (header = malloc((size_t) header_len+1)))
4338 	    return NULL;
4339 
4340 	if (header_len != CRAM_IO_READ(header, 1, header_len, fd))
4341 	    return NULL;
4342 	header[header_len] = '\0';
4343 
4344 	fd->first_container += 4 + header_len;
4345     } else {
4346 	cram_container *c = cram_read_container(fd);
4347 	cram_block *b;
4348 	int i, len;
4349 
4350 	if (!c)
4351 	    return NULL;
4352 
4353 	fd->first_container += c->length + c->offset;
4354 
4355 	if (c->num_blocks < 1) {
4356 	    cram_free_container(c);
4357 	    return NULL;
4358 	}
4359 
4360 	if (!(b = cram_read_block(fd))) {
4361 	    cram_free_container(c);
4362 	    return NULL;
4363 	}
4364 	if (cram_uncompress_block(b) != 0) {
4365 	    cram_free_container(c);
4366 	    cram_free_block(b);
4367 	    return NULL;
4368 	}
4369 
4370 	len = b->comp_size + 2 + 4*IS_CRAM_3_VERS(fd) +
4371 	    itf8_size(b->content_id) +
4372 	    itf8_size(b->uncomp_size) +
4373 	    itf8_size(b->comp_size);
4374 
4375 	/* Extract header from 1st block */
4376 	if (-1 == int32_get_blk(b, &header_len) ||
4377             header_len < 0 || /* Spec. says signed...  why? */
4378 	    b->uncomp_size - 4 < header_len) {
4379 	    cram_free_container(c);
4380 	    cram_free_block(b);
4381 	    return NULL;
4382 	}
4383 	if (NULL == (header = malloc((size_t) header_len+1))) {
4384 	    cram_free_container(c);
4385 	    cram_free_block(b);
4386 	    return NULL;
4387 	}
4388 	memcpy(header, BLOCK_END(b), header_len);
4389 	header[header_len] = '\0';
4390 	cram_free_block(b);
4391 
4392 	/* Consume any remaining blocks */
4393 	for (i = 1; i < c->num_blocks; i++) {
4394 	    if (!(b = cram_read_block(fd))) {
4395 		cram_free_container(c);
4396 		return NULL;
4397 	    }
4398 	    len += b->comp_size + 2 + 4*IS_CRAM_3_VERS(fd) +
4399 		itf8_size(b->content_id) +
4400 		itf8_size(b->uncomp_size) +
4401 		itf8_size(b->comp_size);
4402 	    cram_free_block(b);
4403 	}
4404 
4405 	if (c->length > 0 && len > 0 && c->length > len) {
4406 	    // Consume padding
4407 	    char *pads = malloc(c->length - len);
4408 	    if (!pads) {
4409 		cram_free_container(c);
4410 		return NULL;
4411 	    }
4412 
4413 	    if (c->length - len != CRAM_IO_READ(pads, 1, c->length - len, fd)) {
4414 		cram_free_container(c);
4415 		return NULL;
4416 	    }
4417 	    free(pads);
4418 	}
4419 
4420 	cram_free_container(c);
4421     }
4422 
4423     /* Parse */
4424 #ifdef SAMTOOLS
4425     hdr = sam_hdr_parse_(header, header_len);
4426 #else
4427     hdr = sam_hdr_parse(header, header_len);
4428 #endif
4429     free(header);
4430 
4431     return hdr;
4432 }
4433 
4434 /*
4435  * Converts 'in' to a full pathname to store in out.
4436  * Out must be at least PATH_MAX bytes long.
4437  */
full_path(char * out,char * in)4438 static void full_path(char *out, char *in) {
4439     if (*in == '/') {
4440 	strncpy(out, in, PATH_MAX);
4441 	out[PATH_MAX-1] = 0;
4442     } else {
4443 	int len;
4444 
4445 	// unable to get dir or out+in is too long
4446 	if (!getcwd(out, PATH_MAX) ||
4447 	    (len = strlen(out))+1+strlen(in) >= PATH_MAX) {
4448 	    strncpy(out, in, PATH_MAX);
4449 	    out[PATH_MAX-1] = 0;
4450 	    return;
4451 	}
4452 
4453 	sprintf(out+len, "/%.*s", PATH_MAX - len, in);
4454 
4455 	// FIXME: cope with `pwd`/../../../foo.fa ?
4456     }
4457 }
4458 
4459 /*
4460  * Writes a CRAM SAM header.
4461  * Returns 0 on success
4462  *        -1 on failure
4463  */
cram_write_SAM_hdr(cram_fd * fd,SAM_hdr * hdr)4464 int cram_write_SAM_hdr(cram_fd *fd, SAM_hdr *hdr) {
4465     int header_len;
4466     int blank_block = (CRAM_MAJOR_VERS(fd->version) >= 3);
4467 
4468     /* Fix M5 strings */
4469     if (fd->refs && !fd->no_ref) {
4470 	int i;
4471 	for (i = 0; i < hdr->nref; i++) {
4472 	    SAM_hdr_type *ty;
4473 	    char *ref;
4474 
4475 	    if (!(ty = sam_hdr_find(hdr, "SQ", "SN", hdr->ref[i].name)))
4476 		return -1;
4477 
4478 	    if (!sam_hdr_find_key(hdr, ty, "M5", NULL)) {
4479 		char unsigned buf[16], buf2[33];
4480 		int j, rlen;
4481 		MD5_CTX md5;
4482 
4483 		if (!fd->refs->ref_id || !fd->refs->ref_id[i])
4484 		    return -1;
4485 
4486 		rlen = fd->refs->ref_id[i]->length;
4487 		MD5_Init(&md5);
4488 		ref = cram_get_ref(fd, i, 1, rlen);
4489 		if (NULL == ref) return -1;
4490 		rlen = fd->refs->ref_id[i]->length; /* In case it just loaded */
4491 		MD5_Update(&md5, ref, rlen);
4492 		MD5_Final(buf, &md5);
4493 		cram_ref_decr(fd->refs, i);
4494 
4495 		for (j = 0; j < 16; j++) {
4496 		    buf2[j*2+0] = "0123456789abcdef"[buf[j]>>4];
4497 		    buf2[j*2+1] = "0123456789abcdef"[buf[j]&15];
4498 		}
4499 		buf2[32] = 0;
4500 		if (sam_hdr_update(hdr, ty, "M5", buf2, NULL))
4501 		    return -1;
4502 	    }
4503 
4504 	    if (fd->ref_fn) {
4505 		char ref_fn[PATH_MAX];
4506 		full_path(ref_fn, fd->ref_fn);
4507 		if (sam_hdr_update(hdr, ty, "UR", ref_fn, NULL))
4508 		    return -1;
4509 	    }
4510 	}
4511     }
4512 
4513     if (sam_hdr_rebuild(hdr))
4514 	return -1;
4515 
4516     /* Length */
4517     header_len = sam_hdr_length(hdr);
4518 
4519     /* Create block(s) inside a container */
4520     cram_block *b = cram_new_block(FILE_HEADER, 0);
4521     cram_container *c = cram_new_container(0, 0);
4522     int padded_length;
4523     char *pads;
4524 
4525     if (!b || !c) {
4526 	if (b) cram_free_block(b);
4527 	if (c) cram_free_container(c);
4528 	return -1;
4529     }
4530 
4531     int32_put(b, header_len);
4532     BLOCK_APPEND(b, sam_hdr_str(hdr), header_len);
4533     BLOCK_UPLEN(b);
4534 
4535     // Compress header block if V3.0 and above
4536     if (CRAM_MAJOR_VERS(fd->version) >= 3 && fd->level > 0) {
4537 	int method = 1<<GZIP;
4538 	if (fd->use_bz2)
4539 	    method |= 1<<BZIP2;
4540 	if (fd->use_bsc)
4541 	    method |= 1<<BSC;
4542 	if (fd->use_fqz)
4543 	    method |= 1<<FQZ;
4544 	if (fd->use_lzma)
4545 	    method |= 1<<LZMA;
4546 	cram_compress_block(fd, NULL, b, NULL, method, fd->level);
4547     }
4548 
4549     if (blank_block) {
4550 	c->length = b->comp_size + 2 + 4*IS_CRAM_3_VERS(fd) +
4551 	    itf8_size(b->content_id)   +
4552 	    itf8_size(b->uncomp_size)  +
4553 	    itf8_size(b->comp_size);
4554 
4555 	c->num_blocks = 2;
4556 	c->num_landmarks = 2;
4557 	if (!(c->landmark = malloc(2*sizeof(*c->landmark)))) {
4558 	    cram_free_block(b);
4559 	    cram_free_container(c);
4560 	    return -1;
4561 	}
4562 	c->landmark[0] = 0;
4563 	c->landmark[1] = c->length;
4564 
4565 	// Plus extra storage for uncompressed secondary blank block
4566 	padded_length = MIN(c->length*.5, 10000);
4567 	c->length += padded_length + 2 + 4*IS_CRAM_3_VERS(fd) +
4568 	    itf8_size(b->content_id) +
4569 	    itf8_size(padded_length)*2;
4570     } else {
4571 	// Pad the block instead.
4572 	c->num_blocks = 1;
4573 	c->num_landmarks = 1;
4574 	if (!(c->landmark = malloc(sizeof(*c->landmark))))
4575 	    return -1;
4576 	c->landmark[0] = 0;
4577 
4578 	padded_length = MAX(c->length*1.5, 10000) - c->length;
4579 
4580 	c->length = b->comp_size + padded_length +
4581 	    2 + 4*IS_CRAM_3_VERS(fd) +
4582 	    itf8_size(b->content_id)   +
4583 	    itf8_size(b->uncomp_size)  +
4584 	    itf8_size(b->comp_size);
4585 
4586 	if (NULL == (pads = calloc(1, padded_length))) {
4587 	    cram_free_block(b);
4588 	    cram_free_container(c);
4589 	    return -1;
4590 	}
4591 	BLOCK_APPEND(b, pads, padded_length);
4592 	BLOCK_UPLEN(b);
4593 	free(pads);
4594     }
4595 
4596     if (-1 == cram_write_container(fd, c)) {
4597 	cram_free_block(b);
4598 	cram_free_container(c);
4599 	return -1;
4600     }
4601 
4602     if (-1 == cram_write_block(fd, b)) {
4603 	cram_free_block(b);
4604 	cram_free_container(c);
4605 	return -1;
4606     }
4607 
4608     if (blank_block) {
4609 	BLOCK_RESIZE(b, padded_length);
4610 	memset(BLOCK_DATA(b), 0, padded_length);
4611 	BLOCK_SIZE(b) = padded_length;
4612 	BLOCK_UPLEN(b);
4613 	b->method = RAW;
4614 	b->crc32 = 0;
4615 	if (-1 == cram_write_block(fd, b)) {
4616 	    cram_free_block(b);
4617 	    cram_free_container(c);
4618 	    return -1;
4619 	}
4620     }
4621 
4622     cram_free_block(b);
4623     cram_free_container(c);
4624 
4625     if (-1 == refs_from_header(fd->refs, fd, fd->header))
4626 	return -1;
4627     if (-1 == refs2id(fd->refs, fd->header))
4628 	return -1;
4629 
4630     CRAM_IO_FLUSH(fd);
4631 
4632     RP("=== Finishing saving header ===\n");
4633 
4634     return 0;
4635 }
4636 
4637 /* ----------------------------------------------------------------------
4638  * The top-level cram opening, closing and option handling
4639  */
4640 
4641 /*
4642  * Initialises the lookup tables. These could be global statics, but they're
4643  * clumsy to setup in a multi-threaded environment unless we generate
4644  * verbatim code and include that.
4645  */
cram_init_tables(cram_fd * fd)4646 static void cram_init_tables(cram_fd *fd) {
4647     int i;
4648 
4649     memset(fd->L1, 4, 256);
4650     fd->L1['A'] = 0; fd->L1['a'] = 0;
4651     fd->L1['C'] = 1; fd->L1['c'] = 1;
4652     fd->L1['G'] = 2; fd->L1['g'] = 2;
4653     fd->L1['T'] = 3; fd->L1['t'] = 3;
4654 
4655     memset(fd->L2, 5, 256);
4656     fd->L2['A'] = 0; fd->L2['a'] = 0;
4657     fd->L2['C'] = 1; fd->L2['c'] = 1;
4658     fd->L2['G'] = 2; fd->L2['g'] = 2;
4659     fd->L2['T'] = 3; fd->L2['t'] = 3;
4660     fd->L2['N'] = 4; fd->L2['n'] = 4;
4661 
4662     if (IS_CRAM_1_VERS(fd)) {
4663 	for (i = 0; i < 0x200; i++) {
4664 	    int f = 0;
4665 
4666 	    if (i & CRAM_FPAIRED)      f |= BAM_FPAIRED;
4667 	    if (i & CRAM_FPROPER_PAIR) f |= BAM_FPROPER_PAIR;
4668 	    if (i & CRAM_FUNMAP)       f |= BAM_FUNMAP;
4669 	    if (i & CRAM_FREVERSE)     f |= BAM_FREVERSE;
4670 	    if (i & CRAM_FREAD1)       f |= BAM_FREAD1;
4671 	    if (i & CRAM_FREAD2)       f |= BAM_FREAD2;
4672 	    if (i & CRAM_FSECONDARY)   f |= BAM_FSECONDARY;
4673 	    if (i & CRAM_FQCFAIL)      f |= BAM_FQCFAIL;
4674 	    if (i & CRAM_FDUP)         f |= BAM_FDUP;
4675 
4676 	    fd->bam_flag_swap[i]  = f;
4677 	}
4678 
4679 	for (i = 0; i < 0x1000; i++) {
4680 	    int g = 0;
4681 
4682 	    if (i & BAM_FPAIRED)	   g |= CRAM_FPAIRED;
4683 	    if (i & BAM_FPROPER_PAIR)  g |= CRAM_FPROPER_PAIR;
4684 	    if (i & BAM_FUNMAP)        g |= CRAM_FUNMAP;
4685 	    if (i & BAM_FREVERSE)      g |= CRAM_FREVERSE;
4686 	    if (i & BAM_FREAD1)        g |= CRAM_FREAD1;
4687 	    if (i & BAM_FREAD2)        g |= CRAM_FREAD2;
4688 	    if (i & BAM_FSECONDARY)    g |= CRAM_FSECONDARY;
4689 	    if (i & BAM_FQCFAIL)       g |= CRAM_FQCFAIL;
4690 	    if (i & BAM_FDUP)          g |= CRAM_FDUP;
4691 
4692 	    fd->cram_flag_swap[i] = g;
4693 	}
4694     } else {
4695 	/* NOP */
4696 	for (i = 0; i < 0x1000; i++)
4697 	    fd->bam_flag_swap[i] = i;
4698 	for (i = 0; i < 0x1000; i++)
4699 	    fd->cram_flag_swap[i] = i;
4700     }
4701 
4702     memset(fd->cram_sub_matrix, 4, 32*32);
4703     for (i = 0; i < 32; i++) {
4704 	fd->cram_sub_matrix[i]['A'&0x1f]=0;
4705 	fd->cram_sub_matrix[i]['C'&0x1f]=1;
4706 	fd->cram_sub_matrix[i]['G'&0x1f]=2;
4707 	fd->cram_sub_matrix[i]['T'&0x1f]=3;
4708 	fd->cram_sub_matrix[i]['N'&0x1f]=4;
4709     }
4710     for (i = 0; i < 20; i+=4) {
4711 	int j;
4712 	for (j = 0; j < 20; j++) {
4713 	    fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
4714 	    fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
4715 	    fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
4716 	    fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
4717 	}
4718 	fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+0]&0x1f]=0;
4719 	fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+1]&0x1f]=1;
4720 	fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+2]&0x1f]=2;
4721 	fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+3]&0x1f]=3;
4722     }
4723 }
4724 
4725 // Default version numbers for CRAM
4726 static int major_version = 3;
4727 static int minor_version = 0;
4728 
cram_default_version(void)4729 int cram_default_version(void) {
4730     return major_version*100 + minor_version;
4731 }
4732 
cram_io_close(cram_fd * fd,int * fclose_result)4733 cram_fd * cram_io_close(cram_fd * fd, int * fclose_result)
4734 {
4735     if ( fd ) {
4736         if ( fd->fp_in ) {
4737             fclose(fd->fp_in);
4738             fd->fp_in = NULL;
4739         }
4740         if ( fd->fp_out ) {
4741             int const r = paranoid_fclose(fd->fp_out);
4742             if ( fclose_result )
4743                 *fclose_result = r;
4744             fd->fp_out = NULL;
4745         }
4746 
4747 #if defined(CRAM_IO_CUSTOM_BUFFERING)
4748         if ( fd->fp_in_callbacks ) {
4749             fd->fp_in_callbacks = fd->fp_in_callback_deallocate_function(fd->fp_in_callbacks);
4750         }
4751         if ( fd->fp_in_buffer ) {
4752             fd->fp_in_buffer = cram_io_deallocate_input_buffer(fd->fp_in_buffer);
4753         }
4754         if ( fd->fp_out_callbacks ) {
4755             fd->fp_out_callbacks = fd->fp_out_callback_deallocate_function(fd->fp_out_callbacks);
4756         }
4757         if ( fd->fp_out_buffer ) {
4758             fd->fp_out_buffer = cram_io_deallocate_output_buffer(fd->fp_out_buffer);
4759         }
4760 #endif
4761 
4762         free(fd);
4763         fd = NULL;
4764     }
4765     return fd;
4766 }
4767 
4768 #if defined(CRAM_IO_CUSTOM_BUFFERING)
cram_io_open_by_callbacks(char const * filename,cram_io_allocate_read_input_t callback_allocate_function,cram_io_deallocate_read_input_t callback_deallocate_function,size_t const bufsize,int const decompress)4769 cram_fd * cram_io_open_by_callbacks(
4770     char const * filename,
4771     cram_io_allocate_read_input_t   callback_allocate_function,
4772     cram_io_deallocate_read_input_t callback_deallocate_function,
4773     size_t const bufsize,
4774     int const decompress
4775 )
4776 {
4777     cram_fd * fd = (cram_fd *)malloc(sizeof(cram_fd));
4778 
4779     if ( ! fd )
4780        return cram_io_close(fd,0);
4781 
4782     memset(fd,0,sizeof(cram_fd));
4783 
4784     fd->fp_in_callback_allocate_function   = callback_allocate_function;
4785     fd->fp_in_callback_deallocate_function = callback_deallocate_function;
4786 
4787     fd->fp_in_callbacks =
4788 	fd->fp_in_callback_allocate_function(filename,decompress);
4789 
4790     if ( ! fd->fp_in_callbacks )
4791 	return cram_io_close(fd,0);
4792 
4793     fd->fp_in_buffer = cram_io_allocate_input_buffer(bufsize);
4794     if ( ! fd->fp_in_buffer ) {
4795 	return cram_io_close(fd,0);
4796     }
4797 
4798     return fd;
4799 }
4800 
4801 // FIXME: make a shared interface with cram_io_open_by_callbacks above
cram_io_openw_by_callbacks(char const * filename,cram_io_allocate_write_output_t callback_allocate_function,cram_io_deallocate_write_output_t callback_deallocate_function,size_t const bufsize)4802 cram_fd * cram_io_openw_by_callbacks(
4803     char const * filename,
4804     cram_io_allocate_write_output_t   callback_allocate_function,
4805     cram_io_deallocate_write_output_t callback_deallocate_function,
4806     size_t const bufsize
4807 )
4808 {
4809     cram_fd * fd = (cram_fd *)malloc(sizeof(cram_fd));
4810 
4811     if ( ! fd )
4812        return cram_io_close(fd,0);
4813 
4814     memset(fd,0,sizeof(cram_fd));
4815 
4816     fd->fp_out_callback_allocate_function   = callback_allocate_function;
4817     fd->fp_out_callback_deallocate_function = callback_deallocate_function;
4818 
4819     fd->fp_out_callbacks =
4820 	fd->fp_out_callback_allocate_function(filename);
4821 
4822     if ( ! fd->fp_out_callbacks )
4823 	return cram_io_close(fd,0);
4824 
4825     fd->fp_out_buffer = cram_io_allocate_output_buffer(bufsize);
4826     if ( ! fd->fp_out_buffer ) {
4827 	return cram_io_close(fd,0);
4828     }
4829 
4830     return fd;
4831 }
4832 #endif // CRAM_IO_CUSTOM_BUFFERING
4833 
cram_io_open(char const * filename,char const * mode,char const * fmode)4834 cram_fd * cram_io_open(
4835 	char const * filename,
4836 	char const * mode,
4837 	char const * fmode
4838 )
4839 {
4840     cram_fd * fd = (cram_fd *)malloc(sizeof(cram_fd));
4841     if ( ! fd )
4842        return cram_io_close(fd,0);
4843 
4844     memset(fd,0,sizeof(cram_fd));
4845 
4846 #if defined(CRAM_IO_CUSTOM_BUFFERING)
4847     fd->fp_in_callback_allocate_function = NULL;
4848     fd->fp_in_callback_deallocate_function = cram_IO_deallocate_cram_io_input;
4849     fd->fp_out_callback_allocate_function = NULL;
4850     fd->fp_out_callback_deallocate_function = cram_IO_deallocate_cram_io_output;
4851 #endif
4852 
4853     if ( *mode == 'r' ) {
4854         size_t bufsize = 0;
4855 #if defined(CRAM_IO_CUSTOM_BUFFERING) && defined(HAVE_STDIO_EXT_H)
4856         int isreg = 0;
4857 #endif
4858 
4859     	if ( strcmp(filename,"-") == 0 ) {
4860 	    fd->fp_in = stdin;
4861 	}
4862 	else {
4863 	    fd->fp_in = fopen(filename, fmode);
4864 	}
4865 
4866 	if ( ! fd->fp_in )
4867 	    return cram_io_close(fd,0);
4868 
4869 #if defined(CRAM_IO_CUSTOM_BUFFERING)
4870 
4871 #if defined(HAVE_STDIO_EXT_H)
4872 
4873 #if defined(HAVE_FILENO) && defined(HAVE_FSTAT)
4874         do {
4875             int const filedesc = fileno(fd->fp_in);
4876             struct stat sb;
4877             int const fdret = fstat(filedesc,&sb);
4878             isreg = (fdret == 0) && S_ISREG(sb.st_mode);
4879         } while ( 0 );
4880 #endif
4881 
4882         /* get input buffer size */
4883         if ( isreg ) {
4884             /* read one character to force buffer to be set up */
4885             int const c = fgetc(fd->fp_in);
4886             /* EOF? */
4887             if ( c != EOF ) {
4888             	/* get buffer size */
4889 		int cc;
4890             	bufsize = __fbufsize(fd->fp_in);
4891             	/* put character back (C standard says this is guaranteed to
4892 		 * work for a single character)
4893 		 */
4894             	cc = ungetc(c,fd->fp_in);
4895             	/* check result anyway */
4896 		if ( cc == EOF ) {
4897                     return cram_io_close(fd,0);
4898 		}
4899             }
4900 	}
4901 #endif // HAVE_STDIO_EXT_H
4902 
4903         fd->fp_in_callbacks = cram_IO_allocate_cram_io_input_from_C_FILE(fd->fp_in);
4904 	if ( ! fd->fp_in_callbacks )
4905 	    return cram_io_close(fd,0);
4906 
4907 	if ( !bufsize )
4908 	    bufsize = 32*1024;
4909 
4910 	do {
4911 	    fd->fp_in_buffer = cram_io_allocate_input_buffer(bufsize);
4912 	    if ( ! fd->fp_in_buffer ) {
4913                 return cram_io_close(fd,0);
4914 	    }
4915 	    else {
4916 	        setvbuf(fd->fp_in, NULL, _IONBF, 0);
4917 	    }
4918 	} while ( 0 );
4919 
4920 #endif // CRAM_IO_CUSTOM_BUFFERING
4921     } else {
4922 	if (filename) {
4923 	    if ( strcmp(filename,"-") == 0 ) {
4924 		fd->fp_out = stdout;
4925 	    } else {
4926 		fd->fp_out = fopen(filename, fmode);
4927 	    }
4928 
4929 	    if ( ! fd->fp_out )
4930 		return cram_io_close(fd,0);
4931 	} else {
4932 	    // E.g. opening a CRAM in-memory file.
4933 	    fd->fp_out = NULL;
4934 	}
4935 
4936 #if defined(CRAM_IO_CUSTOM_BUFFERING)
4937         fd->fp_out_callbacks
4938 	    = cram_IO_allocate_cram_io_output_from_C_FILE(fd->fp_out);
4939 
4940 	if ( ! fd->fp_out_callbacks )
4941 	    return cram_io_close(fd,0);
4942 
4943 	int bufsize = 32*1024; // FIXME: use same bufsize calc as above
4944 	fd->fp_out_buffer = cram_io_allocate_output_buffer(bufsize);
4945 	if ( ! fd->fp_out_buffer ) {
4946 	    return cram_io_close(fd,0);
4947 	} else if (fd->fp_out) {
4948 	    setvbuf(fd->fp_out, NULL, _IONBF, 0);
4949 	}
4950 
4951 #endif // CRAM_IO_CUSTOM_BUFFERING
4952 
4953     }
4954 
4955     return fd;
4956 }
4957 
4958 /*
4959  * Opens a CRAM file for read (mode "rb") or write ("wb").
4960  * The filename may be "-" to indicate stdin or stdout.
4961  *
4962  * Returns file handle on success
4963  *         NULL on failure.
4964  */
cram_open(const char * filename,const char * mode)4965 cram_fd *cram_open(const char *filename, const char *mode) {
4966     int i;
4967     char *cp;
4968     char fmode[3]= { mode[0], '\0', '\0' };
4969     cram_fd *fd = NULL;
4970 
4971     if (strlen(mode) > 1 && (mode[1] == 'b' || mode[1] == 'c')) {
4972 	fmode[1] = 'b';
4973     }
4974 
4975     fd = cram_io_open(filename,mode,fmode);
4976     if (!fd)
4977 	return cram_io_close(fd,0);
4978 
4979     fd->level = 5;
4980     if (strlen(mode) > 2 && mode[2] >= '0' && mode[2] <= '9')
4981 	fd->level = mode[2] - '0';
4982 
4983     fd->mode = *mode;
4984     fd->first_container = 0;
4985 
4986     if (fd->mode == 'r') {
4987 	/* Reader */
4988 
4989 	if (!(fd->file_def = cram_read_file_def(fd)))
4990 	    goto err;
4991 
4992 	fd->version = fd->file_def->major_version * 256 +
4993 	    fd->file_def->minor_version;
4994 
4995 	if (!(fd->header = cram_read_SAM_hdr(fd))) {
4996 	    cram_free_file_def(fd->file_def);
4997 	    goto err;
4998 	}
4999 
5000     } else {
5001 	/* Writer */
5002 	cram_file_def def;
5003 
5004 	if (major_version == 1) {
5005 	    fprintf(stderr, "Unable to write to version 1.0\n");
5006 	    goto err;
5007 	}
5008 
5009 	def.magic[0] = 'C';
5010 	def.magic[1] = 'R';
5011 	def.magic[2] = 'A';
5012 	def.magic[3] = 'M';
5013 	def.major_version = major_version;
5014 	def.minor_version = minor_version;
5015 	memset(def.file_id, 0, 20);
5016 	strncpy(def.file_id, filename, 20);
5017 	if (0 != cram_write_file_def(fd, &def))
5018 	    goto err;
5019 
5020 	fd->version = def.major_version * 256 + def.minor_version;
5021 
5022 	/* SAM header written later */
5023     }
5024 
5025     cram_init_tables(fd);
5026 
5027     fd->prefix = strdup((cp = strrchr(filename, '/')) ? cp+1 : filename);
5028     if (!fd->prefix)
5029 	goto err;
5030     fd->first_base = fd->last_base = -1;
5031     fd->record_counter = 0;
5032 
5033     fd->ctr = NULL;
5034     fd->ctr_mt = NULL;
5035     fd->refs  = refs_create();
5036     if (!fd->refs)
5037 	goto err;
5038     fd->ref_id = -2;
5039     fd->ref = NULL;
5040 
5041     fd->decode_md = 0;
5042     fd->verbose = 0;
5043     fd->seqs_per_slice = SEQS_PER_SLICE;
5044     fd->bases_per_slice = BASES_PER_SLICE;
5045     fd->slices_per_container = SLICE_PER_CNT;
5046     fd->embed_ref = 0;
5047     fd->no_ref = 0;
5048     fd->ignore_md5 = 0;
5049     fd->ignore_chksum = 1; // Some disagreement in the specification of these
5050     fd->lossy_read_names = 0;
5051     fd->use_bz2 = 0;
5052     fd->use_rans = IS_CRAM_3_VERS(fd);
5053     fd->use_bsc = 0;
5054     fd->use_lzma = 0;
5055     fd->multi_seq = 0;
5056     fd->unsorted   = 0;
5057     fd->shared_ref = 0;
5058 
5059     fd->index       = NULL;
5060     fd->own_pool    = 0;
5061     fd->pool        = NULL;
5062     fd->rqueue      = NULL;
5063     fd->job_pending = NULL;
5064     fd->ooc         = 0;
5065     fd->binning     = BINNING_NONE;
5066     fd->required_fields = INT_MAX;
5067 
5068     for (i = 0; i < DS_END; i++)
5069 	fd->m[i] = cram_new_metrics();
5070 
5071     if (!(fd->tags_used = HashTableCreate(16, HASH_DYNAMIC_SIZE)))
5072 	goto err;
5073 
5074     fd->range.refid = -2; // no ref.
5075     fd->eof = 1;
5076     fd->ref_fn = NULL;
5077 
5078     fd->bl = NULL;
5079 
5080     /* Initialise dummy refs from the @SQ headers */
5081     if (-1 == refs_from_header(fd->refs, fd, fd->header))
5082 	goto err;
5083 
5084     return fd;
5085 
5086  err:
5087     fd = cram_io_close(fd,0);
5088 
5089     return NULL;
5090 }
5091 
5092 #if defined(CRAM_IO_CUSTOM_BUFFERING)
5093 /*
5094  * Opens a CRAM file for input via callbacks
5095  *
5096  * Returns file handle on success
5097  *         NULL on failure.
5098  */
cram_open_by_callbacks(char const * filename,cram_io_allocate_read_input_t callback_allocate_function,cram_io_deallocate_read_input_t callback_deallocate_function,size_t const bufsize)5099 cram_fd *cram_open_by_callbacks(
5100     char const * filename,
5101     cram_io_allocate_read_input_t   callback_allocate_function,
5102     cram_io_deallocate_read_input_t callback_deallocate_function,
5103     size_t const bufsize
5104 ) {
5105     int i;
5106     char *cp;
5107     cram_fd *fd = NULL;
5108 
5109     fd = cram_io_open_by_callbacks(filename,
5110 				   callback_allocate_function,
5111 				   callback_deallocate_function,
5112 				   bufsize,
5113 				   0);
5114     if (!fd)
5115 	return cram_io_close(fd,0);
5116 
5117     fd->level = 5;
5118 
5119     fd->mode = 'r';
5120     fd->first_container = 0;
5121 
5122     /* Reader */
5123     if (!(fd->file_def = cram_read_file_def(fd)))
5124         goto err;
5125 
5126     fd->version = fd->file_def->major_version * 256 +
5127         fd->file_def->minor_version;
5128 
5129     if (!(fd->header = cram_read_SAM_hdr(fd)))
5130         goto err;
5131 
5132     cram_init_tables(fd);
5133 
5134     fd->prefix = strdup((cp = strrchr(filename, '/')) ? cp+1 : filename);
5135     if (!fd->prefix)
5136 	goto err;
5137     fd->first_base = fd->last_base = -1;
5138     fd->record_counter = 0;
5139 
5140     fd->ctr = NULL;
5141     fd->ctr_mt = NULL;
5142     fd->refs  = refs_create();
5143     if (!fd->refs)
5144 	goto err;
5145     fd->ref_id = -2;
5146     fd->ref = NULL;
5147 
5148     fd->decode_md = 0;
5149     fd->verbose = 0;
5150     fd->seqs_per_slice = SEQS_PER_SLICE;
5151     fd->bases_per_slice = BASES_PER_SLICE;
5152     fd->slices_per_container = SLICE_PER_CNT;
5153     fd->embed_ref = 0;
5154     fd->no_ref = 0;
5155     fd->ignore_md5 = 0;
5156     fd->ignore_chksum = 1; // Some disagreement in the specification of these
5157     fd->lossy_read_names = 0;
5158     fd->use_bz2 = 0;
5159     fd->use_rans = IS_CRAM_3_VERS(fd);
5160     fd->use_bsc = 0;
5161     fd->use_lzma = 0;
5162     fd->multi_seq = 0;
5163     fd->unsorted   = 0;
5164     fd->shared_ref = 0;
5165 
5166     fd->index       = NULL;
5167     fd->own_pool    = 0;
5168     fd->pool        = NULL;
5169     fd->rqueue      = NULL;
5170     fd->job_pending = NULL;
5171     fd->ooc         = 0;
5172     fd->binning     = BINNING_NONE;
5173     fd->required_fields = INT_MAX;
5174 
5175     for (i = 0; i < DS_END; i++)
5176 	fd->m[i] = cram_new_metrics();
5177 
5178     if (!(fd->tags_used = HashTableCreate(16, HASH_DYNAMIC_SIZE)))
5179 	goto err;
5180 
5181     fd->range.refid = -2; // no ref.
5182     fd->eof = 1;
5183     fd->ref_fn = NULL;
5184 
5185     fd->bl = NULL;
5186 
5187     /* Initialise dummy refs from the @SQ headers */
5188     if (-1 == refs_from_header(fd->refs, fd, fd->header))
5189 	goto err;
5190 
5191     return fd;
5192 
5193  err:
5194     fd = cram_io_close(fd,0);
5195 
5196     return NULL;
5197 }
5198 
5199 /*
5200  * FIXME: make shared interface with code above.
5201  *
5202  * Opens a CRAM file for write via callbacks
5203  *
5204  * Returns file handle on success
5205  *         NULL on failure.
5206  */
cram_openw_by_callbacks(char const * filename,cram_io_allocate_write_output_t callback_allocate_function,cram_io_deallocate_write_output_t callback_deallocate_function,size_t const bufsize)5207 cram_fd *cram_openw_by_callbacks(
5208     char const * filename,
5209     cram_io_allocate_write_output_t   callback_allocate_function,
5210     cram_io_deallocate_write_output_t callback_deallocate_function,
5211     size_t const bufsize
5212 ) {
5213     int i;
5214     char *cp;
5215     cram_fd *fd = NULL;
5216 
5217     fd = cram_io_openw_by_callbacks(filename,
5218 				    callback_allocate_function,
5219 				    callback_deallocate_function,
5220 				    bufsize);
5221     if (!fd)
5222 	return cram_io_close(fd,0);
5223 
5224     fd->level = 5;
5225 
5226     fd->mode = 'w';
5227     fd->first_container = 0;
5228 
5229     {
5230 	/* Writer */
5231 	cram_file_def def;
5232 
5233 	if (major_version == 1) {
5234 	    fprintf(stderr, "Unable to write to version 1.0\n");
5235 	    goto err;
5236 	}
5237 
5238 	def.magic[0] = 'C';
5239 	def.magic[1] = 'R';
5240 	def.magic[2] = 'A';
5241 	def.magic[3] = 'M';
5242 	def.major_version = major_version;
5243 	def.minor_version = minor_version;
5244 	memset(def.file_id, 0, 20);
5245 	if (filename)
5246 	    strncpy(def.file_id, filename, 20);
5247 	if (0 != cram_write_file_def(fd, &def))
5248 	    goto err;
5249 
5250 	fd->version = def.major_version * 256 + def.minor_version;
5251 
5252 	/* SAM header written later */
5253     }
5254 
5255     cram_init_tables(fd);
5256 
5257     if (filename) {
5258 	fd->prefix = strdup((cp = strrchr(filename, '/')) ? cp+1 : filename);
5259 	if (!fd->prefix)
5260 	    goto err;
5261     } else {
5262 	fd->prefix = strdup("");
5263     }
5264     fd->first_base = fd->last_base = -1;
5265     fd->record_counter = 0;
5266 
5267     fd->ctr = NULL;
5268     fd->refs  = refs_create();
5269     if (!fd->refs)
5270 	goto err;
5271     fd->ref_id = -2;
5272     fd->ref = NULL;
5273 
5274     fd->decode_md = 0;
5275     fd->verbose = 0;
5276     fd->seqs_per_slice = SEQS_PER_SLICE;
5277     fd->bases_per_slice = BASES_PER_SLICE;
5278     fd->slices_per_container = SLICE_PER_CNT;
5279     fd->embed_ref = 0;
5280     fd->no_ref = 0;
5281     fd->ignore_md5 = 0;
5282     fd->use_bz2 = 0;
5283     fd->use_rans = IS_CRAM_3_VERS(fd);
5284     fd->use_bsc = 0;
5285     fd->use_lzma = 0;
5286     fd->multi_seq = 0;
5287     fd->unsorted   = 0;
5288     fd->shared_ref = 0;
5289 
5290     fd->index       = NULL;
5291     fd->own_pool    = 0;
5292     fd->pool        = NULL;
5293     fd->rqueue      = NULL;
5294     fd->job_pending = NULL;
5295     fd->ooc         = 0;
5296     fd->binning     = BINNING_NONE;
5297     fd->required_fields = INT_MAX;
5298 
5299     for (i = 0; i < DS_END; i++)
5300 	fd->m[i] = cram_new_metrics();
5301 
5302     if (!(fd->tags_used = HashTableCreate(16, HASH_DYNAMIC_SIZE)))
5303 	goto err;
5304 
5305     fd->range.refid = -2; // no ref.
5306     fd->eof = 1;
5307     fd->ref_fn = NULL;
5308 
5309     fd->bl = NULL;
5310 
5311     /* Initialise dummy refs from the @SQ headers */
5312     if (-1 == refs_from_header(fd->refs, fd, fd->header))
5313 	goto err;
5314 
5315     return fd;
5316 
5317  err:
5318     fd = cram_io_close(fd,0);
5319 
5320     return NULL;
5321 }
5322 #endif
5323 
5324 /*
5325  * Flushes a CRAM file.
5326  * Useful for when writing to stdout without wishing to close the stream.
5327  *
5328  * Returns 0 on success
5329  *        -1 on failure
5330  */
cram_flush(cram_fd * fd)5331 int cram_flush(cram_fd *fd) {
5332     if (!fd)
5333 	return -1;
5334 
5335     if (fd->mode == 'w' && fd->ctr) {
5336 	if(fd->ctr->slice)
5337 	    cram_update_curr_slice(fd->ctr);
5338 
5339 	if (-1 == cram_flush_container_mt(fd, fd->ctr))
5340 	    return -1;
5341     }
5342 
5343     return 0;
5344 }
5345 
5346 /*
5347  * Writes an EOF block to a CRAM file.
5348  *
5349  * Returns 0 on success
5350  *        -1 on failure
5351  */
cram_write_eof_block(cram_fd * fd)5352 int cram_write_eof_block(cram_fd *fd) {
5353     // 7-bit encoding version:
5354     //    CRAM_IO_WRITE(
5355     //		  "\x0f\x00\x00\x00\x8f\xff\xff\xff" // Cont HDR
5356     //		  "\x7f\x82\x95\x9e\x46\x00\x00\x00" // Cont HDR
5357     //		  "\x00\x01\x00"                     // Cont HDR
5358     //		  "\xac\xd6\x05\xbc"                 // CRC32
5359     //		  "\x00\x01\x00\x06\x06"             // Comp.HDR blk
5360     //		  "\x01\x00\x01\x00\x01\x00"         // Comp.HDR blk
5361     //		  "\xee\x63\x01\x4b",                // CRC32
5362     //		  38, 1, fd);
5363     //    return cram_io_flush_output_buffer(fd);
5364 
5365     // EOF block is a container with
5366     //   ref_seq_id -1
5367     //   start pos 0x454f46 ("EOF")
5368     //   span 0
5369     //   nrec 0
5370     //   counter 0
5371     //   nbases 0
5372     //   1 block (landmark 0)
5373     //   (CRC32)
5374     // followed by an empty compression header block
5375     //   method raw (0)
5376     //   type comp header (1)
5377     //   content id 0
5378     //   block contents size 6
5379     //   raw size 6
5380     //     empty preservation map (01 00)
5381     //     empty data series map (01 00)
5382     //     empty tag map (01 00)
5383     //   block CRC
5384     if (IS_CRAM_3_VERS(fd)) {
5385 	if (1 != CRAM_IO_WRITE(
5386 		"\x0f\x00\x00\x00\xff\xff\xff\xff" // Cont HDR
5387 		"\x0f\xe0\x45\x4f\x46\x00\x00\x00" // Cont HDR
5388 		"\x00\x01\x00"                     // Cont HDR
5389 		"\x05\xbd\xd9\x4f"                 // CRC32
5390 		//"\xa8\x2a\x1b\xb9"		   // CRC32C
5391 		"\x00\x01\x00\x06\x06"             // Comp.HDR blk
5392 		"\x01\x00\x01\x00\x01\x00"         // Comp.HDR blk
5393 		"\xee\x63\x01\x4b",                // CRC32
5394 		//"\xe9\x70\xd3\x86",              // CRC32C
5395 		38, 1, fd)) {
5396 	    fd = cram_io_close(fd,0);
5397 	    return -1;
5398 	}
5399     } else {
5400 	if (1 != CRAM_IO_WRITE("\x0b\x00\x00\x00\xff\xff\xff\xff"
5401 			       "\x0f\xe0\x45\x4f\x46\x00\x00\x00"
5402 			       "\x00\x01\x00\x00\x01\x00\x06\x06"
5403 			       "\x01\x00\x01\x00\x01\x00", 30, 1, fd)) {
5404 	    fd = cram_io_close(fd,0);
5405 	    return -1;
5406 	}
5407     }
5408 
5409 #if defined(CRAM_IO_CUSTOM_BUFFERING)
5410     return cram_io_flush_output_buffer(fd);
5411 #else
5412     return 0;
5413 #endif
5414 }
5415 
5416 
5417 /*
5418  * Closes a CRAM file.
5419  * Returns 0 on success
5420  *        -1 on failure
5421  */
cram_close(cram_fd * fd)5422 int cram_close(cram_fd *fd) {
5423     spare_bams *bl, *next;
5424     int i;
5425     int rclose = 0;
5426 
5427     if (!fd) {
5428 	fd = cram_io_close(fd,0);
5429 	return -1;
5430     }
5431 
5432     if (fd->mode == 'w' && fd->ctr) {
5433 	if(fd->ctr->slice)
5434 	    cram_update_curr_slice(fd->ctr);
5435 
5436 	if (-1 == cram_flush_container_mt(fd, fd->ctr)) {
5437 	    fd = cram_io_close(fd,0);
5438 	    return -1;
5439 	}
5440     }
5441 
5442     if (fd->pool && fd->eof >= 0) {
5443 	t_pool_flush(fd->pool);
5444 
5445 	if (0 != cram_flush_result(fd)) {
5446 	    fd = cram_io_close(fd,0);
5447 	    return -1;
5448 	}
5449 
5450 	pthread_mutex_destroy(fd->metrics_lock);
5451 	pthread_mutex_destroy(fd->ref_lock);
5452 	pthread_mutex_destroy(fd->bam_list_lock);
5453 	free(fd->metrics_lock);
5454 	free(fd->ref_lock);
5455 	free(fd->bam_list_lock);
5456 
5457 	if (fd->mode == 'w')
5458 	    fd->ctr = NULL; // prevent double freeing
5459 
5460 	//fprintf(stderr, "CRAM: destroy queue %p\n", fd->rqueue);
5461 
5462 	t_results_queue_destroy(fd->rqueue);
5463     }
5464 
5465     if (fd->mode == 'w') {
5466 	/* Write EOF block */
5467 	if (0 != cram_write_eof_block(fd))
5468 	    return -1;
5469 
5470 //	if (1 != fwrite("\x00\x00\x00\x00\xff\xff\xff\xff"
5471 //			"\xff\xe0\x45\x4f\x46\x00\x00\x00"
5472 //			"\x00\x00\x00", 19, 1, fd->fp))
5473 //	    return -1;
5474     }
5475 
5476     for (bl = fd->bl; bl; bl = next) {
5477 	int i, max_rec = fd->seqs_per_slice * fd->slices_per_container;
5478 
5479 	next = bl->next;
5480 	for (i = 0; i < max_rec; i++) {
5481 	    if (bl->bams[i])
5482 		free(bl->bams[i]);
5483 	}
5484 	free(bl->bams);
5485 	free(bl);
5486     }
5487 
5488     if (fd->file_def)
5489 	cram_free_file_def(fd->file_def);
5490 
5491     if (fd->header)
5492 	sam_hdr_free(fd->header);
5493 
5494     free(fd->prefix);
5495 
5496     if (fd->ctr)
5497 	cram_free_container(fd->ctr);
5498 
5499     if (fd->ctr_mt && fd->ctr_mt != fd->ctr)
5500 	cram_free_container(fd->ctr_mt);
5501 
5502     if (fd->refs)
5503 	refs_free(fd->refs);
5504     if (fd->ref_free)
5505         free(fd->ref_free);
5506 
5507     for (i = 0; i < DS_END; i++)
5508 	if (fd->m[i])
5509 	    free(fd->m[i]);
5510 
5511     if (fd->tags_used)
5512 	HashTableDestroy(fd->tags_used, 1);
5513 
5514     if (fd->index)
5515 	cram_index_free(fd);
5516 
5517     if (fd->own_pool && fd->pool)
5518 	t_pool_destroy(fd->pool, 0);
5519 
5520     /* rclose == return value for flush and close in case of CRAM output */
5521     fd = cram_io_close(fd, &rclose);
5522 
5523     return rclose;
5524 }
5525 
5526 
5527 /*
5528  * Returns 1 if we hit an EOF while reading.
5529  */
cram_eof(cram_fd * fd)5530 int cram_eof(cram_fd *fd) {
5531     return fd->eof;
5532 }
5533 
5534 
5535 /*
5536  * Sets options on the cram_fd. See CRAM_OPT_* definitions in cram_structs.h.
5537  * Use this immediately after opening.
5538  *
5539  * Returns 0 on success
5540  *        -1 on failure
5541  */
cram_set_option(cram_fd * fd,enum cram_option opt,...)5542 int cram_set_option(cram_fd *fd, enum cram_option opt, ...) {
5543     int r;
5544     va_list args;
5545 
5546     va_start(args, opt);
5547     r = cram_set_voption(fd, opt, args);
5548     va_end(args);
5549 
5550     return r;
5551 }
5552 
5553 /*
5554  * Sets options on the cram_fd. See CRAM_OPT_* definitions in cram_structs.h.
5555  * Use this immediately after opening.
5556  *
5557  * Returns 0 on success
5558  *        -1 on failure
5559  */
cram_set_voption(cram_fd * fd,enum cram_option opt,va_list args)5560 int cram_set_voption(cram_fd *fd, enum cram_option opt, va_list args) {
5561     refs_t *refs;
5562 
5563     switch (opt) {
5564     case CRAM_OPT_DECODE_MD:
5565 	fd->decode_md = va_arg(args, int);
5566 	break;
5567 
5568     case CRAM_OPT_PREFIX:
5569 	if (fd->prefix)
5570 	    free(fd->prefix);
5571 	if (!(fd->prefix = strdup(va_arg(args, char *))))
5572 	    return -1;
5573 	break;
5574 
5575     case CRAM_OPT_VERBOSITY:
5576 	fd->verbose = va_arg(args, int);
5577 	break;
5578 
5579     case CRAM_OPT_SEQS_PER_SLICE:
5580 	fd->seqs_per_slice = va_arg(args, int);
5581 	break;
5582 
5583     case CRAM_OPT_BASES_PER_SLICE:
5584 	fd->bases_per_slice = va_arg(args, int);
5585 	break;
5586 
5587     case CRAM_OPT_SLICES_PER_CONTAINER:
5588 	fd->slices_per_container = va_arg(args, int);
5589 	break;
5590 
5591     case CRAM_OPT_EMBED_REF:
5592 	fd->embed_ref = va_arg(args, int);
5593 	break;
5594 
5595     case CRAM_OPT_NO_REF:
5596 	fd->no_ref = va_arg(args, int);
5597 	break;
5598 
5599     case CRAM_OPT_IGNORE_MD5:
5600 	fd->ignore_md5 = va_arg(args, int);
5601 	break;
5602 
5603     case CRAM_OPT_IGNORE_CHKSUM:
5604 	fd->ignore_chksum = va_arg(args, int);
5605 	break;
5606 
5607     case CRAM_OPT_LOSSY_READ_NAMES:
5608 	fd->lossy_read_names = va_arg(args, int);
5609 	break;
5610 
5611     case CRAM_OPT_USE_BZIP2:
5612 	fd->use_bz2 = va_arg(args, int);
5613 	break;
5614 
5615     case CRAM_OPT_USE_ARITH:
5616     case CRAM_OPT_USE_RANS:
5617 	fd->use_rans = va_arg(args, int);
5618 	break;
5619 
5620     case CRAM_OPT_USE_BSC:
5621 	fd->use_bsc = va_arg(args, int);
5622 	break;
5623 
5624     case CRAM_OPT_USE_FQZ:
5625 	fd->use_fqz = va_arg(args, int);
5626 	break;
5627 
5628     case CRAM_OPT_USE_LZMA:
5629 	fd->use_lzma = va_arg(args, int);
5630 	break;
5631 
5632     case CRAM_OPT_SHARED_REF:
5633 	fd->shared_ref = 1;
5634 	refs = va_arg(args, refs_t *);
5635 	if (refs != fd->refs) {
5636 	    if (fd->refs)
5637 		refs_free(fd->refs);
5638 	    fd->refs = refs;
5639 	    fd->refs->count++;
5640 	}
5641 	break;
5642 
5643     case CRAM_OPT_RANGE: {
5644 	int r = cram_seek_to_refpos(fd, va_arg(args, cram_range *));
5645 	if (fd->range.refid != -2)
5646 	    fd->required_fields |= SAM_POS;
5647 	return r;
5648     }
5649 
5650     case CRAM_OPT_REFERENCE:
5651 	return cram_load_reference(fd, va_arg(args, char *));
5652 
5653     case CRAM_OPT_VERSION: {
5654 	int major, minor;
5655 	char *s = va_arg(args, char *);
5656 	if (2 != sscanf(s, "%d.%d", &major, &minor)) {
5657 	    fprintf(stderr, "Malformed version string %s\n", s);
5658 	    return -1;
5659 	}
5660 	if (!((major == 1 &&  minor == 0) ||
5661 	      (major == 2 && (minor == 0 || minor == 1)) ||
5662 	      (major == 3 && (minor == 0 || minor == 1)) ||
5663 	      (major == 4 &&  minor == 0))) {
5664 	    fprintf(stderr, "Unknown version string; "
5665 		    "use 1.0 (deprecated), 2.0, 2.1 or 3.0 "
5666 		    "(experimentally 3.1 or 4.0)\n");
5667 	    errno = EINVAL;
5668 	    return -1;
5669 	}
5670 	if (major == 1 && minor == 0 && fd && fd->mode != 'r') {
5671 	    fprintf(stderr, "Unable to write to version 1.0\n");
5672 	    return -1;
5673 	}
5674 	major_version = major;
5675 	minor_version = minor;
5676 	break;
5677     }
5678 
5679     case CRAM_OPT_MULTI_SEQ_PER_SLICE:
5680 	fd->multi_seq = va_arg(args, int);
5681 	break;
5682 
5683     case CRAM_OPT_NTHREADS: {
5684 	int nthreads =  va_arg(args, int);
5685         if (nthreads > 1) {
5686             if (!(fd->pool = t_pool_init(nthreads*2, nthreads)))
5687                 return -1;
5688 
5689 	    fd->rqueue = t_results_queue_init();
5690 	    fd->metrics_lock = malloc(sizeof(pthread_mutex_t));
5691 	    fd->ref_lock = malloc(sizeof(pthread_mutex_t));
5692 	    fd->bam_list_lock = malloc(sizeof(pthread_mutex_t));
5693 	    pthread_mutex_init(fd->metrics_lock, NULL);
5694 	    pthread_mutex_init(fd->ref_lock, NULL);
5695 	    pthread_mutex_init(fd->bam_list_lock, NULL);
5696 	    fd->shared_ref = 1;
5697 	    fd->own_pool = 1;
5698         }
5699 	break;
5700     }
5701 
5702     case CRAM_OPT_THREAD_POOL:
5703 	fd->pool = va_arg(args, t_pool *);
5704 	if (fd->pool) {
5705 	    fd->rqueue = t_results_queue_init();
5706 	    fd->metrics_lock = malloc(sizeof(pthread_mutex_t));
5707 	    fd->ref_lock = malloc(sizeof(pthread_mutex_t));
5708 	    fd->bam_list_lock = malloc(sizeof(pthread_mutex_t));
5709 	    pthread_mutex_init(fd->metrics_lock, NULL);
5710 	    pthread_mutex_init(fd->ref_lock, NULL);
5711 	    pthread_mutex_init(fd->bam_list_lock, NULL);
5712 	}
5713 	fd->shared_ref = 1; // Needed to avoid clobbering ref between threads
5714 	fd->own_pool = 0;
5715 
5716 	//fd->qsize = 1;
5717 	//fd->decoded = calloc(fd->qsize, sizeof(cram_container *));
5718 	//t_pool_dispatch(fd->pool, cram_decoder_thread, fd);
5719 	break;
5720 
5721     case CRAM_OPT_BINNING:
5722 	fd->binning = va_arg(args, int);
5723 	break;
5724 
5725     case CRAM_OPT_REQUIRED_FIELDS:
5726 	fd->required_fields = va_arg(args, int);
5727 	if (fd->range.refid != -2)
5728 	    fd->required_fields |= SAM_POS;
5729 	break;
5730 
5731     case CRAM_OPT_PRESERVE_AUX_ORDER:
5732 	fd->preserve_aux_order = va_arg(args, int);
5733 	break;
5734 
5735     case CRAM_OPT_PRESERVE_AUX_SIZE:
5736 	fd->preserve_aux_size = va_arg(args, int);
5737 	break;
5738 
5739     default:
5740 	fprintf(stderr, "Unknown CRAM option code %d\n", opt);
5741 	return -1;
5742     }
5743 
5744     return 0;
5745 }
5746 
5747