1 /*
2  * Copyright (c) 2013 Genome Research Ltd.
3  * Author(s): James Bonfield
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *
8  *    1. Redistributions of source code must retain the above copyright notice,
9  *       this list of conditions and the following disclaimer.
10  *
11  *    2. Redistributions in binary form must reproduce the above
12  *       copyright notice, this list of conditions and the following
13  *       disclaimer in the documentation and/or other materials provided
14  *       with the distribution.
15  *
16  *    3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
17  *    Institute nor the names of its contributors may be used to endorse
18  *    or promote products derived from this software without specific
19  *    prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS
22  * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
24  * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH
25  * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 /*! \file
35  * Generic SAM/BAM/CRAM interface.
36  *
37  * This file implements a higher level scram_*() API for programs that
38  * wish to be file format agnostic.
39  */
40 
41 #ifndef _SCRAM_H_
42 #define _SCRAM_H_
43 
44 #ifdef __cplusplus
45 extern "C" {
46 #endif
47 
48 #ifdef HAVE_CONFIG_H
49 #include "io_lib_config.h"
50 #endif
51 
52 #include "io_lib/bam.h"
53 #include "io_lib/cram.h"
54 
55 /*! The primary file handle for reading and writing. */
56 typedef struct {
57     int is_bam;   /* presumably non-zero when 'b' is the live union member -- TODO confirm */
58     int eof;      /* EOF status read by scram_eof(): 0 = not at EOF, 1 = expected EOF, 2 = other EOF */
59     union {
60 	bam_file_t *b; /* underlying bam_file_t handle */
61 	cram_fd    *c; /* underlying cram_fd handle */
62     };
63 
64     /* Primary Input/Output buffer */
65     unsigned char *buf;
66     size_t alloc; /* presumably the allocated size of buf -- TODO confirm */
67     size_t used;  /* presumably the number of bytes of buf in use -- TODO confirm */
68     FILE *fp;   // copy of file handle.
69 
70     t_pool *pool; /* NOTE(review): looks like a thread pool handle; ownership not visible here -- confirm */
71 } scram_fd;
72 
73 /*
74  * An input stream in SCRAM is a large block of memory which we periodically
75  * fread into.
76  *
77  * This input stream is then broken down into chunks of appropriate size
78  * as used by the underlying format. The only tricky bit here is the first
79  * portion (opening the underlying format) can use an unknown amount of
80  * buffer due to the BAM header being variable length.
81  *
82  * Once we have this, scram_next_input() will return the next natural
83  * chunk from the input buffer. This permits a single input buffer to be
84  * divided into multiple scram_buffers to pass to separate threads for
85  * decoding.
86  */
87 typedef struct {
88     unsigned char *buf; // data for this buffer; who owns/frees it is not visible here -- TODO confirm
89     size_t alloc; // allocated size of buf
90     size_t size;  // size loaded
91     size_t usize; // size usable by the underlying format
92 } scram_buffer_t;
93 
94 /*! Reports the end-of-file status of fd (expands to the handle's eof field).
95  * @return 0 if not at end of file;
96  *         1 if we hit an expected EOF (end of range or EOF block);
97  *         2 for other EOF (end of stream without EOF block)
98  */
99 #define scram_eof(fd) ((fd)->eof)
100 
101 
102 /*! Opens a file.
103  *
104  * If reading we look for the following mode parameters:
105  * -    r  => Try SAM/BAM first; if that fails, try CRAM
106  * -    rb => BAM
107  * -    rc => CRAM
108  *
109  * If writing we look at the mode parameter:
110  * -    w  => SAM
111  * -    wb => BAM
112  * -    wc => CRAM
113  *
114  * Additionally we can specify the compression level when writing
115  * after the file type character, as 0 to 9. E.g. "wb9" for maximum
116  * compression of BAM or "wc0" for uncompressed CRAM.
117  *
118  * @return
119  * Returns scram pointer on success (see scram_close() to close it);
120  *         NULL on failure
121  */
122 scram_fd *scram_open(const char *filename, const char *mode);
123 
124 #if defined(CRAM_IO_CUSTOM_BUFFERING)
125 /*! Open CRAM file for reading via callbacks.
126  * Only declared when CRAM_IO_CUSTOM_BUFFERING is defined.
127  *
128  * @return Returns scram pointer on success
129  *         NULL on failure
130  */
131 scram_fd *scram_open_cram_via_callbacks(
132     char const * filename,
133     cram_io_allocate_read_input_t   callback_allocate_function,
134     cram_io_deallocate_read_input_t callback_deallocate_function,
135     size_t const bufsize
136 );
137 #endif
138 
139 /*! Closes a scram_fd handle.
140  *
141  * @return
142  * Returns 0 on success;
143  *        -1 on failure
144  */
145 int scram_close(scram_fd *fd);
146 
147 
148 /*! Returns the SAM_hdr struct associated with fd.
149  *
150  * @return
151  * The SAM_hdr struct on success; NULL on failure.
152  */
153 SAM_hdr *scram_get_header(scram_fd *fd);
154 
155 
156 /*! Sets the SAM_hdr struct.
157  *
158  * Note that this sets the raw pointer and does not take an internal
159  * copy of it. If you need fd to have its own copy, call sam_hdr_dup() first.
160  */
161 void scram_set_header(scram_fd *fd, SAM_hdr *sh);
162 
163 
164 /*! Writes the SAM header.
165  *
166  * This calls the appropriate SAM, BAM or CRAM I/O function to write
167  * out the SAM_hdr currently associated with this fd.
168  *
169  * @return
170  * Returns 0 on success;
171  *        -1 on failure
172  */
173 int scram_write_header(scram_fd *fd);
174 
175 
176 /*! Returns the reference sequence array.
177  *
178  * Note: this only works for CRAM files.
179  *
180  * @return
181  * Returns the refs structure on success;
182  *         NULL on failure.
183  *
184  * After failure, check with scram_eof(fd) to see whether a genuine
185  * error occurred or whether we hit the end of file.
186  */
187 refs_t *scram_get_refs(scram_fd *fd);
188 
189 
190 /*! Sets the reference sequence array for fd.
191  *
192  * Note: this only works for CRAM files.
193  */
194 void scram_set_refs(scram_fd *fd, refs_t *refs);
195 
196 
197 /*!
198  * Replaces the FILE* input interface with an explicit buffer to decode
199  * from. Presumably buf holds size bytes and is not copied -- TODO confirm.
200  *
201  * @return 0 on success;
202  *         -1 on failure
203  */
204 int scram_input_buffer(scram_fd *fd, unsigned char *buf, size_t size);
205 
206 
207 /*! Fetches the next sequence and returns it in BAM format.
208  *
209  * This reads a new sequence line from fd and returns it in the BAM
210  * in-memory format, regardless of whether the input file was SAM, BAM
211  * or CRAM.
212  *
213  * @param bsp A pointer to a bam_seq_t*, as our usual bam_seq_t
214  * structure pointer may be reallocated internally by this
215  * function. It is permitted to pass in the address of a bam_seq_t*
216  * that points to NULL. This behaviour differs from the Samtools API due
217  * to the bam_seq_t structure being a single contiguous block of
218  * memory instead of in two halves: the static and variable "data"
219  * components.
220  *
221  * Note: For maximum speed of CRAM I/O you may wish to use the cram
222  * specific layer and return cram_record objects instead.
223  *
224  * @return
225  * Returns 0 on success and fills out bsp;
226  *        -1 on failure
227  */
228 int scram_get_seq(scram_fd *fd, bam_seq_t **bsp);
229 
230 /*! @deprecated Please use scram_get_seq() instead. */
231 int scram_next_seq(scram_fd *fd, bam_seq_t **bsp);
232 
233 
234 /*! Writes a BAM-encoded bam_seq_t to fd.
235  *
236  * @return
237  * Returns 0 on success;
238  *        -1 on failure
239  */
240 int scram_put_seq(scram_fd *fd, bam_seq_t *s);
241 
242 
243 /*! Sets a CRAM option on fd.
244  *
245  * This is only supported for CRAM files currently. NOTE(review): the
246  * variadic arguments presumably carry the value for opt -- confirm types.
247  * @return
248  * Returns 0 on success;
249  *        -1 on failure
250  */
251 int scram_set_option(scram_fd *fd, enum cram_option opt, ...);
252 
253 /*! Returns the line number of fd when processing a SAM file.
254  *
255  * @return
256  * Returns line number if input is SAM;
257  *         0 for CRAM / BAM input.
258  */
259 int scram_line(scram_fd *fd);
260 
261 
262 /*! Advises the memory allocator of CRAM usage patterns.
263  *
264  * CRAM decoding will typically allocate & deallocate blocks for each
265  * slice.  Under certain conditions this can cause a large number of
266  * page faults where malloc gives a page back to the OS (free) and
267  * then requests it again (the next malloc).  We could write our own
268  * memory cache layer on top of malloc to keep track of previously
269  * freed blocks, but it is complex in a multi-threaded environment and
270  * arguably this is what malloc does anyway.
271  *
272  * Under GNU malloc we can simply request that it does not give memory
273  * back to the OS unless it is a larger amount.
274  */
275 void scram_init(void);
276 #ifdef __cplusplus
277 }
278 #endif
279 
280 #endif /* _SCRAM_H_ */
281