1 /* 2 * Copyright (c) 2013 Genome Research Ltd. 3 * Author(s): James Bonfield 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions are met: 7 * 8 * 1. Redistributions of source code must retain the above copyright notice, 9 * this list of conditions and the following disclaimer. 10 * 11 * 2. Redistributions in binary form must reproduce the above 12 * copyright notice, this list of conditions and the following 13 * disclaimer in the documentation and/or other materials provided 14 * with the distribution. 15 * 16 * 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger 17 * Institute nor the names of its contributors may be used to endorse 18 * or promote products derived from this software without specific 19 * prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS 22 * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 23 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 24 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH 25 * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 /*! \file 35 * Generic SAM/BAM/CRAM interface. 36 * 37 * This file implements a higher level scram_*() API for programs that 38 * wish to be file format agnostic. 39 */ 40 41 #ifndef _SCRAM_H_ 42 #define _SCRAM_H_ 43 44 #ifdef __cplusplus 45 extern "C" { 46 #endif 47 48 #ifdef HAVE_CONFIG_H 49 #include "io_lib_config.h" 50 #endif 51 52 #include "io_lib/bam.h" 53 #include "io_lib/cram.h" 54 55 /*! The primary file handle for reading and writing. */ 56 typedef struct { 57 int is_bam; 58 int eof; 59 union { 60 bam_file_t *b; 61 cram_fd *c; 62 }; 63 64 /* Primary Input/Output buffer */ 65 unsigned char *buf; 66 size_t alloc; 67 size_t used; 68 FILE *fp; // copy of file handle. 69 70 t_pool *pool; 71 } scram_fd; 72 73 /* 74 * An input stream in SCRAM is a large block of memory which we periodically 75 * fread into. 76 * 77 * This input stream is then broken down into chunks of appropriate size 78 * as used by the underlying format. The only tricky bit here is the first 79 * portion (opening the underlying format) can use an unknown amount of 80 * buffer due to the BAM header being variable length. 81 * 82 * Once we have this, scram_next_input() will return the next natural 83 * chunk from the input buffer. This permits a single input buffer being 84 * divided into multiple scram_buffers to pass to separate threads for 85 * decoding. 86 */ 87 typedef struct { 88 unsigned char *buf; 89 size_t alloc; // allocated size of buf 90 size_t size; // size loaded 91 size_t usize; // size usable by the underlying format 92 } scram_buffer_t; 93 94 /*!@return 95 * Returns 0 if not at end of file 96 * 1 if we hit an expected EOF (end of range or EOF block) 97 * 2 for other EOF (end of stream without EOF block) 98 */ 99 #define scram_eof(fd) ((fd)->eof) 100 101 102 /*! Opens a file. 103 * 104 * If reading we look for the following mode parameters: 105 * - r => Try SAM/BAM first, if fail try CRAM 106 * - rb => BAM 107 * - rc => CRAM 108 * 109 * If writing we look at the mode parameter: 110 * - w => SAM 111 * - wb => BAM 112 * - wc => CRAM 113 * 114 * Additionally we can specify the compression level when writing 115 * after the file type character, as 0 to 9. Eg "wb9" for maximum 116 * compression of BAM or "wc0" for uncompressed CRAM. 117 * 118 * @return 119 * Returns scram pointer on success 120 * NULL on failure 121 */ 122 scram_fd *scram_open(const char *filename, const char *mode); 123 124 #if defined(CRAM_IO_CUSTOM_BUFFERING) 125 /* 126 * Open CRAM file for reading via callbacks 127 * 128 * Returns scram pointer on success 129 * NULL on failure 130 */ 131 scram_fd *scram_open_cram_via_callbacks( 132 char const * filename, 133 cram_io_allocate_read_input_t callback_allocate_function, 134 cram_io_deallocate_read_input_t callback_deallocate_function, 135 size_t const bufsize 136 ); 137 #endif 138 139 /*! Closes a scram_fd handle 140 * 141 * @return 142 * Returns 0 on success; 143 * -1 on failure 144 */ 145 int scram_close(scram_fd *fd); 146 147 148 /*! Returns the SAM_hdr struct. 149 * 150 * @return 151 * The SAM_hdr struct on success; NULL on failure. 152 */ 153 SAM_hdr *scram_get_header(scram_fd *fd); 154 155 156 /*! Sets the SAM_hdr struct. 157 * 158 * Note that this sets the raw pointer and does not take an internal 159 * copy of it. If you need to do this call sam_hdr_dup() first. 160 */ 161 void scram_set_header(scram_fd *fd, SAM_hdr *sh); 162 163 164 /*! Writes the SAM hdr. 165 * 166 * This calls the appropriate SAM, BAM or CRAM I/O function to write 167 * out the SAM_hdr currently associated with this fd. 168 * 169 * @return 170 * Returns 0 on success; 171 * -1 on failure 172 */ 173 int scram_write_header(scram_fd *fd); 174 175 176 /*! Returns the reference sequence array. 177 * 178 * Note: this only works for CRAM files. 179 * 180 * @return 181 * Returns the refs structure on success; 182 * NULL on failure. 183 * 184 * After failure, check with scram_eof(fd) to see whether an genuine 185 * error occurred or whether we hit the end of file. 186 */ 187 refs_t *scram_get_refs(scram_fd *fd); 188 189 190 /*! Sets the reference sequence array. 191 * 192 * Note: this only works for CRAM files. 193 */ 194 void scram_set_refs(scram_fd *fd, refs_t *refs); 195 196 197 /*! 198 * Replaces the FILE* input interface with an explicit buffer to decode 199 * from. 200 * 201 * @Returns 0 on success; 202 * -1 on failure 203 */ 204 int scram_input_buffer(scram_fd *fd, unsigned char *buf, size_t size); 205 206 207 /*! Fetches the next sequence and returns it in BAM format. 208 * 209 * This reads a new sequence line from fd and returns it in the BAM 210 * in-memory format, regardless of whether the input file was SAM, BAM 211 * or CRAM. 212 * 213 * @param bsp bsp is a pointer to a bam_seq_t*, as our usual bam_seq_t 214 * structure pointer may be reallocated internally by this 215 * function. It is permitted to pass in the address of a bam_seq_t* 216 * that points to NULL. This behaviour differs to the Samtools API due 217 * to the bam_seq_t structure being a single contiguous block of 218 * memory instead of in two halves; the static and variable "data" 219 * component. 220 * 221 * Note: For maximum speed of CRAM I/O you may wish to use the cram 222 * specific layer and return cram_record objects instead. 223 * 224 * @return 225 * Returns 0 on success and fills out bsp; 226 * -1 on failure 227 */ 228 int scram_get_seq(scram_fd *fd, bam_seq_t **bsp); 229 230 /*! Deprecated: please use scram_get_seq() instead */ 231 int scram_next_seq(scram_fd *fd, bam_seq_t **bsp); 232 233 234 /*! Writes a BAM encoded bam_seq_t to fd. 235 * 236 * @return 237 * Returns 0 on success; 238 * -1 on failure 239 */ 240 int scram_put_seq(scram_fd *fd, bam_seq_t *s); 241 242 243 /*! Sets a CRAM option on fd. 244 * 245 * This is only supported for CRAM files currently. 246 * 247 * @return 248 * Returns 0 on success; 249 * -1 on failure 250 */ 251 int scram_set_option(scram_fd *fd, enum cram_option opt, ...); 252 253 /*! Returns the line number when processing a SAM file 254 * 255 * @return 256 * Returns line number if input is SAM; 257 * 0 for CRAM / BAM input. 258 */ 259 int scram_line(scram_fd *fd); 260 261 262 /*! Advises the memory allocator of CRAM usage patterns 263 * 264 * CRAM decoding will typically allocate & deallocate blocks for each 265 * slice. Under certain conditions this can cause a large number of 266 * page faults where malloc gives a page back to the OS (free) and 267 * then requests it again (the next malloc). We could write our own 268 * memory cache layer on top of malloc to keep track of previously 269 * freed blocks, but it is complex in a multi-threaded environment and 270 * arguably this is what malloc does anyway. 271 * 272 * Under GNU malloc we can simply request it doesn't give back memory 273 * unless it is a larger amount. 274 */ 275 void scram_init(void); 276 #ifdef __cplusplus 277 } 278 #endif 279 280 #endif /* _SCRAM_H_ */ 281