1 /* @include ajseqbam **********************************************************
2 **
3 ** AJAX BAM format sequence processing functions
4 **
5 ** These functions control all aspects of AJAX BAM file processing
6 **
7 ** @author Copyright (C) 2010 Peter Rice ported from samtools
8 ** @version $Revision: 1.21 $
9 ** @modified 2010-2011 Peter Rice
10 ** @modified $Date: 2012/07/02 17:24:52 $ by $Author: rice $
11 ** @@
12 **
13 ** This library is free software; you can redistribute it and/or
14 ** modify it under the terms of the GNU Lesser General Public
15 ** License as published by the Free Software Foundation; either
16 ** version 2.1 of the License, or (at your option) any later version.
17 **
18 ** This library is distributed in the hope that it will be useful,
19 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
20 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
21 ** Lesser General Public License for more details.
22 **
23 ** You should have received a copy of the GNU Lesser General Public
24 ** License along with this library; if not, write to the Free Software
25 ** Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
26 ** MA  02110-1301,  USA.
27 **
28 ******************************************************************************/
29 
30 /* The MIT License
31 **
32 **   Copyright (c) 2008 Genome Research Ltd (GRL).
33 **
34 **   Permission is hereby granted, free of charge, to any person obtaining
35 **   a copy of this software and associated documentation files (the
36 **   "Software"), to deal in the Software without restriction, including
37 **   without limitation the rights to use, copy, modify, merge, publish,
38 **   distribute, sublicense, and/or sell copies of the Software, and to
39 **   permit persons to whom the Software is furnished to do so, subject to
40 **   the following conditions:
41 **
42 **   The above copyright notice and this permission notice shall be
43 **   included in all copies or substantial portions of the Software.
44 **
45 **   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
46 **   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
47 **   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
48 **   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
49 **   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
50 **   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
51 **   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
52 **   SOFTWARE.
53 */
54 
55 /* Contact: Heng Li <lh3@sanger.ac.uk> */
56 
57 /*
58 ** much modified for EMBOSS by Peter Rice pmr@ebi.ac.uk May 2010
59 ** lists changed to AjPList
60 ** hashes changed to AjPTable
61 ** strings changed to AjPStr
62 ** fixed-length datatypes int32_t etc. changed to EMBOSS types
63 */
64 
65 #ifndef AJSEQBAM_H
66 #define AJSEQBAM_H
67 
68 /*
69 **  BAM library provides I/O and various operations on manipulating files
70 **  in the BAM (Binary Alignment/Mapping) or SAM (Sequence Alignment/Map)
71 **  format. It now supports importing from or exporting to TAM, sorting,
72 **  merging, generating pileup, and quickly retrieval of reads overlapped
73 **  with a specified region.
74 **
75 **  copyright Genome Research Ltd.
76 */
77 
78 /* ========================================================================= */
79 /* ============================= include files ============================= */
80 /* ========================================================================= */
81 
82 #include "ajdefine.h"
83 #include "ajlist.h"
84 #include "ajtable.h"
85 
86 #include <stdlib.h>
87 #include <string.h>
88 #include <stdio.h>
89 
90 #include "zlib.h"
91 
92 AJ_BEGIN_DECLS
93 
94 
95 
96 
97 /* ========================================================================= */
98 /* =============================== constants =============================== */
99 /* ========================================================================= */
100 
101 
102 
103 
104 #ifdef WIN32
105 #define inline __inline
106 #endif /* WIN32 */
107 
108 /* The read is paired in sequencing, no matter whether it is mapped in a pair */
109 #define BAM_FPAIRED        1
110 
111 /* The read is mapped in a proper pair */
112 #define BAM_FPROPER_PAIR   2
113 
114 /* The read itself is unmapped; conflictive with BAM_FPROPER_PAIR */
115 #define BAM_FUNMAP         4
116 
117 /* The mate is unmapped */
118 #define BAM_FMUNMAP        8
119 
120 /* The read is mapped to the reverse strand */
121 #define BAM_FREVERSE      16
122 
123 /* The mate is mapped to the reverse strand */
124 #define BAM_FMREVERSE     32
125 
126 /* This is read1 */
127 #define BAM_FREAD1        64
128 
129 /* This is read2 */
130 #define BAM_FREAD2       128
131 
132 /* Not primary alignment */
133 #define BAM_FSECONDARY   256
134 
135 /* QC failure */
136 #define BAM_FQCFAIL      512
137 
138 /* Optical or PCR duplicate */
139 #define BAM_FDUP        1024
140 
141 
142 #define BAM_OFDEC          0
143 #define BAM_OFHEX          1
144 #define BAM_OFSTR          2
145 
146 
147 /* Default mask for pileup */
148 #define BAM_DEF_MASK (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP)
149 
150 #define BAM_CORE_SIZE   sizeof(AjOSeqBamCore)
151 
152 /*
153 ** Describing how CIGAR operation/length is packed in a 32-bit integer.
154 */
155 
156 #define BAM_CIGAR_SHIFT 4
157 #define BAM_CIGAR_MASK  ((1 << BAM_CIGAR_SHIFT) - 1)
158 
159 /*
160 **  CIGAR operations.
161 */
162 
163 /* CIGAR: M match or mismatch */
164 #define BAM_CMATCH      0
165 
166 /* CIGAR: I insertion to the reference */
167 #define BAM_CINS        1
168 
169 /* CIGAR: D deletion from the reference */
170 #define BAM_CDEL        2
171 
172 /* CIGAR: N skip on the reference (e.g. spliced alignment) */
173 #define BAM_CREF_SKIP   3
174 
175 /* CIGAR: S clip on the read with clipped sequence present in qseq */
176 #define BAM_CSOFT_CLIP  4
177 
178 /* CIGAR: H clip on the read with clipped sequence trimmed off */
179 #define BAM_CHARD_CLIP  5
180 
181 /* CIGAR: P padding */
182 #define BAM_CPAD        6
183 
184 /* CIGAR: = match */
185 #define BAM_CEQUAL        7
186 
187 /* CIGAR: X mismatch */
188 #define BAM_CDIFF        8
189 
190 
191 extern const char* cigarcode;
192 extern const char* bam_nt16_rev_table;
193 
194 
195 
196 
197 /* ========================================================================= */
198 /* ============================== public data ============================== */
199 /* ========================================================================= */
200 
201 
202 
203 
204 /* @data AjPSeqBamBgzf ********************************************************
205 **
206 ** BGZF file handling object
207 **
208 ** @alias AjOSeqBamBgzf
209 ** @alias AjSSeqBamBgzf
210 **
211 ** @attr file [FILE*] File object
212 ** @attr cache [AjPTable] Block cache
213 ** @attr uncompressed_block [void*] Uncompressed block data
214 ** @attr compressed_block [void*] Compressed block data
215 ** @attr error [const char*] Error description
216 ** @attr block_address [ajlong] Block offset
217 ** @attr file_descriptor [int] File descriptor
218 ** @attr cache_size [int] Cache size
219 ** @attr uncompressed_block_size [int] Uncompressed block size
220 ** @attr compressed_block_size [int] Compressed block size
221 ** @attr block_length [int] Block length
222 ** @attr block_offset [int] Block offset
223 ** @attr open_mode [char] Open_mode 'r' or 'w'
224 ** @attr owned_file [char] Boolean
225 ** @attr is_uncompressed [char] Boolean
226 ** @attr Padding [char[5]] Padding
227 **
228 ******************************************************************************/
229 
230 typedef struct AjSSeqBamBgzf
231 {
232     FILE* file;
233     AjPTable cache;
234     void* uncompressed_block;
235     void* compressed_block;
236     const char* error;
237     ajlong block_address;
238     int file_descriptor;
239     int cache_size;
240     int uncompressed_block_size;
241     int compressed_block_size;
242     int block_length;
243     int block_offset;
244     char open_mode;
245     char owned_file;
246     char is_uncompressed;
247     char Padding[5];
248 } AjOSeqBamBgzf;
249 
250 #define AjPSeqBamBgzf AjOSeqBamBgzf*
251 
252 
253 
254 
255 #define BAM_VIRTUAL_OFFSET16
256 
257 
258 
259 
260 /* #abstract BAM file handler */
261 
262 /* @data AjPSeqBamHeader ******************************************************
263 **
264 ** BAM alignment file header data
265 **
266 ** @attr  target_name [char**] names of the reference sequences
267 ** @attr  target_len  [ajuint*] lengths of the reference sequences
268 ** @attr  dict        [AjPList] header dictionary
269 ** @attr  hash        [AjPTable] hash table for fast name lookup
270 ** @attr  rg2lib      [AjPTable] hash table for @RG-ID -> LB lookup
271 ** @attr  text        [char*] plain text
272 ** @attr  n_targets   [ajint] number of reference sequences
273 ** @attr  l_text      [ajint] length of the plain text in the header
274 **
275 ** @@
276 ** discussion Field hash points to null by default. It is a private
277 **  member.
278 ******************************************************************************/
279 
280 typedef struct AjSSeqBamheader
281 {
282     char **target_name;
283     ajuint *target_len;
284     AjPList dict;
285     AjPTable hash;
286     AjPTable rg2lib;
287     char *text;
288     ajint n_targets;
289     ajint l_text;
290 } AjOSeqBamHeader;
291 
292 #define AjPSeqBamHeader AjOSeqBamHeader*
293 
294 
295 
296 
297 /* @data AjPSeqBamCore ********************************************************
298 **
299 ** Structure for core alignment information.
300 **
301 ** @attr  tid     [ajint]  read ID, defined by AjPSeqBamheader
302 ** @attr  pos     [ajint]  0-based leftmost coordinate
303 ** @attr  bin     [ajushort]  bin calculated by ajSeqBamReg2bin()
304 ** @attr  qual    [unsigned char]  mapping quality
305 ** @attr  l_qname [unsigned char]  length of the query name
306 ** @attr  flag    [ajushort]  bitwise flag
307 ** @attr  n_cigar [ajushort]  number of CIGAR operations
308 ** @attr  l_qseq  [ajint]  length of the query sequence (read)
309 ** @attr  mtid    [ajint]  paired read (mate) ID
310 ** @attr  mpos    [ajint]  paired read (mate) position
311 ** @attr  isize   [ajint]  insert size for paired reads
312 ******************************************************************************/
313 
314 typedef struct AjSBamSeqCore
315 {
316     ajint tid;
317     ajint pos;
318     ajushort bin;
319     unsigned char qual;
320     unsigned char l_qname;
321     ajushort flag;
322     ajushort n_cigar;
323     ajint l_qseq;
324     ajint mtid;
325     ajint mpos;
326     ajint isize;
327 } AjOSeqBamCore;
328 
329 #define AjPSeqBamCore AjOSeqBamCore*
330 
331 
332 
333 
334 /* @data AjPSeqBam ************************************************************
335 **
336 ** Structure for one alignment.
337 **
338 ** @alias AjSSeqBam
339 ** @alias AjOSeqBam
340 **
341 ** @attr  core      [AjOSeqBamCore]  core information about the alignment
342 ** @attr  data      [unsigned char*] all variable-length data, concatenated;
343 **                             structure: cigar-qname-seq-qual-aux
344 ** @attr  l_aux      [int]  length of auxiliary data
345 ** @attr  data_len   [int]  current length of data
346 ** @attr  m_data     [int]  maximum reserved size of data
347 ** @attr  Padding    [int]  Padding to alignment boundary
348 **
349 ** @@
350 ** discussion Notes:
351 **
352 **   1. qname is zero tailed and core.l_qname includes the tailing '\0'.
353 **
354 **   2. l_qseq is calculated from the total length of an alignment block
355 **      on reading or from CIGAR.
356 ******************************************************************************/
357 
358 typedef struct AjSSeqBam
359 {
360     AjOSeqBamCore core;
361     unsigned char *data;
362     int l_aux;
363     int data_len;
364     int m_data;
365     int Padding;
366 } AjOSeqBam;
367 
368 #define AjPSeqBam AjOSeqBam*
369 
370 
371 
372 /* ========================================================================= */
373 /* =========================== public functions ============================ */
374 /* ========================================================================= */
375 
376 
377 
378 
379 #define MAJSEQBAMSTRAND(b) (((b)->core.flag&BAM_FREVERSE) != 0)
380 #define MAJSEQBAMMSTRAND(b) (((b)->core.flag&BAM_FMREVERSE) != 0)
381 
382 
383 
384 
385 /*
386 **  Get the CIGAR array
387 **  param  b  pointer to an alignment
388 **  return    pointer to the CIGAR array
389 **
390 **  In the CIGAR array, each element is a 32-bit integer. The
391 **  lower 4 bits gives a CIGAR operation and the higher 28 bits keep the
392 **  length of a CIGAR.
393 */
394 #define MAJSEQBAMCIGAR(b) ((ajuint*)((b)->data + (b)->core.l_qname))
395 
396 
397 /*
398 **  Get the name of the query
399 **  param  b  pointer to an alignment
400 **  return    pointer to the name string, null terminated
401 */
402 #define MAJSEQBAMQNAME(b) ((char*)((b)->data))
403 
404 
405 /*
406 **  Get query sequence
407 **  param  b  pointer to an alignment
408 **  return    pointer to sequence
409 **
410 **  Each base is encoded in 4 bits: 1 for A, 2 for C, 4 for G,
411 **  8 for T and 15 for N. Two bases are packed in one byte with the base
412 **  at the higher 4 bits having smaller coordinate on the read. It is
413 **  recommended to use bam1_seqi() macro to get the base.
414 */
415 #define MAJSEQBAMSEQ(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname)
416 
417 
418 /*
419 **  Get query quality
420 **  param  b  pointer to an alignment
421 **  return    pointer to quality string
422 */
423 #define MAJSEQBAMQUAL(b) ((b)->data + (b)->core.n_cigar*4 +             \
424                           (b)->core.l_qname + ((b)->core.l_qseq + 1)/2)
425 
426 /*
427 **  Get a base on read
428 **  param  s  Query sequence returned by bam1_seq()
429 **  param  i  The i-th position, 0-based
430 **  return    4-bit integer representing the base.
431 */
432 #define MAJSEQBAMSEQI(s, i) ((s)[(i)/2] >> 4*(1-(i)%2) & 0xf)
433 
434 /*
435 **  Get pointer to the list of auxiliary data
436 **  param  b  pointer to an alignment
437 **  return    pointer to the concatenated auxiliary data
438 */
439 #define MAJSEQBAMAUX(b) ((b)->data + (b)->core.n_cigar*4 + \
440 			 (b)->core.l_qname + \
441 		         (b)->core.l_qseq + ((b)->core.l_qseq + 1)/2)
442 
443 #ifndef kroundup32
444 /*
445 **  Round an integer to the next closest power-2 integer.
446 **  param  x  integer to be rounded (in place)
447 **  x will be modified.
448 */
449 #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4,    \
450                        (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
451 #endif /* !kroundup32 */
452 
453 
454 
455 
456 /*
457 ** Prototype definitions
458 */
459 
460 void ajSeqBamAuxAppend(AjPSeqBam b, const char tag[2],
461 		       char type, int len, const unsigned char* data);
462 
463 AjPSeqBamBgzf ajSeqBamBgzfNew(FILE* file, const char* mode);
464 
465 AjPSeqBamBgzf ajSeqBamBgzfOpenfd(int fd, const char *mode);
466 AjPSeqBamBgzf ajSeqBamBgzfOpenC(const char* path, const char *mode);
467 
468 ajuint ajSeqBamCalend(const AjPSeqBamCore c, const ajuint *cigar);
469 
470 int ajSeqBamBgzfClose(AjPSeqBamBgzf fp);
471 int ajSeqBamBgzfEof(AjPSeqBamBgzf fp);
472 int ajSeqBamBgzfFlush(AjPSeqBamBgzf fp);
473 int ajSeqBamBgzfRead(AjPSeqBamBgzf fp, void* data, int length);
474 ajlong ajSeqBamBgzfSeek(AjPSeqBamBgzf fp, ajlong pos, int where);
475 AjBool ajSeqBamBgzfSetInfile(AjPSeqBamBgzf gzfile, AjPFile outf);
476 AjBool ajSeqBamBgzfSetOutfile(AjPSeqBamBgzf gzfile, AjPFile outf);
477 int ajSeqBamBgzfWrite(AjPSeqBamBgzf fp, const void* data, int length);
478 
479 void ajSeqBamDel(AjPSeqBam *Pbam);
480 
481 AjPSeqBamHeader ajSeqBamHeaderNew(void);
482 AjPSeqBamHeader ajSeqBamHeaderNewN(ajint n);
483 AjPSeqBamHeader ajSeqBamHeaderNewTextC(const char* txt);
484 AjPSeqBamHeader ajSeqBamHeaderRead(AjPSeqBamBgzf gzfile);
485 void ajSeqBamHeaderDel(AjPSeqBamHeader *Pheader);
486 int ajSeqBamHeaderWrite(AjPSeqBamBgzf fp, const AjPSeqBamHeader header);
487 AjPTable ajSeqBamHeaderGetRefseqTags(const AjPSeqBamHeader header);
488 AjPTable ajSeqBamHeaderGetReadgroupTags(const AjPSeqBamHeader header);
489 const char* ajSeqBamHeaderGetSortorder(const AjPSeqBamHeader header);
490 void ajSeqBamHeaderSetTextC(AjPSeqBamHeader header, const char* txt);
491 
492 int ajSeqBamRead(AjPSeqBamBgzf fp, AjPSeqBam b);
493 
494 int ajSeqBamReg2bin(ajuint beg, ajuint end);
495 
496 int ajSeqBamValidate(const AjPSeqBamHeader header, const AjPSeqBam b);
497 int ajSeqBamWrite(AjPSeqBamBgzf fp, const AjPSeqBam b);
498 
499 const char *ajSeqBamGetLibrary(AjPSeqBamHeader header, const AjPSeqBam b);
500 
501 AjPTable ajSeqBamHeaderLineParse(const char *headerLine);
502 
503 /*
504 ** End of prototype definitions
505 */
506 
507 
508 
509 
510 AJ_END_DECLS
511 
512 #endif /* !AJSEQBAM_H */
513