1 /* @source ajseqbam ***********************************************************
2 **
3 ** AJAX BAM format sequence processing functions
4 **
5 ** These functions control all aspects of AJAX BAM file processing
6 **
7 ** @author Copyright (C) 2010 Peter Rice ported from samtools
8 ** @version $Revision: 1.36 $
9 ** @modified 2010-2011 Peter Rice
10 ** @modified $Date: 2012/06/27 08:07:40 $ by $Author: rice $
11 ** @@
12 **
13 ** This library is free software; you can redistribute it and/or
14 ** modify it under the terms of the GNU Lesser General Public
15 ** License as published by the Free Software Foundation; either
16 ** version 2.1 of the License, or (at your option) any later version.
17 **
18 ** This library is distributed in the hope that it will be useful,
19 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
20 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 ** Lesser General Public License for more details.
22 **
23 ** You should have received a copy of the GNU Lesser General Public
24 ** License along with this library; if not, write to the Free Software
25 ** Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
26 ** MA 02110-1301, USA.
27 **
28 ******************************************************************************/
29
30
31 #include "ajlib.h"
32
33 #include "ajseqbam.h"
34 #include "ajassert.h"
35 #include "ajlist.h"
36 #include "ajtable.h"
37 #include "ajutil.h"
38 #include "ajsys.h"
39
40
41 #include <stdio.h>
42 #include <ctype.h>
43 #include <errno.h>
44 #include <fcntl.h>
45
46
47 #ifndef WIN32
48 #include <sys/file.h>
49 #else /* WIN32 */
50 #include <io.h>
51 #define open _open
52 #define fileno _fileno
53 #endif /* !WIN32 */
54
55
56
57 /* Don't define this as a const static int if used in array def */
58 #define AJSEQBAM_BLOCK_HEADER_LENGTH 18
59
60
61 const char *cigarcode = "MIDNSHP";
62 const char *bam_nt16_rev_table = "=ACMGRSVTWYHKDBN";
63
64 static AjBool bamBigendian = ajFalse;
65 static AjBool bamInitialized = ajFalse;
66
67 const char *o_hd_tags[] = {"SO","GO",NULL};
68 const char *r_hd_tags[] = {"VN",NULL};
69
70 const char *o_sq_tags[] = {"AS","M5","UR","SP",NULL};
71 const char *r_sq_tags[] = {"SN","LN",NULL};
72 const char *u_sq_tags[] = {"SN",NULL};
73
74 const char *o_rg_tags[] = {"LB","DS","PU","PI","CN","DT","PL",NULL};
75 const char *r_rg_tags[] = {"ID",NULL};
76 const char *u_rg_tags[] = {"ID",NULL};
77
78 const char *o_pg_tags[] = {"VN","CL",NULL};
79 const char *r_pg_tags[] = {"ID",NULL};
80
81 const char *types[] = {"HD","SQ","RG","PG","CO",NULL};
82 const char **optional_tags[] = {o_hd_tags,o_sq_tags,o_rg_tags,o_pg_tags,NULL,
83 NULL};
84 const char **required_tags[] = {r_hd_tags,r_sq_tags,r_rg_tags,r_pg_tags,NULL,
85 NULL};
86 const char **unique_tags[] = {NULL, u_sq_tags,u_rg_tags,NULL,NULL,NULL};
87
88
89 unsigned char bam_nt16_table[256] =
90 {
91 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
92 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
93 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
94 1, 2, 4, 8, 15,15,15,15, 15,15,15,15, 15, 0 /*=*/,15,15,
95 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
96 15,15, 5, 6, 8,15, 7, 9, 15,10,15,15, 15,15,15,15,
97 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
98 15,15, 5, 6, 8,15, 7, 9, 15,10,15,15, 15,15,15,15,
99 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
100 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
101 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
102 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
103 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
104 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
105 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
106 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15
107 };
108
109
110
111
112 /* @datastatic BamPHeaderTag **************************************************
113 **
114 ** BAM header tag
115 **
116 ** @attr value [char*] Value
117 ** @attr key [char[2]] Key
118 ** @attr Padding [char[6]] Padding for alignment
119 ******************************************************************************/
120
121 typedef struct BamSHeaderTag
122 {
123 char *value;
124 char key[2];
125 char Padding[6];
126 } BamOHeaderTag;
127
128 #define BamPHeaderTag BamOHeaderTag*
129
130
131
132
133 /* @datastatic BamPHeaderLine *************************************************
134 **
135 ** BAM header line
136 **
137 ** @attr tags [AjPList] List of tags
138 ** @attr type [char[2]] Type
139 ** @attr Padding [char[6]] Padding for alignment
140 ******************************************************************************/
141
142 typedef struct BamSHeaderLine
143 {
144 AjPList tags;
145 char type[2];
146 char Padding[6];
147 } BamOHeaderLine;
148
149 #define BamPHeaderLine BamOHeaderLine*
150
151 const char *bam_flag2char_table = "pPuUrR12sfd\0\0\0\0\0";
152
153 static AjPSeqBamBgzf bamBgzfNew(void);
154 static void bamCacheFree(AjPSeqBamBgzf fp);
155 static int bamCacheLoadBlock(AjPSeqBamBgzf fp, ajlong block_address);
156 static int bamDeflateBlock(AjPSeqBamBgzf fp, int block_length);
157 static void bamDestroyHeaderHash(AjPSeqBamHeader header);
158 static unsigned char *bamGetAux(const AjPSeqBam b, const char tag[2]);
159 static int bamHeaderCheck(const char *header);
160 static void bamHeaderFree(AjPList *header);
161 static BamPHeaderTag bamHeaderLineHasTag(const BamPHeaderLine hline,
162 const char *key);
163 static BamPHeaderLine bamHeaderLineParse(const char *headerLine);
164 static AjPTable bamHeaderLineToTablemap(const BamPHeaderLine hline);
165 static void bamHeaderLineFree(BamPHeaderLine *hline);
166 static int bamHeaderLineValidate(BamPHeaderLine hline);
167 static AjPList bamHeaderParse2(const char *headerText);
168 static AjPTable bamHeaderTotable(const AjPList _dict, const char type[2],
169 const char key_tag[2],
170 const char value_tag[2]);
171 static int bamInflateBlock(AjPSeqBamBgzf fp, int block_length);
172 static int bamReadBlock(AjPSeqBamBgzf fp);
173 static void bamSwapEndianData(const AjPSeqBamCore c, int data_len,
174 unsigned char *data);
175 static BamPHeaderTag bamTagNew(const char *name, const char *value_from,
176 const char *value_to);
177
178 static const char* bamNextline(char **lineptr, size_t *n, const char *text);
179 static int bamTagExists(const char *tag, const char **tags);
180
181
182
183
184 /* @func ajSeqBamCalend *****************************************************
185 **
186 ** Calculate the rightmost coordinate of an alignment on the reference genome
187 **
188 ** @param [r] c [const AjPSeqBamCore] pointer to the AjPSeqBamCore structure
189 ** @param [r] cigar [const ajuint*] the corresponding CIGAR array
190 ** @return [ajuint] the rightmost coordinate, 0-based
191 **
192 ** @release 6.5.0
193 ******************************************************************************/
194
ajSeqBamCalend(const AjPSeqBamCore c,const ajuint * cigar)195 ajuint ajSeqBamCalend(const AjPSeqBamCore c, const ajuint *cigar)
196 {
197 ajuint k;
198 ajuint end;
199 ajint op;
200
201 end = c->pos;
202
203 for (k = 0; k < c->n_cigar; ++k)
204 {
205 op = cigar[k] & BAM_CIGAR_MASK;
206 if (op == BAM_CMATCH || op == BAM_CDIFF || op == BAM_CEQUAL
207 || op == BAM_CDEL || op == BAM_CREF_SKIP)
208 end += cigar[k] >> BAM_CIGAR_SHIFT;
209 }
210
211 return end;
212 }
213
214
215
216
217 /* @func ajSeqBamReg2bin ******************************************************
218 **
219 ** Calculate the minimum bin that contains a region [beg,end).
220 **
221 ** @param [r] beg [ajuint] start of the region, 0-based
222 ** @param [r] end [ajuint] end of the region, 0-based
223 ** @return [int] bin
224 **
225 ******************************************************************************/
226
ajSeqBamReg2bin(ajuint beg,ajuint end)227 int ajSeqBamReg2bin(ajuint beg, ajuint end)
228 {
229 --end;
230
231 if(beg>>14 == end>>14)
232 return 4681 + (beg>>14);
233
234 if(beg>>17 == end>>17)
235 return 585 + (beg>>17);
236
237 if(beg>>20 == end>>20)
238 return 73 + (beg>>20);
239
240 if(beg>>23 == end>>23)
241 return 9 + (beg>>23);
242
243 if(beg>>26 == end>>26)
244 return 1 + (beg>>26);
245
246 return 0;
247 }
248
249
250
251
252 /* #funcstatic bam_copy1 *****************************************************
253 **
254 ** Copy an alignment
255 **
256 ** #param [u] bdst [AjPSeqBam] destination alignment struct
257 ** #param [r] bsrc [const AjPSeqBam] source alignment struct
258 ** #return [AjPSeqBam] pointer to the destination alignment struct
259 **
260 ******************************************************************************/
261 /*
262 //static inline AjPSeqBam bam_copy1(AjPSeqBam bdst, const AjPSeqBam bsrc)
263 //{
264 // unsigned char *data = NULL;
265 // int m_data;
266 //
267 // /# backup data and m_data #/
268 // data = bdst->data;
269 // m_data = bdst->m_data;
270 //
271 //
272 // /# double the capacity #/
273 // if (m_data < bsrc->m_data)
274 // {
275 // m_data = bsrc->m_data; kroundup32(m_data);
276 // data = (unsigned char*)realloc(data, m_data);
277 // }
278 //
279 // /# copy var-len data #/
280 // memcpy(data, bsrc->data, bsrc->data_len);
281 //
282 // /# copy the rest #/
283 // *bdst = *bsrc;
284 //
285 // /# restore the backup #/
286 // bdst->m_data = m_data;
287 // bdst->data = data;
288 //
289 // return bdst;
290 //}
291 */
292
293
294
295
296 /* @funcstatic bam_aux_type2size **********************************************
297 **
298 ** Returns size of the array elements in optional fields of alignment records
299 **
300 ** @param [r] x [int] type of the array elements
301 ** @return [int] size of the array elements
302 **
303 ******************************************************************************/
304
bam_aux_type2size(int x)305 static inline int bam_aux_type2size(int x)
306 {
307 if (x == 'C' || x == 'c' || x == 'A')
308 return 1;
309 else if (x == 'S' || x == 's')
310 return 2;
311 else if (x == 'I' || x == 'i' || x == 'f')
312 return 4;
313 else
314 return 0;
315 }
316
317
318
319
320 /* @func ajSeqBamHeaderNew ****************************************************
321 **
322 ** Create an empty BAM header object
323 **
324 ** @return [AjPSeqBamHeader] BAM header object
325 **
326 **
327 ** @release 6.3.0
328 ******************************************************************************/
329
ajSeqBamHeaderNew(void)330 AjPSeqBamHeader ajSeqBamHeaderNew(void)
331 {
332 AjPSeqBamHeader header=NULL;
333
334 AJNEW0(header);
335
336 return header;
337 }
338
339
340
341
342 /* @func ajSeqBamHeaderNewTextC ***********************************************
343 **
344 ** Create an empty BAM header object with given header text
345 **
346 ** @param [r] txt [const char*] BAM/SAM header text string
347 ** @return [AjPSeqBamHeader] BAM header object
348 **
349 **
350 ** @release 6.3.0
351 ******************************************************************************/
352
ajSeqBamHeaderNewTextC(const char * txt)353 AjPSeqBamHeader ajSeqBamHeaderNewTextC(const char* txt)
354 {
355 AjPSeqBamHeader ret;
356
357 AJNEW0(ret);
358
359 ret->l_text = strlen(txt);
360 ret->text = ajCharNewResC(txt, ret->l_text + 1);
361
362 return ret;
363 }
364
365
366
367
368 /* @func ajSeqBamHeaderSetTextC ***********************************************
369 **
370 ** Set BAM header text
371 **
372 ** @param [u] header [AjPSeqBamHeader] BAM header
373 ** @param [r] txt [const char*] header text
374 ** @return [void]
375 **
376 **
377 ** @release 6.5.0
378 ******************************************************************************/
379
ajSeqBamHeaderSetTextC(AjPSeqBamHeader header,const char * txt)380 void ajSeqBamHeaderSetTextC(AjPSeqBamHeader header, const char* txt)
381 {
382
383 if(header->text)
384 AJFREE(header->text);
385
386 header->l_text = strlen(txt);
387 header->text = ajCharNewResC(txt, header->l_text + 1);
388
389 return;
390 }
391
392
393
394
395 /* @func ajSeqBamHeaderNewN ***************************************************
396 **
397 ** Create an empty BAM header object with n reference sequences
398 **
399 ** @param [r] n [ajint] Number of target/reference sequences
400 ** @return [AjPSeqBamHeader] BAM header object
401 **
402 **
403 ** @release 6.5.0
404 ******************************************************************************/
405
ajSeqBamHeaderNewN(ajint n)406 AjPSeqBamHeader ajSeqBamHeaderNewN(ajint n)
407 {
408 AjPSeqBamHeader ret;
409
410 AJNEW0(ret);
411
412 ret->n_targets = n;
413 AJCNEW0(ret->target_name, n);
414 AJCNEW0(ret->target_len, n);
415
416 return ret;
417 }
418
419
420
421
422 /* @func ajSeqBamDel **********************************************************
423 **
424 ** Deletes a BAM alignment object
425 **
426 ** @param [d] Pbam [AjPSeqBam*] BAM alignment object
427 ** @return [void]
428 **
429 **
430 ** @release 6.5.0
431 ******************************************************************************/
432
ajSeqBamDel(AjPSeqBam * Pbam)433 void ajSeqBamDel(AjPSeqBam *Pbam)
434 {
435 AjPSeqBam bam = NULL;
436
437 bam = *Pbam;
438
439 if(bam == 0)
440 return;
441
442 AJFREE(bam->data);
443 AJFREE(*Pbam);
444
445 return;
446 }
447
448
449
450
451 /* @func ajSeqBamHeaderDel ****************************************************
452 **
453 ** Destructor for a BAM header object
454 **
455 ** @param [d] Pheader [AjPSeqBamHeader*] BAM header object
456 ** @return [void]
457 **
458 **
459 ** @release 6.3.0
460 ******************************************************************************/
461
ajSeqBamHeaderDel(AjPSeqBamHeader * Pheader)462 void ajSeqBamHeaderDel(AjPSeqBamHeader *Pheader)
463 {
464 ajint i;
465 AjPSeqBamHeader header = NULL;
466
467 header = *Pheader;
468
469 if(header == 0)
470 return;
471
472 if(header->target_name)
473 {
474 for(i = 0; i < header->n_targets; ++i)
475 AJFREE(header->target_name[i]);
476
477 AJFREE(header->target_name);
478 AJFREE(header->target_len);
479 }
480
481 AJFREE(header->text);
482
483 if(header->dict)
484 bamHeaderFree(&header->dict);
485
486 if(header->rg2lib)
487 ajTableFree(&header->rg2lib);
488
489 bamDestroyHeaderHash(header);
490
491 AJFREE(*Pheader);
492
493 return;
494 }
495
496
497
498
499 /* @func ajSeqBamHeaderWrite **************************************************
500 **
501 ** Write a header structure to BAM
502 **
503 ** @param [u] fp [AjPSeqBamBgzf] BAM file handler
504 ** @param [r] header [const AjPSeqBamHeader] Header structure
505 ** @return [int] Always 0 currently
506 **
507 **
508 ** @release 6.3.0
509 ******************************************************************************/
510
ajSeqBamHeaderWrite(AjPSeqBamBgzf fp,const AjPSeqBamHeader header)511 int ajSeqBamHeaderWrite(AjPSeqBamBgzf fp, const AjPSeqBamHeader header)
512 {
513 char buf[4];
514 ajint i;
515 ajint name_len;
516 ajint x;
517 char *p = NULL;
518
519 if(!bamInitialized)
520 {
521 bamInitialized = ajTrue;
522 bamBigendian = ajUtilGetBigendian();
523 }
524
525 /* write "BAM1" */
526 strncpy(buf, "BAM\001", 4);
527 ajSeqBamBgzfWrite(fp, buf, 4);
528
529 /* write plain text and the number of reference sequences */
530
531 if(bamBigendian)
532 {
533 x = header->l_text;
534 ajByteRevInt(&x);
535 ajSeqBamBgzfWrite(fp, &x, 4);
536
537 if (header->l_text)
538 ajSeqBamBgzfWrite(fp, header->text, header->l_text);
539
540 x = header->n_targets;
541 ajByteRevInt(&x);
542 ajSeqBamBgzfWrite(fp, &x, 4);
543 }
544 else
545 {
546 ajSeqBamBgzfWrite(fp, &header->l_text, 4);
547
548 if(header->l_text)
549 ajSeqBamBgzfWrite(fp, header->text, header->l_text);
550
551 ajSeqBamBgzfWrite(fp, &header->n_targets, 4);
552 }
553
554 /* write sequence names and lengths */
555
556 for(i = 0; i != header->n_targets; ++i)
557 {
558 p = header->target_name[i];
559 name_len = strlen(p) + 1;
560
561 if(bamBigendian)
562 {
563 x = name_len;
564 ajByteRevInt(&x);
565 ajSeqBamBgzfWrite(fp, &x, 4);
566 }
567 else
568 ajSeqBamBgzfWrite(fp, &name_len, 4);
569
570 ajSeqBamBgzfWrite(fp, p, name_len);
571
572 if(bamBigendian)
573 {
574 x = header->target_len[i];
575 ajByteRevInt(&x);
576 ajSeqBamBgzfWrite(fp, &x, 4);
577 }
578 else
579 ajSeqBamBgzfWrite(fp, &header->target_len[i], 4);
580 }
581
582 return 0;
583 }
584
585
586
587
588 /* @funcstatic bamSwapEndianData **********************************************
589 **
590 ** Swap bigendian data in a BAM core data buffer
591 **
592 ** @param [r] c [const AjPSeqBamCore] BAM core data
593 ** @param [r] data_len [int] Data length
594 ** @param [u] data [unsigned char*] Data buffer
595 ** @return [void]
596 **
597 **
598 ** @release 6.3.0
599 ******************************************************************************/
600
bamSwapEndianData(const AjPSeqBamCore c,int data_len,unsigned char * data)601 static void bamSwapEndianData(const AjPSeqBamCore c, int data_len,
602 unsigned char *data)
603 {
604 unsigned char *s = NULL;
605 ajuint i;
606 ajuint *cigar = NULL;
607 unsigned char type;
608
609 cigar = (ajuint*)(data + c->l_qname);
610
611 s = data + c->n_cigar*4 + c->l_qname + c->l_qseq + (c->l_qseq + 1)/2;
612
613 for(i = 0; i < c->n_cigar; ++i)
614 ajByteRevUint(&cigar[i]);
615
616 while(s < data + data_len)
617 {
618 s += 2; /* skip key */
619 type = toupper(*s); ++s; /* skip type */
620
621 if(type == 'C' || type == 'A')
622 ++s;
623 else if(type == 'S')
624 {
625 ajByteRevShort((ajshort*)s);
626 s += 2;
627 }
628 else if(type == 'I' || type == 'F')
629 {
630 ajByteRevUint((ajuint*)s);
631 s += 4;
632 }
633 else if(type == 'D')
634 {
635 ajByteRevLong((ajlong*)s);
636 s += 8;
637 }
638 else if(type == 'Z' || type == 'H')
639 {
640 while(*s)
641 ++s;
642
643 ++s;
644 }
645 else if (type == 'B')
646 {
647 ajuint n;
648 ajint Bsize = bam_aux_type2size(*s);
649 memcpy(&n, s + 1, 4);
650
651 if (1 == Bsize) {
652 } else if (2 == Bsize) {
653 for (i = 0; i < n; i += 2)
654 ajByteRevShort((ajshort*)s + 5 + i);
655 } else if (4 == Bsize) {
656 for (i = 0; i < n; i += 4)
657 ajByteRevUint((ajuint*)s + 5 + i);
658 }
659 ajByteRevUint((ajuint*)s+1);
660 }
661
662 }
663
664 return;
665 }
666
667
668
669
670 /* @func ajSeqBamRead *********************************************************
671 **
672 ** Read a BAM alignment record of a read(query sequence)
673 **
674 ** The file position indicator must be placed right before an
675 ** alignment. Upon success, this function will set the position
676 ** indicator to the start of the next alignment. This function is not
677 ** affected by the machine endianness.
678 **
679 ** @param [u] fp [AjPSeqBamBgzf] BAM file handler
680 ** @param [u] b [AjPSeqBam] read alignment; all members are updated.
681 ** @return [int] Number of bytes read from the file
682 **
683 ** @release 6.3.0
684 ** @@
685 ******************************************************************************/
686
ajSeqBamRead(AjPSeqBamBgzf fp,AjPSeqBam b)687 int ajSeqBamRead(AjPSeqBamBgzf fp, AjPSeqBam b)
688 {
689 AjPSeqBamCore c = &b->core;
690 ajint block_len;
691 ajint ret;
692 ajint i;
693 ajuint x[8];
694 ajint oldsize;
695
696 if(!bamInitialized)
697 {
698 bamInitialized = ajTrue;
699 bamBigendian = ajUtilGetBigendian();
700 }
701
702 assert(BAM_CORE_SIZE == 32);
703
704 if((ret = ajSeqBamBgzfRead(fp, &block_len, 4)) != 4)
705 {
706 if(ret == 0)
707 return -1; /* normal end-of-file */
708
709 return -2; /* truncated */
710 }
711
712 if(block_len < (ajint) BAM_CORE_SIZE)
713 ajErr("block_len: %d core_size: %d", block_len, BAM_CORE_SIZE);
714
715 if(ajSeqBamBgzfRead(fp, x, BAM_CORE_SIZE) != BAM_CORE_SIZE)
716 return -3;
717
718 if(bamBigendian)
719 {
720 ajByteRevInt(&block_len);
721
722 for(i = 0; i < 8; ++i)
723 ajByteRevUint(x + i);
724 }
725
726 c->tid = x[0];
727 c->pos = x[1];
728 c->bin = x[2]>>16;
729 c->qual = x[2]>>8&0xff;
730 c->l_qname = x[2]&0xff;
731 c->flag = x[3]>>16;
732 c->n_cigar = x[3]&0xffff;
733 c->l_qseq = x[4];
734 c->mtid = x[5];
735 c->mpos = x[6];
736 c->isize = x[7];
737 b->data_len = block_len - BAM_CORE_SIZE;
738
739 if(b->m_data < b->data_len)
740 {
741 oldsize = b->m_data;
742 b->m_data = b->data_len;
743 kroundup32(b->m_data);
744 AJRESIZE0(b->data, oldsize, b->m_data);
745 }
746
747 if(ajSeqBamBgzfRead(fp, b->data, b->data_len) != b->data_len)
748 {
749 ajErr("ajSeqBamBgzfRead len: %d failed", b->data_len);
750 return -4;
751 }
752
753 b->l_aux = b->data_len - c->n_cigar * 4 - c->l_qname -
754 c->l_qseq - (c->l_qseq+1)/2;
755
756 if(bamBigendian)
757 bamSwapEndianData(c, b->data_len, b->data);
758
759 return 4 + block_len;
760 }
761
762
763
764
765 /* @func ajSeqBamWrite ********************************************************
766 **
767 ** Writes BAM alignment data to a BGZ file
768 **
769 ** @param [u] fp [AjPSeqBamBgzf]BAM file handler
770 ** @param [r] b [const AjPSeqBam] alignment to write
771 ** @return [int] number of bytes written to the file
772 **
773 **
774 ** @release 6.3.0
775 ******************************************************************************/
776
ajSeqBamWrite(AjPSeqBamBgzf fp,const AjPSeqBam b)777 int ajSeqBamWrite(AjPSeqBamBgzf fp, const AjPSeqBam b)
778 {
779 const AjPSeqBamCore c;
780 int data_len;
781 unsigned char *data = NULL;
782
783 ajuint x[8];
784 ajuint block_len;
785 ajuint y;
786 int i;
787
788 c = &b->core;
789 data_len = b->data_len;
790 data = b->data;
791 block_len = data_len + BAM_CORE_SIZE;
792
793 if(!bamInitialized)
794 {
795 bamInitialized = ajTrue;
796 bamBigendian = ajUtilGetBigendian();
797 }
798
799 assert(BAM_CORE_SIZE == 32);
800
801 x[0] = c->tid;
802 x[1] = c->pos;
803 x[2] = (ajuint)c->bin<<16 | c->qual<<8 | c->l_qname;
804 x[3] = (ajuint)c->flag<<16 | c->n_cigar;
805 x[4] = c->l_qseq;
806 x[5] = c->mtid;
807 x[6] = c->mpos;
808 x[7] = c->isize;
809
810 if(bamBigendian)
811 {
812 for(i = 0; i < 8; ++i)
813 ajByteRevUint(x + i);
814
815 y = block_len;
816 ajByteRevUint(&y);
817 ajSeqBamBgzfWrite(fp, &y, 4);
818 bamSwapEndianData(c, data_len, data);
819 }
820 else
821 ajSeqBamBgzfWrite(fp, &block_len, 4);
822
823 ajSeqBamBgzfWrite(fp, x, BAM_CORE_SIZE);
824 ajSeqBamBgzfWrite(fp, data, data_len);
825
826 if(bamBigendian)
827 bamSwapEndianData(c, data_len, data);
828
829 return 4 + block_len;
830 }
831
832
833
834
835 /* @func ajSeqBamGetLibrary ***************************************************
836 **
837 ** Return the name of the library from the BAM file header RG tag
838 **
839 ** @param [u] h [AjPSeqBamHeader] BAM header
840 ** @param [r] b [const AjPSeqBam] BAM record
841 ** @return [const char*] Library name
842 **
843 **
844 ** @release 6.3.0
845 ******************************************************************************/
846
ajSeqBamGetLibrary(AjPSeqBamHeader h,const AjPSeqBam b)847 const char* ajSeqBamGetLibrary(AjPSeqBamHeader h, const AjPSeqBam b)
848 {
849 const unsigned char *rg;
850 /*
851 ** FIXME: this should also check the LB tag associated with each alignment
852 */
853 if(h->dict == 0)
854 h->dict = bamHeaderParse2(h->text);
855
856 if(h->rg2lib == 0)
857 h->rg2lib = bamHeaderTotable(h->dict, "RG", "ID", "LB");
858
859 rg = bamGetAux(b, "RG");
860
861 return (rg == 0) ? 0 : ajTableFetchC(h->rg2lib, (const char*)(rg + 1));
862 }
863
864
865
866
867 /* @datastatic BamPCache ******************************************************
868 **
869 ** BAM block cache
870 **
871 ** @alias BamSCache
872 ** @alias BamOCache
873 **
874 ** @attr block [unsigned char*] Block data
875 ** @attr end_offset [ajlong] End offset
876 ** @attr size [int]Block size
877 ** @attr padding [int] To alignment boundary
878 **
879 ******************************************************************************/
880
881 typedef struct BamSCache
882 {
883 unsigned char *block;
884 ajlong end_offset;
885 int size;
886 int padding;
887 } BamOCache;
888
889 #define BamPCache BamOCache*
890
891
892 /* @conststatic BAM variables *************************************************
893 **
894 ** Variables defined for BAM processing
895 **
896 ** @value AJSEQBAM_DEFAULT_BLOCK_SIZE [int] Default block size of 64k
897 ** @value AJSEQBAM_MAX_BLOCK_SIZE [int] Maximum block size also 64k
898 ** @value AJSEQBAM_BLOCK_FOOTER_LENGTH [int] Block footer length
899 ** @value AJSEQBAM_GZIP_ID1 [int] Gzip magic id 1
900 ** @value AJSEQBAM_GZIP_ID2 [int] Gzip magic id 2
901 ** @value AJSEQBAM_CM_DEFLATE [int] CM deflate
902 ** @value AJSEQBAM_FLG_FEXTRA [int] Fextra flag
903 ** @value AJSEQBAM_OS_UNKNOWN [int] Unkown OS value
904 ** @value AJSEQBAM_BGZF_ID1 [int] Bgfz magic id 1 'B'
905 ** @value AJSEQBAM_BGZF_ID2 [int] Bgfz magic id 2 'C'
906 ** @value AJSEQBAM_BGZF_LEN [int] Bgzf length
907 ** @value AJSEQBAM_BGZF_XLEN [int] Bgzf xlength (length plus 4)
908 ** @value AJSEQBAM_GZIP_WINDOW_BITS [int] Gzip window bits (-15 - no zlib header
909 ** @value AJSEQBAM_Z_DEFAULT_MEM_LEVEL [int] Z default memory level
910 ******************************************************************************/
911
912 static const int AJSEQBAM_DEFAULT_BLOCK_SIZE = 64 * 1024;
913 static const int AJSEQBAM_MAX_BLOCK_SIZE = 64 * 1024;
914
915 static const int AJSEQBAM_BLOCK_FOOTER_LENGTH = 8;
916
917 static const int AJSEQBAM_GZIP_ID1 = 31;
918 static const int AJSEQBAM_GZIP_ID2 = 139;
919 static const int AJSEQBAM_CM_DEFLATE = 8;
920 static const int AJSEQBAM_FLG_FEXTRA = 4;
921 static const int AJSEQBAM_OS_UNKNOWN = 255;
922 static const int AJSEQBAM_BGZF_ID1 = 66;
923 static const int AJSEQBAM_BGZF_ID2 = 67;
924 static const int AJSEQBAM_BGZF_LEN = 2;
925 static const int AJSEQBAM_BGZF_XLEN = 6;
926
927 static const int AJSEQBAM_GZIP_WINDOW_BITS = -15;
928 static const int AJSEQBAM_Z_DEFAULT_MEM_LEVEL = 8;
929
930
931
932
933
934 static inline void packInt16(unsigned char* buffer, ajushort value);
935
936 static inline void packInt32(unsigned char* buffer, ajuint value);
937 static inline int unpackInt16(const unsigned char* buffer);
938 static inline int BAMBGZFMIN(int x, int y);
939
940
941
942
943 /* @funcstatic packInt16 *****************************************************
944 **
945 ** Pack a 2-byte integer
946 **
947 ** @param [u] buffer [unsigned char*] Buffer of at least 2 bytes
948 ** @param [r] value [ajushort] Unsigned integer value
949 ** @return [void]
950 ******************************************************************************/
951
packInt16(unsigned char * buffer,ajushort value)952 static inline void packInt16(unsigned char* buffer, ajushort value)
953 {
954 buffer[0] = (unsigned char) value;
955 buffer[1] = value >> 8;
956 }
957
958
959
960
961 /* @funcstatic unpackInt16 ****************************************************
962 **
963 ** Unpack a 2-byte integer
964 **
965 ** @param [r] buffer [const unsigned char*] Buffer of at least 2 bytes
966 ** @return [int] Integer value from first 2 bytes of buffer
967 **
968 ******************************************************************************/
969
unpackInt16(const unsigned char * buffer)970 static inline int unpackInt16(const unsigned char* buffer)
971 {
972 return (buffer[0] | (buffer[1] << 8));
973 }
974
975
976
977
978
979 /* @funcstatic packInt32 *****************************************************
980 **
981 ** Pack a 4-byte integer
982 **
983 ** @param [u] buffer [unsigned char*] Buffer of at least 4 bytes
984 ** @param [r] value [ajuint] Unsigned integer value
985 ** @return [void]
986 ******************************************************************************/
987
packInt32(unsigned char * buffer,ajuint value)988 static inline void packInt32(unsigned char* buffer, ajuint value)
989 {
990 buffer[0] = value;
991 buffer[1] = value >> 8;
992 buffer[2] = value >> 16;
993 buffer[3] = value >> 24;
994 }
995
996
997
998
999
1000 /* @funcstatic BAMBGZFMIN ****************************************************
1001 **
1002 ** Return minimum of two integers
1003 **
1004 ** @param [r] x [int] First integer
1005 ** @param [r] y [int] Second integer
1006 ** @return [int] Lowest integer value
1007 ******************************************************************************/
1008
BAMBGZFMIN(int x,int y)1009 static inline int BAMBGZFMIN(int x, int y)
1010 {
1011 return (x < y) ? x : y;
1012 }
1013
1014
1015
1016
1017
1018 /* @funcstatic bamReportError *************************************************
1019 **
1020 ** Save error message in the BGZ file error attribute
1021 **
1022 ** @param [u] fp [AjPSeqBamBgzf] BGZ file
1023 ** @param [r] message [const char*] message text
1024 ** @return [void]
1025 **
1026 **
1027 ** @release 6.3.0
1028 ******************************************************************************/
1029
bamReportError(AjPSeqBamBgzf fp,const char * message)1030 static void bamReportError(AjPSeqBamBgzf fp, const char* message)
1031 {
1032 ajWarn("++bamReportError '%s'", message);
1033 fp->error = message;
1034
1035 return;
1036 }
1037
1038
1039
1040
1041 /* @funcstatic bamBgzfNew *****************************************************
1042 **
1043 ** Initialize a BGZ file object
1044 **
1045 ** @return [AjPSeqBamBgzf] BGZ file object
1046 **
1047 **
1048 ** @release 6.3.0
1049 ******************************************************************************/
1050
bamBgzfNew(void)1051 static AjPSeqBamBgzf bamBgzfNew(void)
1052 {
1053 AjPSeqBamBgzf fp;
1054
1055 fp = calloc(1, sizeof(AjOSeqBamBgzf));
1056 fp->uncompressed_block_size = AJSEQBAM_MAX_BLOCK_SIZE;
1057 fp->uncompressed_block = malloc(AJSEQBAM_MAX_BLOCK_SIZE);
1058 fp->compressed_block_size = AJSEQBAM_MAX_BLOCK_SIZE;
1059 fp->compressed_block = malloc(AJSEQBAM_MAX_BLOCK_SIZE);
1060 fp->cache_size = 0;
1061 fp->cache = NULL;
1062
1063 return fp;
1064 }
1065
1066
1067
1068
1069 /* @funcstatic bamBgzfOpenfdRead **********************************************
1070 **
1071 ** Open a BGZ file for reading
1072 **
1073 ** @param [r] fd [int] File descriptor of opened file
1074 ** @return [AjPSeqBamBgzf] BGZ file object
1075 **
1076 **
1077 ** @release 6.3.0
1078 ******************************************************************************/
1079
bamBgzfOpenfdRead(int fd)1080 static AjPSeqBamBgzf bamBgzfOpenfdRead(int fd)
1081 {
1082 FILE* file;
1083 AjPSeqBamBgzf fp;
1084
1085 file = ajSysFuncFdopen(fd, "r");
1086
1087 if(file == 0)
1088 return 0;
1089
1090 fp = bamBgzfNew();
1091 fp->file_descriptor = fd;
1092 fp->open_mode = 'r';
1093 fp->file = file;
1094 fp->cache = ajTableNew(512);
1095
1096 fseek(fp->file, 0L, SEEK_SET);
1097
1098 return fp;
1099 }
1100
1101
1102
1103
1104 /* @funcstatic bamBgzfOpenfdWrite *********************************************
1105 **
1106 ** Open a BGZ file for writing
1107 **
1108 ** @param [r] fd [int] File descriptor of opened file
1109 ** @param [r] is_uncompressed [char] If non-zero, file is uncompressed
1110 ** @return [AjPSeqBamBgzf] BGZ file object
1111 **
1112 **
1113 ** @release 6.3.0
1114 ******************************************************************************/
1115
bamBgzfOpenfdWrite(int fd,char is_uncompressed)1116 static AjPSeqBamBgzf bamBgzfOpenfdWrite(int fd, char is_uncompressed)
1117 {
1118 FILE* file;
1119 AjPSeqBamBgzf fp;
1120
1121 file = ajSysFuncFdopen(fd, "w");
1122
1123 if(file == 0)
1124 return 0;
1125
1126 fp = malloc(sizeof(AjOSeqBamBgzf));
1127 fp->file_descriptor = fd;
1128 fp->open_mode = 'w';
1129 fp->owned_file = 0;
1130 fp->is_uncompressed = is_uncompressed;
1131 fp->file = file;
1132 fp->uncompressed_block_size = AJSEQBAM_DEFAULT_BLOCK_SIZE;
1133 fp->uncompressed_block = NULL;
1134 fp->compressed_block_size = AJSEQBAM_MAX_BLOCK_SIZE;
1135 fp->compressed_block = malloc(AJSEQBAM_MAX_BLOCK_SIZE);
1136 fp->block_address = 0;
1137 fp->block_offset = 0;
1138 fp->block_length = 0;
1139 fp->error = NULL;
1140
1141 return fp;
1142 }
1143
1144
1145
1146
1147 /* @func ajSeqBamBgzfOpenC ****************************************************
1148 **
1149 ** Open a named BGZ file for reading or writing
1150 **
1151 ** @param [r] path [const char*] File path
1152 ** @param [r] mode [const char*] File open mode 'r' 'R' 'w' or 'W'
1153 ** @return [AjPSeqBamBgzf] BGZ file object
1154 **
1155 **
1156 ** @release 6.3.0
1157 ******************************************************************************/
1158
ajSeqBamBgzfOpenC(const char * path,const char * mode)1159 AjPSeqBamBgzf ajSeqBamBgzfOpenC(const char* path, const char* mode)
1160 {
1161 AjPSeqBamBgzf fp = NULL;
1162 int fd;
1163 int oflag;
1164
1165 /* The reading mode is preferred. */
1166 if(mode[0] == 'r' || mode[0] == 'R')
1167 {
1168 oflag = O_RDONLY;
1169 #ifdef WIN32
1170 oflag |= O_BINARY;
1171 #endif /* WIN32 */
1172 fd = open(path, oflag);
1173
1174 if(fd == -1)
1175 return 0;
1176
1177 fp = bamBgzfOpenfdRead(fd);
1178 }
1179 else if(mode[0] == 'w' || mode[0] == 'W')
1180 {
1181 oflag = O_WRONLY | O_CREAT | O_TRUNC;
1182 #ifdef WIN32
1183 oflag |= O_BINARY;
1184 #endif /* WIN32 */
1185 fd = open(path, oflag, 0666);
1186
1187 if(fd == -1)
1188 return 0;
1189
1190 fp = bamBgzfOpenfdWrite(fd, strstr(mode, "u")? 1 : 0);
1191 }
1192
1193 if(fp != NULL)
1194 fp->owned_file = 1;
1195
1196 return fp;
1197 }
1198
1199
1200
1201
1202 /* @func ajSeqBamBgzfOpenfd ***************************************************
1203 **
1204 ** Open a BGZip BAM file by file descriptor
1205 **
1206 ** @param [r] fd [int] open file descriptor
1207 ** @param [r] mode [const char*] File open mode 'r' 'R' 'w' or 'W'
1208 ** @return [AjPSeqBamBgzf] BGZip BAM file
1209 **
1210 **
1211 ** @release 6.3.0
1212 ******************************************************************************/
1213
ajSeqBamBgzfOpenfd(int fd,const char * mode)1214 AjPSeqBamBgzf ajSeqBamBgzfOpenfd(int fd, const char * mode)
1215 {
1216 if(fd == -1)
1217 return 0;
1218
1219 if (mode[0] == 'r' || mode[0] == 'R')
1220 return bamBgzfOpenfdRead(fd);
1221 else if (mode[0] == 'w' || mode[0] == 'W')
1222 return bamBgzfOpenfdWrite(fd, strstr(mode, "u")? 1 : 0);
1223
1224 return NULL;
1225 }
1226
1227
1228
1229
1230 /* @func ajSeqBamBgzfSetInfile ************************************************
1231 **
1232 ** Sets data structure to read from an open BGZip BAM file
1233 **
1234 ** @param [u] gzfile [AjPSeqBamBgzf] BGZip BAM file
1235 ** @param [u] inf [AjPFile] Open input file object
1236 ** @return [AjBool] True on success
1237 ** @release 6.5.0
1238 ******************************************************************************/
1239
ajSeqBamBgzfSetInfile(AjPSeqBamBgzf gzfile,AjPFile inf)1240 AjBool ajSeqBamBgzfSetInfile(AjPSeqBamBgzf gzfile, AjPFile inf)
1241 {
1242 if(!inf)
1243 return ajFalse;
1244
1245 gzfile->file_descriptor = fileno(ajFileGetFileptr(inf));
1246 gzfile->open_mode = 'r';
1247 gzfile->owned_file = 0;
1248 gzfile->is_uncompressed = '\0';
1249 gzfile->file = ajFileGetFileptr(inf);
1250 gzfile->uncompressed_block_size = AJSEQBAM_DEFAULT_BLOCK_SIZE;
1251 gzfile->uncompressed_block = NULL;
1252 gzfile->compressed_block_size = AJSEQBAM_MAX_BLOCK_SIZE;
1253 gzfile->compressed_block = malloc(AJSEQBAM_MAX_BLOCK_SIZE);
1254 gzfile->block_address = 0;
1255 gzfile->block_offset = 0;
1256 gzfile->block_length = 0;
1257 gzfile->error = NULL;
1258
1259 return ajTrue;
1260 }
1261
1262
1263
1264
1265 /* @func ajSeqBamBgzfSetOutfile ***********************************************
1266 **
1267 ** Sets data structure to write to an open BGZip BAM file
1268 **
1269 ** @param [u] gzfile [AjPSeqBamBgzf] BGZip BAM file
1270 ** @param [u] outf [AjPFile] Open output file object
1271 ** @return [AjBool] True on success
1272 ** @release 6.5.0
1273 ******************************************************************************/
1274
ajSeqBamBgzfSetOutfile(AjPSeqBamBgzf gzfile,AjPFile outf)1275 AjBool ajSeqBamBgzfSetOutfile(AjPSeqBamBgzf gzfile, AjPFile outf)
1276 {
1277 if(!outf)
1278 return ajFalse;
1279
1280 gzfile->file_descriptor = fileno(ajFileGetFileptr(outf));
1281 gzfile->open_mode = 'w';
1282 gzfile->owned_file = 0;
1283 gzfile->is_uncompressed = '\0';
1284 gzfile->file = ajFileGetFileptr(outf);
1285 gzfile->uncompressed_block_size = AJSEQBAM_DEFAULT_BLOCK_SIZE;
1286 gzfile->uncompressed_block = NULL;
1287 gzfile->compressed_block_size = AJSEQBAM_MAX_BLOCK_SIZE;
1288 gzfile->compressed_block = malloc(AJSEQBAM_MAX_BLOCK_SIZE);
1289 gzfile->block_address = 0;
1290 gzfile->block_offset = 0;
1291 gzfile->block_length = 0;
1292 gzfile->error = NULL;
1293
1294 return ajTrue;
1295 }
1296
1297
1298
1299
1300 /* @func ajSeqBamBgzfNew *****************************************************
1301 **
1302 ** Constructs a BGZip BAM file by file
1303 **
1304 ** @param [u] file [FILE*] open file
1305 ** @param [r] mode [const char*] File open mode 'r' 'R' 'w' or 'W'
1306 ** @return [AjPSeqBamBgzf] BGZip BAM file
1307 **
1308 **
1309 ** @release 6.5.0
1310 ******************************************************************************/
1311
ajSeqBamBgzfNew(FILE * file,const char * mode)1312 AjPSeqBamBgzf ajSeqBamBgzfNew(FILE* file, const char* mode)
1313 {
1314 AjPSeqBamBgzf fp;
1315
1316 if(file == 0)
1317 return 0;
1318
1319 fp = bamBgzfNew();
1320 fp->file_descriptor = fileno(file);
1321
1322 if(mode[0] == 'r' || mode[0] == 'R')
1323 {
1324 fp->open_mode = 'r';
1325 fp->cache = ajTableNew(512);
1326 }
1327 else
1328 fp->open_mode = 'w';
1329
1330 fp->file = file;
1331 fseek(fp->file, 0L, SEEK_SET);
1332
1333 return fp;
1334 }
1335
1336
1337
1338
1339 /* @funcstatic bamDeflateBlock ************************************************
1340 **
1341 ** Deflate an uncompressed block
1342 **
1343 ** @param [u] fp [AjPSeqBamBgzf] BGZ file
1344 ** @param [r] block_length [int] Uncompressed block length
1345 ** @return [int] Length of compressed block
1346 **
1347 **
1348 ** @release 6.3.0
1349 ******************************************************************************/
1350
bamDeflateBlock(AjPSeqBamBgzf fp,int block_length)1351 static int bamDeflateBlock(AjPSeqBamBgzf fp, int block_length)
1352 {
1353 /*
1354 ** Deflate the block in fp->uncompressed_block into fp->compressed_block.
1355 ** Also adds an extra field that stores the compressed block length.
1356 */
1357
1358 char *buffer = NULL;
1359 int buffer_size;
1360 int input_length;
1361 int compressed_length;
1362 int compress_level;
1363 int status;
1364 z_stream zs;
1365 ajuint crc;
1366 int remaining;
1367
1368 buffer = fp->compressed_block;
1369 buffer_size = fp->compressed_block_size;
1370
1371 /* Init gzip header */
1372 buffer[0] = AJSEQBAM_GZIP_ID1;
1373 buffer[1] = AJSEQBAM_GZIP_ID2;
1374 buffer[2] = AJSEQBAM_CM_DEFLATE;
1375 buffer[3] = AJSEQBAM_FLG_FEXTRA;
1376 buffer[4] = 0; /* mtime */
1377 buffer[5] = 0;
1378 buffer[6] = 0;
1379 buffer[7] = 0;
1380 buffer[8] = 0;
1381 buffer[9] = AJSEQBAM_OS_UNKNOWN;
1382 buffer[10] = AJSEQBAM_BGZF_XLEN;
1383 buffer[11] = 0;
1384 buffer[12] = AJSEQBAM_BGZF_ID1;
1385 buffer[13] = AJSEQBAM_BGZF_ID2;
1386 buffer[14] = AJSEQBAM_BGZF_LEN;
1387 buffer[15] = 0;
1388 buffer[16] = 0; /* placeholder for block length */
1389 buffer[17] = 0;
1390
1391 /* loop to retry for blocks that do not compress enough */
1392 input_length = block_length;
1393 compressed_length = 0;
1394
1395 while(1)
1396 {
1397 compress_level = fp->is_uncompressed ? Z_NO_COMPRESSION :
1398 Z_DEFAULT_COMPRESSION;
1399 zs.zalloc = NULL;
1400 zs.zfree = NULL;
1401 zs.next_in = fp->uncompressed_block;
1402 zs.avail_in = input_length;
1403 zs.next_out = (void*)&buffer[AJSEQBAM_BLOCK_HEADER_LENGTH];
1404 zs.avail_out = buffer_size - AJSEQBAM_BLOCK_HEADER_LENGTH -
1405 AJSEQBAM_BLOCK_FOOTER_LENGTH;
1406
1407 status = deflateInit2(&zs, compress_level, Z_DEFLATED,
1408 AJSEQBAM_GZIP_WINDOW_BITS,
1409 AJSEQBAM_Z_DEFAULT_MEM_LEVEL,
1410 Z_DEFAULT_STRATEGY);
1411
1412 if(status != Z_OK)
1413 {
1414 bamReportError(fp, "deflate init failed");
1415 return -1;
1416 }
1417
1418 status = deflate(&zs, Z_FINISH);
1419
1420 if(status != Z_STREAM_END)
1421 {
1422 deflateEnd(&zs);
1423
1424 if(status == Z_OK)
1425 {
1426 /*
1427 ** Not enough space in buffer.
1428 ** Can happen in the rare case the input doesn't
1429 ** compress enough.
1430 ** Reduce the amount of input until it fits.
1431 */
1432 input_length -= 1024;
1433
1434 if(input_length <= 0)
1435 {
1436 /* should never happen */
1437 bamReportError(fp, "input reduction failed");
1438 return -1;
1439 }
1440
1441 continue;
1442 }
1443
1444 bamReportError(fp, "deflate failed");
1445 return -1;
1446 }
1447
1448 status = deflateEnd(&zs);
1449
1450 if(status != Z_OK)
1451 {
1452 bamReportError(fp, "deflate end failed");
1453 return -1;
1454 }
1455
1456 compressed_length = zs.total_out;
1457 compressed_length += AJSEQBAM_BLOCK_HEADER_LENGTH +
1458 AJSEQBAM_BLOCK_FOOTER_LENGTH;
1459
1460 if(compressed_length > AJSEQBAM_MAX_BLOCK_SIZE)
1461 {
1462 /* should never happen */
1463 bamReportError(fp, "deflate overflow");
1464 return -1;
1465 }
1466
1467 break;
1468 }
1469
1470 packInt16((unsigned char*)&buffer[16], compressed_length-1);
1471 crc = crc32(0L, NULL, 0L);
1472 crc = crc32(crc, fp->uncompressed_block, input_length);
1473 packInt32((unsigned char*)&buffer[compressed_length-8], crc);
1474 packInt32((unsigned char*)&buffer[compressed_length-4], input_length);
1475
1476 remaining = block_length - input_length;
1477
1478 if(remaining > 0)
1479 {
1480 if(remaining > input_length)
1481 {
1482 /* should never happen (check so we can use memcpy) */
1483 bamReportError(fp, "remainder too large");
1484 return -1;
1485 }
1486
1487 memcpy((unsigned char*)fp->uncompressed_block,
1488 (unsigned char*)fp->uncompressed_block + input_length,
1489 remaining);
1490 }
1491
1492 fp->block_offset = remaining;
1493
1494 return compressed_length;
1495 }
1496
1497
1498
1499
1500 /* @funcstatic bamInflateBlock ************************************************
1501 **
1502 ** Uncompress a compressed block
1503 **
1504 ** @param [u] fp [AjPSeqBamBgzf] BGZ file object
1505 ** @param [r] block_length [int] Compressed block length
1506 ** @return [int] Uncompressed block length
1507 **
1508 **
1509 ** @release 6.3.0
1510 ******************************************************************************/
1511
bamInflateBlock(AjPSeqBamBgzf fp,int block_length)1512 static int bamInflateBlock(AjPSeqBamBgzf fp, int block_length)
1513 {
1514 /* Inflate the block in fp->compressed_block into fp->uncompressed_block */
1515 int status;
1516 z_stream zs;
1517
1518 zs.zalloc = NULL;
1519 zs.zfree = NULL;
1520 zs.next_in = (unsigned char*)fp->compressed_block + 18;
1521 zs.avail_in = block_length - 16;
1522 zs.next_out = fp->uncompressed_block;
1523 zs.avail_out = fp->uncompressed_block_size;
1524
1525 status = inflateInit2(&zs, AJSEQBAM_GZIP_WINDOW_BITS);
1526
1527 if(status != Z_OK)
1528 {
1529 bamReportError(fp, "inflate init failed");
1530 return -1;
1531 }
1532
1533 status = inflate(&zs, Z_FINISH);
1534
1535 if(status != Z_STREAM_END)
1536 {
1537 inflateEnd(&zs);
1538 bamReportError(fp, "inflate failed");
1539 return -1;
1540 }
1541
1542 status = inflateEnd(&zs);
1543
1544 if(status != Z_OK)
1545 {
1546 bamReportError(fp, "inflate failed");
1547 return -1;
1548 }
1549
1550 return zs.total_out;
1551 }
1552
1553
1554
1555
1556 /* @funcstatic bamHeaderCheck *************************************************
1557 **
1558 ** Check a BAM file header
1559 **
1560 ** @param [r] header [const char*] Header buffer
1561 ** @return [int] 0 on success
1562 **
1563 **
1564 ** @release 6.3.0
1565 ******************************************************************************/
1566
bamHeaderCheck(const char * header)1567 static int bamHeaderCheck(const char *header)
1568 {
1569 return (header[0] == AJSEQBAM_GZIP_ID1 &&
1570 header[1] == (char) AJSEQBAM_GZIP_ID2 &&
1571 header[2] == Z_DEFLATED &&
1572 (header[3] & AJSEQBAM_FLG_FEXTRA) != 0 &&
1573 unpackInt16((const unsigned char*)&header[10]) ==
1574 AJSEQBAM_BGZF_XLEN &&
1575 header[12] == AJSEQBAM_BGZF_ID1 &&
1576 header[13] == AJSEQBAM_BGZF_ID2 &&
1577 unpackInt16((const unsigned char*)&header[14]) ==
1578 AJSEQBAM_BGZF_LEN);
1579 }
1580
1581
1582
1583
1584 /* @funcstatic bamCacheFree ***************************************************
1585 **
1586 ** Free BAM cache data
1587 **
1588 ** @param [u] fp [AjPSeqBamBgzf] BGZ file object
1589 ** @return [void]
1590 **
1591 **
1592 ** @release 6.3.0
1593 ******************************************************************************/
1594
bamCacheFree(AjPSeqBamBgzf fp)1595 static void bamCacheFree(AjPSeqBamBgzf fp)
1596 {
1597
1598 /* if fp->open_mode is not 'r' then fp->cache is expected to be null */
1599
1600 if(!fp->cache)
1601 return;
1602
1603 ajTableDel(&fp->cache);
1604
1605 return;
1606 }
1607
1608
1609
1610
1611 /* @funcstatic bamCacheLoadBlock **********************************************
1612 **
1613 ** Load a block from cache
1614 **
1615 ** @param [u] fp [AjPSeqBamBgzf] BGZ file object
1616 ** @param [r] block_address [ajlong] Block offset in file
1617 ** @return [int] Size of block
1618 **
1619 **
1620 ** @release 6.3.0
1621 ******************************************************************************/
1622
bamCacheLoadBlock(AjPSeqBamBgzf fp,ajlong block_address)1623 static int bamCacheLoadBlock(AjPSeqBamBgzf fp, ajlong block_address)
1624 {
1625 const BamPCache p;
1626
1627 p = ajTableFetchV(fp->cache, &block_address);
1628
1629 if(!p)
1630 return 0;
1631
1632 if(fp->block_length != 0)
1633 fp->block_offset = 0;
1634
1635 fp->block_address = block_address;
1636 fp->block_length = p->size;
1637 memcpy(fp->uncompressed_block, p->block, AJSEQBAM_MAX_BLOCK_SIZE);
1638 fseek(fp->file, p->end_offset, SEEK_SET);
1639
1640 return p->size;
1641 }
1642
1643
1644
1645
1646 /* @funcstatic bamCacheBlock **************************************************
1647 **
1648 ** add a block to the cache
1649 ** remove an old block if it exceeds maximum
1650 **
1651 ** @param [u] fp [AjPSeqBamBgzf] BGZ file
1652 ** @param [r] size [int] Cache size of data from current position
1653 ** @return [void]
1654 **
1655 **
1656 ** @release 6.3.0
1657 ******************************************************************************/
1658
bamCacheBlock(AjPSeqBamBgzf fp,int size)1659 static void bamCacheBlock(AjPSeqBamBgzf fp, int size)
1660 {
1661 BamPCache p;
1662 BamPCache oldp;
1663
1664 if(AJSEQBAM_MAX_BLOCK_SIZE >= fp->cache_size)
1665 return;
1666
1667 if((ajint) ajTableGetLength(fp->cache) * AJSEQBAM_MAX_BLOCK_SIZE >
1668 fp->cache_size)
1669 {
1670 /*
1671 ** A better way would be to remove the oldest block in the
1672 ** cache, but here a random one is removed for simplicity. This
1673 ** should not have a big impact on performance.
1674 */
1675
1676 /* some way to remove a value - or half the values - from the table */
1677 }
1678
1679 AJNEW0(p);
1680 p->size = fp->block_length;
1681 p->end_offset = fp->block_address + size;
1682 p->block = malloc(AJSEQBAM_MAX_BLOCK_SIZE);
1683 memcpy(p->block, fp->uncompressed_block, AJSEQBAM_MAX_BLOCK_SIZE);
1684 oldp = ajTablePut(fp->cache, &fp->block_address, p);
1685
1686 if(oldp)
1687 {
1688 AJFREE(oldp->block);
1689 AJFREE(oldp);
1690 }
1691
1692 return;
1693 }
1694
1695
1696
1697
1698 /* @funcstatic bamReadBlock ***************************************************
1699 **
1700 ** Read the next block of data from a BGZ file
1701 **
1702 ** @param [u] fp [AjPSeqBamBgzf] BGZ file handler
1703 ** @return [int] Zeo on success
1704 **
1705 **
1706 ** @release 6.3.0
1707 ******************************************************************************/
1708
bamReadBlock(AjPSeqBamBgzf fp)1709 static int bamReadBlock(AjPSeqBamBgzf fp)
1710 {
1711 char header[AJSEQBAM_BLOCK_HEADER_LENGTH];
1712 int size = 0;
1713 ajlong block_address;
1714 int count;
1715 int block_length;
1716 char *compressed_block;
1717 int remaining;
1718
1719 block_address = ftell(fp->file);
1720
1721 if(bamCacheLoadBlock(fp, block_address))
1722 return 0;
1723
1724 count = fread(header, 1, sizeof(header), fp->file);
1725
1726 if(count == 0)
1727 {
1728 fp->block_length = 0;
1729 return 0;
1730 }
1731
1732 size = count;
1733
1734 if ((unsigned int) count != sizeof(header))
1735 {
1736 bamReportError(fp, "read failed");
1737 return -1;
1738 }
1739
1740 if(!bamHeaderCheck(header))
1741 {
1742 /*bamReportError(fp, "invalid block header");*/
1743 return -1;
1744 }
1745
1746 block_length = unpackInt16((unsigned char*)&header[16]) + 1;
1747 compressed_block = (char*) fp->compressed_block;
1748 memcpy(compressed_block, header, AJSEQBAM_BLOCK_HEADER_LENGTH);
1749 remaining = block_length - AJSEQBAM_BLOCK_HEADER_LENGTH;
1750 count = fread(&compressed_block[AJSEQBAM_BLOCK_HEADER_LENGTH], 1,
1751 remaining, fp->file);
1752
1753 if(count != remaining)
1754 {
1755 bamReportError(fp, "read failed");
1756 return -1;
1757 }
1758
1759 size += count;
1760 count = bamInflateBlock(fp, block_length);
1761
1762 if(count < 0)
1763 return -1;
1764
1765 if(fp->block_length != 0)
1766 {
1767 /* Do not reset offset if this read follows a seek. */
1768 fp->block_offset = 0;
1769 }
1770
1771 fp->block_address = block_address;
1772 fp->block_length = count;
1773 bamCacheBlock(fp, size);
1774
1775 return 0;
1776 }
1777
1778
1779
1780
1781 /* @func ajSeqBamBgzfRead *****************************************************
1782 **
1783 ** Read a BAM file data record of a given length
1784 **
1785 ** @param [u] fp [AjPSeqBamBgzf] BGZ file object
1786 ** @param [u] data [void*] Buffer
1787 ** @param [r] length [int] length of buffer
1788 ** @return [int] Number of bytes read
1789 ** -1 on error
1790 **
1791 ** @release 6.3.0
1792 ******************************************************************************/
1793
ajSeqBamBgzfRead(AjPSeqBamBgzf fp,void * data,int length)1794 int ajSeqBamBgzfRead(AjPSeqBamBgzf fp, void *data, int length)
1795 {
1796 int bytes_read;
1797 char *output;
1798 int available;
1799 int copy_length;
1800 char *buffer;
1801
1802 if(length <= 0)
1803 return 0;
1804
1805 if(fp->open_mode != 'r')
1806 {
1807 bamReportError(fp, "file not open for reading");
1808 return -1;
1809 }
1810
1811 bytes_read = 0;
1812 output = data;
1813
1814 while(bytes_read < length)
1815 {
1816 available = fp->block_length - fp->block_offset;
1817
1818 if(available <= 0)
1819 {
1820 if(bamReadBlock(fp) != 0)
1821 {
1822 ajDebug("bamReadBlock failed\n");
1823 return -1;
1824 }
1825
1826 available = fp->block_length - fp->block_offset;
1827
1828 if(available <= 0)
1829 break;
1830 }
1831
1832 copy_length = BAMBGZFMIN(length-bytes_read, available);
1833 buffer = fp->uncompressed_block;
1834 memcpy(output, buffer + fp->block_offset, copy_length);
1835 fp->block_offset += copy_length;
1836 output += copy_length;
1837 bytes_read += copy_length;
1838 }
1839
1840 if(fp->block_offset == fp->block_length)
1841 {
1842 fp->block_address = ftell(fp->file);
1843 fp->block_offset = 0;
1844 fp->block_length = 0;
1845 }
1846
1847 return bytes_read;
1848 }
1849
1850
1851
1852
1853 /* @func ajSeqBamBgzfFlush ****************************************************
1854 **
1855 ** Flush block to output file
1856 **
1857 ** @param [u] fp [AjPSeqBamBgzf] Output file
1858 ** @return [int] 0 on success, -1 on failure
1859 **
1860 ** @release 6.3.0
1861 ******************************************************************************/
1862
ajSeqBamBgzfFlush(AjPSeqBamBgzf fp)1863 int ajSeqBamBgzfFlush(AjPSeqBamBgzf fp)
1864 {
1865 int count;
1866 int block_length;
1867
1868 while (fp->block_offset > 0)
1869 {
1870 block_length = bamDeflateBlock(fp, fp->block_offset);
1871
1872 if (block_length < 0)
1873 {
1874 return -1;
1875 }
1876
1877 count = fwrite(fp->compressed_block, 1, block_length, fp->file);
1878
1879 if (count != block_length)
1880 {
1881 bamReportError(fp, "write failed");
1882 return -1;
1883 }
1884
1885 fp->block_address += block_length;
1886 }
1887
1888 return 0;
1889 }
1890
1891
1892
1893
1894 /* @func ajSeqBamBgzfWrite ****************************************************
1895 **
1896 ** Write length bytes from data to the file.
1897 **
1898 ** @param [u] fp [AjPSeqBamBgzf] BAM file
1899 ** @param [r] data [const void*] data buffer
1900 ** @param [r] length [int] Length of data buffer to be written
1901 ** @return [int] Returns the number of bytes written.
1902 ** Returns -1 on error.
1903 **
1904 ** @release 6.3.0
1905 ******************************************************************************/
1906
ajSeqBamBgzfWrite(AjPSeqBamBgzf fp,const void * data,int length)1907 int ajSeqBamBgzfWrite(AjPSeqBamBgzf fp, const void* data, int length)
1908 {
1909 const char *input;
1910 int block_length;
1911 int bytes_written;
1912 int copy_length;
1913 char *buffer;
1914
1915 if(fp->open_mode != 'w')
1916 {
1917 bamReportError(fp, "file not open for writing");
1918 return -1;
1919 }
1920
1921 if(fp->uncompressed_block == NULL)
1922 fp->uncompressed_block = malloc(fp->uncompressed_block_size);
1923
1924 input = data;
1925 block_length = fp->uncompressed_block_size;
1926 bytes_written = 0;
1927
1928 while(bytes_written < length)
1929 {
1930 copy_length = BAMBGZFMIN(block_length - fp->block_offset,
1931 length - bytes_written);
1932
1933 buffer = fp->uncompressed_block;
1934 memcpy(buffer + fp->block_offset, input, copy_length);
1935 fp->block_offset += copy_length;
1936 input += copy_length;
1937 bytes_written += copy_length;
1938
1939 if(fp->block_offset == block_length)
1940 {
1941 if(ajSeqBamBgzfFlush(fp) != 0)
1942 break;
1943 }
1944 }
1945
1946 return bytes_written;
1947 }
1948
1949
1950
1951
1952 /* @func ajSeqBamBgzfClose ****************************************************
1953 **
1954 ** Close a BGZ file and free all associated resources.
1955 **
1956 ** Does not close the underlying file descriptor if created with bgzf_fdopen.
1957 **
1958 ** @param [u] fp [AjPSeqBamBgzf] BGZ file
1959 ** @return [int] Returns zero on success, -1 on error.
1960 **
1961 **
1962 ** @release 6.3.0
1963 ******************************************************************************/
1964
ajSeqBamBgzfClose(AjPSeqBamBgzf fp)1965 int ajSeqBamBgzfClose(AjPSeqBamBgzf fp)
1966 {
1967 int block_length;
1968
1969 if(!fp) return 0;
1970
1971 if(fp->open_mode == 'w')
1972 {
1973 if(ajSeqBamBgzfFlush(fp) != 0)
1974 return -1;
1975 /* add an empty last block */
1976 block_length = bamDeflateBlock(fp, 0);
1977 fwrite(fp->compressed_block, 1, block_length, fp->file);
1978
1979 if(fflush(fp->file) != 0)
1980 {
1981 bamReportError(fp, "flush failed");
1982 return -1;
1983 }
1984 }
1985
1986 if(fp->owned_file)
1987 {
1988 if(fclose(fp->file) != 0)
1989 return -1;
1990 }
1991
1992 free(fp->uncompressed_block);
1993 free(fp->compressed_block);
1994 bamCacheFree(fp);
1995 free(fp);
1996
1997 return 0;
1998 }
1999
2000
2001
2002
2003 /* @func ajSeqBamBgzfEof ******************************************************
2004 **
2005 ** Check for end of file marker for a BGzip BAM file
2006 **
2007 ** @param [u] fp [AjPSeqBamBgzf] Undocumented
2008 ** @return [int] 1 if we are at the end of file mark
2009 ** 0 if we are higher in the file
2010 ** -1 for an error
2011 **
2012 **
2013 ** @release 6.3.0
2014 ******************************************************************************/
2015
ajSeqBamBgzfEof(AjPSeqBamBgzf fp)2016 int ajSeqBamBgzfEof(AjPSeqBamBgzf fp)
2017 {
2018 static unsigned char magic[28] = "\037\213\010\4\0\0\0\0\0\377\6\0"
2019 "\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0";
2020 unsigned char buf[28];
2021 ajint offset;
2022 int status;
2023
2024 offset = ftell(fp->file);
2025
2026 status = fseek(fp->file, -28, SEEK_END);
2027 if(status != 0)
2028 {
2029 ajDebug("seqBamBgzfEof seek end-28 failed errno %d: %s\n",
2030 errno, strerror(errno));
2031 return -1;
2032 }
2033
2034 /* Ubuntu warns if retval of fread is not tested */
2035 if(!fread(buf, 1, 28, fp->file))
2036 {
2037 ajDebug("seqBamBgzfEof fread failed errno %d: %s\n",
2038 errno, strerror(errno));
2039 }
2040
2041 status = fseek(fp->file, offset, SEEK_SET); /* back where we started */
2042 if(status != 0)
2043 {
2044 ajDebug("seqBamBgzfEof seek %d failed errno %d: %s\n",
2045 offset, errno, strerror(errno));
2046 return -1;
2047 }
2048
2049 return (memcmp(magic, buf, 28) == 0) ? 1 : 0;
2050 }
2051
2052
2053
2054
2055 /* @func ajSeqBamHeaderRead ***************************************************
2056 **
2057 ** Read header section of BAM files
2058 **
2059 ** @param [u] gzfile [AjPSeqBamBgzf] BAM file handler
2060 ** @return [AjPSeqBamHeader] header object
2061 **
2062 ** @release 6.5.0
2063 ** @@
2064 ******************************************************************************/
2065
ajSeqBamHeaderRead(AjPSeqBamBgzf gzfile)2066 AjPSeqBamHeader ajSeqBamHeaderRead(AjPSeqBamBgzf gzfile)
2067 {
2068 AjPSeqBamHeader header=NULL;
2069 char bambuf [4] = " ";
2070 char* targetname;
2071 ajint status;
2072 ajuint i;
2073 ajuint ilen;
2074 ajuint ntargets;
2075 ajuint targetlen;
2076 ajuint namelen = 0;
2077
2078 static AjBool bigendian = ajFalse;
2079
2080 bigendian = ajUtilGetBigendian();
2081
2082 status = ajSeqBamBgzfEof(gzfile);
2083 ajDebug("called ajSeqBamBgzfEof status %d errno %d ESPIPE %d\n",
2084 status, errno, ESPIPE);
2085
2086 if (status < 0 && (errno == ESPIPE))
2087 {
2088 /*
2089 ** If the file is a pipe, checking the EOF marker will *always* fail
2090 ** with ESPIPE. Suppress the error message in this case.
2091 */
2092 }
2093 else if (status < 0) /* allow 0 - older BAM files have no EOF block */
2094 {
2095 /* failed and not a PIPE */
2096 return NULL;
2097 }
2098
2099 status = ajSeqBamBgzfRead(gzfile, bambuf, 4);
2100
2101 ajDebug("called ajSeqBamBgzfRead status %d\n", status);
2102
2103 if(status < 0)
2104 {
2105 return NULL;
2106 }
2107
2108 if (strncmp(bambuf, "BAM\001", 4))
2109 {
2110 return NULL;
2111 }
2112
2113 /* BAM header */
2114
2115 /* read plain text and the number of reference sequences */
2116 status = ajSeqBamBgzfRead(gzfile, &ilen, 4);
2117
2118 if(status != 4)
2119 ajErr("seqReadBam failed to read reference length");
2120
2121 if(bigendian)
2122 ajByteRevUint(&ilen);
2123
2124 header = ajSeqBamHeaderNew();
2125
2126 header->text = ajCharNewRes(ilen+1);
2127 ajSeqBamBgzfRead(gzfile, header->text, ilen);
2128 ajDebug("header plain text %u '%s'\n", ilen, header->text);
2129
2130 header->dict = bamHeaderParse2(header->text);
2131 header->rg2lib = bamHeaderTotable(header->dict, "RG", "ID", "LB");
2132
2133 ajSeqBamBgzfRead(gzfile, &ntargets, 4);
2134
2135 if(bigendian)
2136 ajByteRevUint(&ntargets);
2137
2138 ajDebug("bam_header_read: # bam targets = %u\n", ntargets);
2139 header->n_targets = ntargets;
2140
2141 AJCNEW0(header->target_name, header->n_targets);
2142 AJCNEW0(header->target_len, header->n_targets);
2143
2144 /* read reference sequence names and lengths */
2145
2146 for(i=0; i < ntargets; i++)
2147 {
2148 ajSeqBamBgzfRead(gzfile, &namelen, 4);
2149
2150 if(bigendian)
2151 ajByteRevUint(&namelen);
2152
2153 targetname = ajCharNewRes(namelen+1);
2154 ajSeqBamBgzfRead(gzfile, targetname, namelen);
2155 ajSeqBamBgzfRead(gzfile, &targetlen, 4);
2156
2157 if(bigendian)
2158 ajByteRevUint(&targetlen);
2159
2160 header->target_name[i] = targetname;
2161 header->target_len[i] = targetlen;
2162
2163 ajDebug("bam target[%u] %u '%s'\n", i, targetlen, targetname);
2164 }
2165
2166 return header;
2167
2168 }
2169
2170
2171
2172
2173 /* @funcstatic bamDestroyHeaderHash *******************************************
2174 **
2175 ** Free the hash table in a BAM header object
2176 **
2177 ** @param [u] header [AjPSeqBamHeader] Header object
2178 ** @return [void]
2179 **
2180 **
2181 ** @release 6.3.0
2182 ******************************************************************************/
2183
bamDestroyHeaderHash(AjPSeqBamHeader header)2184 static void bamDestroyHeaderHash(AjPSeqBamHeader header)
2185 {
2186 if(header->hash)
2187 ajTableFree(&header->hash);
2188
2189 return;
2190 }
2191
2192
2193
2194
2195 /* @funcstatic bamHeaderFree **************************************************
2196 **
2197 ** Destructor for a BAM header list
2198 **
2199 ** @param [d] header [AjPList*] Header list object
2200 ** @return [void]
2201 **
2202 **
2203 ** @release 6.3.0
2204 ******************************************************************************/
2205
bamHeaderFree(AjPList * header)2206 static void bamHeaderFree(AjPList *header)
2207 {
2208 BamPHeaderLine hline = NULL;
2209
2210 while(ajListGetLength(*header))
2211 {
2212 ajListPop(*header, (void**)&hline);
2213
2214 if(hline)
2215 bamHeaderLineFree(&hline);
2216 }
2217
2218 ajListFree(header);
2219
2220 return;
2221 }
2222
2223
2224
2225
2226 /* @funcstatic bamHeaderLineFree **********************************************
2227 **
2228 ** Free a BAM header line
2229 **
2230 ** @param [d] Phline [BamPHeaderLine*] Header line object
2231 ** @return [void]
2232 **
2233 **
2234 ** @release 6.3.0
2235 ******************************************************************************/
2236
bamHeaderLineFree(BamPHeaderLine * Phline)2237 static void bamHeaderLineFree(BamPHeaderLine *Phline)
2238 {
2239 BamPHeaderLine hline = *Phline;
2240 AjPList tags = hline->tags;
2241 BamPHeaderTag htag = NULL;
2242
2243 while(ajListGetLength(tags))
2244 {
2245 ajListPop(tags, (void**)&htag);
2246 AJFREE(htag->value);
2247 AJFREE(htag);
2248 }
2249
2250 ajListFree(&tags);
2251 AJFREE(*Phline);
2252
2253 return;
2254 }
2255
2256
2257
2258
2259 /* @funcstatic bamHeaderParse2 ************************************************
2260 **
2261 ** Parse BAM file header text
2262 **
2263 ** @param [r] headerText [const char*] Header text
2264 ** @return [AjPList] list of header records
2265 **
2266 **
2267 ** @release 6.3.0
2268 ******************************************************************************/
2269
bamHeaderParse2(const char * headerText)2270 static AjPList bamHeaderParse2(const char *headerText)
2271 {
2272 AjPList hlines = NULL;
2273 BamPHeaderLine hline;
2274 const char *text;
2275 char *buf=NULL;
2276 size_t nbuf = 0;
2277
2278 if(!headerText)
2279 return 0;
2280
2281 text = headerText;
2282 hlines = ajListNew();
2283
2284 while((text=bamNextline(&buf, &nbuf, text)))
2285 {
2286 hline = bamHeaderLineParse(buf);
2287
2288 if(hline && bamHeaderLineValidate(hline))
2289 ajListPushAppend(hlines, hline);
2290 else
2291 {
2292 if(hline)
2293 bamHeaderLineFree(&hline);
2294
2295 bamHeaderFree(&hlines);
2296 AJFREE(buf);
2297
2298 return NULL;
2299 }
2300 }
2301
2302 AJFREE(buf);
2303
2304 return hlines;
2305 }
2306
2307
2308
2309
2310 /* @funcstatic bamHeaderTotable ***********************************************
2311 **
2312 ** Extract table data from a list of BAM header records
2313 **
2314 ** @param [r] dict [const AjPList] List of header records
2315 ** @param [r] type [const char[2]] Type
2316 ** @param [r] key_tag [const char[2]] Key tag
2317 ** @param [r] value_tag [const char[2]] Value tag
2318 ** @return [AjPTable] Table of BAM header tags and values
2319 **
2320 **
2321 ** @release 6.3.0
2322 ******************************************************************************/
2323
bamHeaderTotable(const AjPList dict,const char type[2],const char key_tag[2],const char value_tag[2])2324 static AjPTable bamHeaderTotable(const AjPList dict, const char type[2],
2325 const char key_tag[2],
2326 const char value_tag[2])
2327 {
2328 AjPTable tbl = NULL;
2329 BamPHeaderLine hline = NULL;
2330 AjIList k;
2331 BamPHeaderTag key;
2332 BamPHeaderTag value;
2333
2334 tbl = ajTablecharNew(512);
2335
2336 if(dict == 0) /* return an empty (not null) hash table */
2337 return tbl;
2338
2339 k = ajListIterNewread(dict);
2340
2341 while(!ajListIterDone(k))
2342 {
2343 hline = ajListIterGet(k);
2344
2345 if(hline->type[0]!=type[0] || hline->type[1]!=type[1])
2346 continue;
2347
2348 key = bamHeaderLineHasTag(hline,key_tag);
2349 value = bamHeaderLineHasTag(hline,value_tag);
2350
2351 if(!key || !value)
2352 continue;
2353
2354 if(ajTablePut(tbl, key->value, value->value))
2355 ajWarn("[bamHeaderTotable] The key %s not unique.\n",
2356 key->value);
2357 }
2358
2359 ajListIterDel(&k);
2360
2361 return tbl;
2362 }
2363
2364
2365
2366
2367 /* @funcstatic bamNextline ****************************************************
2368 **
2369 ** Mimics the behaviour of getline, except it returns pointer to the
2370 ** next chunk of the text or NULL if everything has been read.
2371 **
2372 ** The lineptr should be freed by the caller. The newline character is
2373 ** stripped.
2374 **
2375 ** @param [u] lineptr [char**] Buffer
2376 ** @param [u] n [size_t*] Buffer current size, can be extended
2377 ** @param [r] text [const char*] Text starting at current position
2378 ** @return [const char*] Next chunk of text, NULL when all is done.
2379 **
2380 **
2381 ** @release 6.3.0
2382 ******************************************************************************/
2383
bamNextline(char ** lineptr,size_t * n,const char * text)2384 static const char* bamNextline(char **lineptr, size_t *n, const char *text)
2385 {
2386 ajuint len;
2387 const char *to = NULL;
2388
2389 to = text;
2390
2391 if(!*to)
2392 return NULL;
2393
2394 while(*to && *to!='\n' && *to!='\r')
2395 to++;
2396
2397 len = to - text + 1;
2398
2399 if(*to)
2400 {
2401 /* Advance the pointer for the next call */
2402 if(*to=='\n')
2403 to++;
2404 else if(*to=='\r' && *(to+1)=='\n')
2405 to+=2;
2406 }
2407
2408 if(!len)
2409 return to;
2410
2411 if(!*lineptr)
2412 {
2413 *lineptr = malloc(len);
2414 *n = len;
2415 }
2416 else if(*n<len)
2417 {
2418 *lineptr = realloc(*lineptr, len);
2419 *n = len;
2420 }
2421
2422 if(!*lineptr)
2423 ajDie("[bamNextline] Insufficient memory!\n");
2424
2425 memcpy(*lineptr,text,len);
2426 (*lineptr)[len-1] = 0;
2427
2428 return to;
2429 }
2430
2431
2432
2433
2434 /* @func ajSeqBamHeaderLineParse **********************************************
2435 **
2436 ** Parse a header line and creates a table with tag-value pairs
2437 ** in the header line
2438 **
2439 ** @param [r] headerLine [const char*] Header line
2440 ** @return [AjPTable] Table with tag-value pairs
2441 **
2442 **
2443 ** @release 6.3.0
2444 ******************************************************************************/
2445
ajSeqBamHeaderLineParse(const char * headerLine)2446 AjPTable ajSeqBamHeaderLineParse(const char *headerLine)
2447 {
2448 BamPHeaderLine hline=NULL;
2449 AjPTable ret=NULL;
2450
2451 hline = bamHeaderLineParse(headerLine);
2452
2453 if(!hline)
2454 return NULL;
2455
2456 bamHeaderLineValidate(hline);
2457
2458 ret = bamHeaderLineToTablemap(hline);
2459
2460 if(!ret)
2461 return NULL;
2462
2463 ajTablePut(ret,
2464 ajCharNewC("type"),
2465 ajCharNewResLenC(hline->type,3,2));
2466
2467 bamHeaderLineFree(&hline);
2468
2469 return ret;
2470 }
2471
2472
2473
2474
2475 /* @funcstatic bamHeaderLineToTablemap ****************************************
2476 **
2477 ** Looks for a tag key and returns the tags and values in a
2478 ** table
2479 **
2480 ** @param [r] hline [const BamPHeaderLine] Header line
2481 ** @return [AjPTable] Table of tags and values
2482 **
2483 **
2484 ** @release 6.5.0
2485 ******************************************************************************/
2486
bamHeaderLineToTablemap(const BamPHeaderLine hline)2487 static AjPTable bamHeaderLineToTablemap(const BamPHeaderLine hline)
2488 {
2489 AjPList tags = hline->tags;
2490 BamPHeaderTag tag = NULL;
2491 AjPTable hlinemap = NULL;
2492 AjIList iter = NULL;
2493
2494 iter = ajListIterNewread(tags);
2495 hlinemap = ajTablecharNew(16);
2496 ajTableSetDestroyvalue(hlinemap, &ajMemFree);
2497
2498 while(!ajListIterDone(iter))
2499 {
2500 tag = ajListIterGet(iter);
2501
2502 ajTablePut(hlinemap, ajCharNewResLenC(tag->key,3,2),
2503 ajCharNewC(tag->value));
2504 }
2505
2506 ajListIterDel(&iter);
2507
2508 return hlinemap;
2509 }
2510
2511
2512
2513
2514 /* @funcstatic bamHeaderLineParse *********************************************
2515 **
2516 ** Parse a header line and creates a header line object
2517 **
2518 ** @param [r] headerLine [const char*] Header line
2519 ** @return [BamPHeaderLine] Header line object
2520 **
2521 **
2522 ** @release 6.3.0
2523 ******************************************************************************/
2524
bamHeaderLineParse(const char * headerLine)2525 static BamPHeaderLine bamHeaderLineParse(const char *headerLine)
2526 {
2527 BamPHeaderLine hline;
2528 BamPHeaderTag tag;
2529 const char *from, *to;
2530 int itype;
2531
2532 from = headerLine;
2533
2534 if(*from != '@')
2535 {
2536 ajWarn("[bamHeaderLineParse] expected '@', got [%s]\n", headerLine);
2537 return 0;
2538 }
2539
2540 to = ++from;
2541
2542 while(*to && *to!='\t')
2543 to++;
2544
2545 if(to-from != 2)
2546 {
2547 ajWarn("[bamHeaderLineParse] expected '@XY', got [%s]\n", headerLine);
2548 return 0;
2549 }
2550
2551 AJNEW(hline);
2552 hline->type[0] = from[0];
2553 hline->type[1] = from[1];
2554 hline->tags = NULL;
2555
2556 itype = bamTagExists(hline->type, types);
2557
2558 from = to;
2559
2560 while(*to && *to=='\t')
2561 to++;
2562
2563 if(to-from != 1)
2564 {
2565 ajWarn("[bamHeaderLineParse] multiple tabs on line [%s] (%d)\n",
2566 headerLine,(int)(to-from));
2567 return 0;
2568 }
2569
2570 from = to;
2571
2572 while(*from)
2573 {
2574 while(*to && *to!='\t')
2575 to++;
2576
2577 if(!required_tags[itype] && !optional_tags[itype])
2578 tag = bamTagNew(" ",from,to-1);
2579 else
2580 tag = bamTagNew(from,from+3,to-1);
2581
2582 if(bamHeaderLineHasTag(hline,tag->key))
2583 ajWarn("The tag '%c%c' is present (at least) twice on line [%s]\n",
2584 tag->key[0],tag->key[1], headerLine);
2585
2586 if(!hline->tags)
2587 hline->tags = ajListNew();
2588
2589 ajListPushAppend(hline->tags, tag);
2590
2591 from = to;
2592
2593 while(*to && *to=='\t')
2594 to++;
2595
2596 if(*to && to-from != 1)
2597 {
2598 ajWarn("[bamHeaderLineParse] multiple tabs on line [%s] (%d)\n",
2599 headerLine,(int)(to-from));
2600 return 0;
2601 }
2602
2603 from = to;
2604 }
2605
2606 return hline;
2607 }
2608
2609
2610
2611
2612 /* @funcstatic bamHeaderLineValidate ******************************************
2613 **
2614 ** Validate a BAM header record
2615 **
2616 ** Must be of an existing type, all tags must be recognised and all
2617 ** required tags must be present
2618 **
2619 ** @param [u] hline [BamPHeaderLine] Header line object
2620 ** @return [int] 1 on success, 0 and a message on failure
2621 **
2622 **
2623 ** @release 6.3.0
2624 ******************************************************************************/
2625
bamHeaderLineValidate(BamPHeaderLine hline)2626 static int bamHeaderLineValidate(BamPHeaderLine hline)
2627 {
2628 AjPList tags = NULL;
2629 BamPHeaderTag tag = NULL;
2630 int itype, itag;
2631 AjIList iter = NULL;
2632
2633 /* Is the type correct? */
2634 itype = bamTagExists(hline->type, types);
2635
2636 if(itype==-1)
2637 {
2638 ajWarn("The type [%c%c] not recognised.\n",
2639 hline->type[0],hline->type[1]);
2640 return 0;
2641 }
2642
2643 /* Has all required tags? */
2644 itag = 0;
2645
2646 while(required_tags[itype] && required_tags[itype][itag])
2647 {
2648 if(!bamHeaderLineHasTag(hline,required_tags[itype][itag]))
2649 {
2650 ajWarn("The tag [%c%c] required for [%c%c] not present.\n",
2651 required_tags[itype][itag][0],
2652 required_tags[itype][itag][1],
2653 hline->type[0],hline->type[1]);
2654
2655 return 0;
2656 }
2657
2658 itag++;
2659 }
2660
2661 /* Are all tags recognised? */
2662 tags = hline->tags;
2663 iter = ajListIterNewread(tags);
2664
2665 while(!ajListIterDone(iter))
2666 {
2667 tag = ajListIterGet(iter);
2668
2669 if (!bamTagExists(tag->key,required_tags[itype]) &&
2670 !bamTagExists(tag->key,optional_tags[itype]))
2671 {
2672 ajWarn("Unknown tag [%c%c] for [%c%c].\n",
2673 tag->key[0],tag->key[1], hline->type[0],hline->type[1]);
2674
2675 ajListIterDel(&iter);
2676
2677 return 0;
2678 }
2679 }
2680
2681 ajListIterDel(&iter);
2682
2683 return 1;
2684 }
2685
2686
2687
2688
2689 /* @funcstatic bamHeaderLineHasTag ********************************************
2690 **
2691 ** Looks for a tag key and returns the tag and value as a
2692 ** BamPheaderTag object if found
2693 **
2694 ** @param [r] hline [const BamPHeaderLine] Header line
2695 ** @param [r] key [const char*] Key to look for
2696 ** @return [BamPHeaderTag] Header tag if found, NULL if not
2697 **
2698 **
2699 ** @release 6.3.0
2700 ******************************************************************************/
2701
bamHeaderLineHasTag(const BamPHeaderLine hline,const char * key)2702 static BamPHeaderTag bamHeaderLineHasTag(const BamPHeaderLine hline,
2703 const char *key)
2704 {
2705 AjPList tags = hline->tags;
2706 BamPHeaderTag tag = NULL;
2707 BamPHeaderTag ret = NULL;
2708 AjIList iter = NULL;
2709
2710 iter = ajListIterNewread(tags);
2711
2712 while(!ajListIterDone(iter))
2713 {
2714 tag = ajListIterGet(iter);
2715
2716 if(tag->key[0]==key[0] && tag->key[1]==key[1])
2717 {
2718 ret = tag;
2719 break;
2720 }
2721 }
2722
2723 ajListIterDel(&iter);
2724
2725 return ret;
2726 }
2727
2728
2729
2730
2731 /* @funcstatic bamTagExists ***************************************************
2732 **
2733 ** Look for a tag "XY" in a predefined const char *[] array.
2734 **
2735 ** @param [r] tag [const char*] query tag name
2736 ** @param [r] tags [const char**] array of tage names
2737 ** @return [int] tag number on success, -1 on failure
2738 **
2739 **
2740 ** @release 6.3.0
2741 ******************************************************************************/
2742
bamTagExists(const char * tag,const char ** tags)2743 static int bamTagExists(const char *tag, const char **tags)
2744 {
2745 int itag=0;
2746
2747 if(!tags)
2748 return -1;
2749
2750 while(tags[itag])
2751 {
2752 if(tags[itag][0]==tag[0] && tags[itag][1]==tag[1])
2753 return itag;
2754
2755 itag++;
2756 }
2757
2758 return -1;
2759 }
2760
2761
2762
2763
2764 /* @funcstatic bamTagNew ******************************************************
2765 **
2766 ** name points to "XY", value_from points to the first character of
2767 ** the value string and value_to points to the last character of the
2768 ** value string.
2769 **
2770 ** @param [r] name [const char*] tag name
2771 ** @param [r] value_from [const char*] first character of value
2772 ** @param [r] value_to [const char*] last character of value
2773 ** @return [BamPHeaderTag] Header tag object
2774 **
2775 **
2776 ** @release 6.3.0
2777 ******************************************************************************/
2778
bamTagNew(const char * name,const char * value_from,const char * value_to)2779 static BamPHeaderTag bamTagNew(const char *name, const char *value_from,
2780 const char *value_to)
2781 {
2782 BamPHeaderTag tag = NULL;
2783 int len;
2784
2785 AJNEW0(tag);
2786 len = value_to-value_from+1;
2787
2788 tag->key[0] = name[0];
2789 tag->key[1] = name[1];
2790 tag->value = malloc(len+1);
2791 memcpy(tag->value,value_from,len+1);
2792 tag->value[len] = 0;
2793
2794 return tag;
2795 }
2796
2797
2798
2799
2800 #define MBAMSKIPTAG(s) do { \
2801 int type = toupper(*(s)); \
2802 ++(s); \
2803 if (type == 'C' || type == 'A') \
2804 ++(s); \
2805 else if (type == 'S') \
2806 (s) += 2; \
2807 else if (type == 'I' || type == 'F') \
2808 (s) += 4; \
2809 else if (type == 'D') \
2810 (s) += 8; \
2811 else if (type == 'Z' || type == 'H') \
2812 { \
2813 while(*(s)) \
2814 ++(s); \
2815 ++(s); \
2816 } \
2817 } while(0)
2818
2819
2820
2821
2822
2823 /* @func ajSeqBamAuxAppend ****************************************************
2824 **
2825 ** Append auxiliary data tag to a BAM record
2826 **
2827 ** @param [u] b [AjPSeqBam] BAM record
2828 ** @param [r] tag [const char[2]] Tag name
2829 ** @param [r] type [char] Tag type
2830 ** @param [r] len [int] data length
2831 ** @param [r] data [const unsigned char*] Tag type
2832 ** @return [void]
2833 **
2834 **
2835 ** @release 6.5.0
2836 ******************************************************************************/
2837
ajSeqBamAuxAppend(AjPSeqBam b,const char tag[2],char type,int len,const unsigned char * data)2838 void ajSeqBamAuxAppend(AjPSeqBam b, const char tag[2],
2839 char type, int len, const unsigned char* data)
2840 {
2841 int ori_len = b->data_len;
2842
2843 b->data_len += 3 + len;
2844 b->l_aux += 3 + len;
2845
2846 if (b->m_data < b->data_len) {
2847 b->m_data = b->data_len;
2848 kroundup32(b->m_data);
2849 b->data = (unsigned char*)realloc(b->data, b->m_data);
2850 }
2851 b->data[ori_len] = tag[0]; b->data[ori_len + 1] = tag[1];
2852 b->data[ori_len + 2] = type;
2853 memcpy(b->data + ori_len + 3, data, len);
2854 }
2855
2856
2857
2858
2859 /* @funcstatic bamGetAux ******************************************************
2860 **
2861 ** Get auxiliary data tag from a BAM record
2862 **
2863 ** @param [r] b [const AjPSeqBam] BAM record
2864 ** @param [r] tag [const char[2]] Tag name
2865 ** @return [unsigned char*] Tag value
2866 **
2867 **
2868 ** @release 6.3.0
2869 ******************************************************************************/
2870
bamGetAux(const AjPSeqBam b,const char tag[2])2871 static unsigned char* bamGetAux(const AjPSeqBam b, const char tag[2])
2872 {
2873 unsigned char *s;
2874 int y;
2875 int x;
2876
2877 y = tag[0]<<8 | tag[1];
2878
2879 s = MAJSEQBAMAUX(b);
2880
2881 while(s < b->data + b->data_len)
2882 {
2883 x = (int)s[0]<<8 | s[1];
2884 s += 2;
2885
2886 if(x == y)
2887 return s;
2888
2889 MBAMSKIPTAG(s);
2890 }
2891
2892 return 0;
2893 }
2894
2895
2896
2897
2898 /* @func ajSeqBamBgzfSeek *****************************************************
2899 **
2900 ** Seek function for a BAM file
2901 **
2902 ** @param [u] fp [AjPSeqBamBgzf] BGZ file object
2903 ** @param [r] pos [ajlong] Buffer
2904 ** @param [r] where [int] length of buffer
2905 ** @return [ajlong] Number of bytes read,
2906 ** -1 on error
2907 **
2908 ** @release 6.5.0
2909 ******************************************************************************/
2910
ajSeqBamBgzfSeek(AjPSeqBamBgzf fp,ajlong pos,int where)2911 ajlong ajSeqBamBgzfSeek(AjPSeqBamBgzf fp, ajlong pos, int where)
2912 {
2913 int block_offset;
2914 ajlong block_address;
2915
2916 if (fp->open_mode != 'r')
2917 {
2918 ajErr("file not open for read");
2919 return -1;
2920 }
2921
2922 if (where != SEEK_SET)
2923 {
2924 ajErr("unimplemented seek option");
2925 return -1;
2926 }
2927
2928 block_offset = pos & 0xFFFF;
2929 block_address = (pos >> 16) & 0xFFFFFFFFFFFFLL;
2930
2931 if (fseek(fp->file, block_address, SEEK_SET) != 0)
2932 {
2933 ajErr("seek failed");
2934 return -1;
2935 }
2936
2937 fp->block_length = 0; /* indicates current block is not loaded */
2938 fp->block_address = block_address;
2939 fp->block_offset = block_offset;
2940
2941 return 0;
2942 }
2943
2944
2945
2946
2947 /* @func ajSeqBamValidate *****************************************************
2948 **
2949 ** Simple validation of a BAM record
2950 **
2951 ** Other fields could also be checked, especially the auxiliary data
2952 **
2953 ** @param [r] header [const AjPSeqBamHeader] header object
2954 ** @param [r] b [const AjPSeqBam] BAM alignment record
2955 ** @return [int] 1 on success, 0 otherwise
2956 **
2957 ** @release 6.5.0
2958 ******************************************************************************/
2959
ajSeqBamValidate(const AjPSeqBamHeader header,const AjPSeqBam b)2960 int ajSeqBamValidate(const AjPSeqBamHeader header, const AjPSeqBam b)
2961 {
2962 char *s;
2963
2964 if (b->core.tid < -1 || b->core.mtid < -1)
2965 return 0;
2966
2967 if (header && (b->core.tid >= header->n_targets
2968 || b->core.mtid >= header->n_targets))
2969 return 0;
2970
2971 if (b->data_len < b->core.l_qname)
2972 return 0;
2973
2974 s = memchr(MAJSEQBAMQNAME(b), '\0', b->core.l_qname);
2975
2976 if (s != &MAJSEQBAMQNAME(b)[b->core.l_qname-1])
2977 return 0;
2978
2979 return 1;
2980 }
2981
2982
2983
2984
2985 /* @func ajSeqBamHeaderGetRefseqTags ******************************************
2986 **
2987 ** Return reference sequence tags, in a table which includes sub-tables for
2988 ** each individual reference sequence.
2989 ** Each sub-table including tag value pairs for a reference sequence.
2990 **
2991 ** @param [r] header [const AjPSeqBamHeader] BAM header
2992 ** @return [AjPTable] Table of tables for reference sequence tags
2993 **
2994 **
2995 ** @release 6.5.0
2996 ******************************************************************************/
2997
ajSeqBamHeaderGetRefseqTags(const AjPSeqBamHeader header)2998 AjPTable ajSeqBamHeaderGetRefseqTags(const AjPSeqBamHeader header)
2999 {
3000 AjPTable ret = NULL;
3001 BamPHeaderLine hline = NULL;
3002 AjIList k = NULL;
3003 BamPHeaderTag uri = NULL;
3004 BamPHeaderTag md5 = NULL;
3005 BamPHeaderTag species = NULL;
3006 BamPHeaderTag seqname = NULL;
3007 BamPHeaderTag assemid = NULL;
3008 AjPTable tags = NULL;
3009 const AjPList dict = NULL;
3010
3011 dict = header->dict;
3012
3013 if(dict == NULL)
3014 return ret;
3015
3016 ret = ajTablecharNewConst(16);
3017
3018 k = ajListIterNewread(dict);
3019
3020 while(!ajListIterDone(k))
3021 {
3022 hline = ajListIterGet(k);
3023
3024 if(hline->type[0] != 'S' || hline->type[1] != 'Q')
3025 continue;
3026
3027 seqname = bamHeaderLineHasTag(hline,"SN");
3028
3029 if(!seqname)
3030 {
3031 ajErr("ajSeqBamHeaderGetRefseqTags: missing SN tag"
3032 " in SQ header line");
3033 return NULL;
3034 }
3035
3036 tags = ajTablecharNew(8);
3037
3038 assemid = bamHeaderLineHasTag(hline,"AS");
3039 if(assemid)
3040 ajTablePut(tags, ajCharNewC(assemid->key), assemid->value);
3041
3042 uri = bamHeaderLineHasTag(hline,"UR");
3043 if(uri)
3044 ajTablePut(tags, ajCharNewC(uri->key), uri->value);
3045
3046 md5 = bamHeaderLineHasTag(hline,"M5");
3047 if(md5)
3048 ajTablePut(tags, ajCharNewC(md5->key), md5->value);
3049
3050 species = bamHeaderLineHasTag(hline,"SP");
3051 if(species)
3052 ajTablePut(tags, ajCharNewC(species->key), species->value);
3053
3054 ajTablePut(ret, seqname->value, tags);
3055
3056 }
3057
3058 ajListIterDel(&k);
3059
3060 return ret;
3061 }
3062
3063
3064
3065
3066 /* @func ajSeqBamHeaderGetReadgroupTags****************************************
3067 **
3068 ** Return read group tags, in a table which includes sub-tables for
3069 ** each individual read group.
3070 ** Each sub-table including tag value pairs for a read group.
3071 **
3072 ** @param [r] header [const AjPSeqBamHeader] BAM header
3073 ** @return [AjPTable] Table of tables for read-group tags
3074 **
3075 **
3076 ** @release 6.5.0
3077 ******************************************************************************/
3078
ajSeqBamHeaderGetReadgroupTags(const AjPSeqBamHeader header)3079 AjPTable ajSeqBamHeaderGetReadgroupTags(const AjPSeqBamHeader header)
3080 {
3081 AjPTable ret = NULL;
3082 BamPHeaderLine hline = NULL;
3083 AjIList k = NULL;
3084 BamPHeaderTag tag = NULL;
3085 AjPTable tags = NULL;
3086 AjIList iter = NULL;
3087 const AjPList dict = NULL;
3088 char* id = NULL;
3089
3090 dict = header->dict;
3091
3092 if(dict == NULL)
3093 return ret;
3094
3095 ret = ajTablecharNewConst(16);
3096
3097 k = ajListIterNewread(dict);
3098
3099 while(!ajListIterDone(k))
3100 {
3101 hline = ajListIterGet(k);
3102
3103 if(hline->type[0] != 'R' || hline->type[1] != 'G') /* RG header line */
3104 continue;
3105
3106 tags = ajTablecharNew(8);
3107
3108 iter = ajListIterNewread(hline->tags);
3109
3110 while(!ajListIterDone(iter))
3111 {
3112 tag = ajListIterGet(iter);
3113
3114 if(tag->key[0]=='I' && tag->key[1]=='D') /* ID tag for RG line */
3115 {
3116 id = tag->value;
3117 continue;
3118 }
3119 ajTablePut(tags, ajCharNewResLenC(tag->key, 3, 2), tag->value);
3120 }
3121
3122 ajListIterDel(&iter);
3123
3124 if(!id)
3125 {
3126 ajErr("ajSeqBamHeaderGetReadgroupTags: missing ID tag"
3127 " in RG header line");
3128 return NULL;
3129 }
3130
3131 ajTablePut(ret, id, tags);
3132
3133 }
3134
3135 ajListIterDel(&k);
3136
3137 return ret;
3138 }
3139
3140
3141
3142
3143 /* @func ajSeqBamHeaderGetSortorder *******************************************
3144 **
3145 ** Return sort-order defined in header
3146 **
3147 ** @param [r] header [const AjPSeqBamHeader] BAM header
3148 ** @return [const char*] sort order for the BAM file
3149 **
3150 **
3151 ** @release 6.5.0
3152 ******************************************************************************/
3153
ajSeqBamHeaderGetSortorder(const AjPSeqBamHeader header)3154 const char* ajSeqBamHeaderGetSortorder(const AjPSeqBamHeader header)
3155 {
3156 const char* ret = NULL;
3157 BamPHeaderLine hline = NULL;
3158 AjIList k = NULL;
3159 BamPHeaderTag so = NULL;
3160
3161 if(!header || !header->dict)
3162 return NULL;
3163
3164 k = ajListIterNewread(header->dict);
3165
3166 while(!ajListIterDone(k))
3167 {
3168 hline = ajListIterGet(k);
3169
3170 if(hline->type[0] != 'H' || hline->type[1] != 'D')
3171 continue;
3172
3173 so = bamHeaderLineHasTag(hline,"SO");
3174
3175 if(so)
3176 ret = so->value;
3177
3178 break;
3179
3180 }
3181
3182 ajListIterDel(&k);
3183
3184 return ret;
3185 }
3186