1 /* @source ajseqbam ***********************************************************
2 **
3 ** AJAX BAM format sequence processing functions
4 **
5 ** These functions control all aspects of AJAX BAM file processing
6 **
7 ** @author Copyright (C) 2010 Peter Rice ported from samtools
8 ** @version $Revision: 1.36 $
9 ** @modified 2010-2011 Peter Rice
10 ** @modified $Date: 2012/06/27 08:07:40 $ by $Author: rice $
11 ** @@
12 **
13 ** This library is free software; you can redistribute it and/or
14 ** modify it under the terms of the GNU Lesser General Public
15 ** License as published by the Free Software Foundation; either
16 ** version 2.1 of the License, or (at your option) any later version.
17 **
18 ** This library is distributed in the hope that it will be useful,
19 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
20 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
21 ** Lesser General Public License for more details.
22 **
23 ** You should have received a copy of the GNU Lesser General Public
24 ** License along with this library; if not, write to the Free Software
25 ** Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
26 ** MA  02110-1301,  USA.
27 **
28 ******************************************************************************/
29 
30 
31 #include "ajlib.h"
32 
33 #include "ajseqbam.h"
34 #include "ajassert.h"
35 #include "ajlist.h"
36 #include "ajtable.h"
37 #include "ajutil.h"
38 #include "ajsys.h"
39 
40 
41 #include <stdio.h>
42 #include <ctype.h>
43 #include <errno.h>
44 #include <fcntl.h>
45 
46 
47 #ifndef WIN32
48 #include <sys/file.h>
49 #else /* WIN32 */
50 #include <io.h>
51 #define open _open
52 #define fileno _fileno
53 #endif /* !WIN32 */
54 
55 
56 
57 /* Don't define this as a const static int if used in array def */
58 #define AJSEQBAM_BLOCK_HEADER_LENGTH 18
59 
60 
61 const char *cigarcode = "MIDNSHP";
62 const char *bam_nt16_rev_table = "=ACMGRSVTWYHKDBN";
63 
64 static AjBool bamBigendian   = ajFalse;
65 static AjBool bamInitialized = ajFalse;
66 
67 const char *o_hd_tags[] = {"SO","GO",NULL};
68 const char *r_hd_tags[] = {"VN",NULL};
69 
70 const char *o_sq_tags[] = {"AS","M5","UR","SP",NULL};
71 const char *r_sq_tags[] = {"SN","LN",NULL};
72 const char *u_sq_tags[] = {"SN",NULL};
73 
74 const char *o_rg_tags[] = {"LB","DS","PU","PI","CN","DT","PL",NULL};
75 const char *r_rg_tags[] = {"ID",NULL};
76 const char *u_rg_tags[] = {"ID",NULL};
77 
78 const char *o_pg_tags[] = {"VN","CL",NULL};
79 const char *r_pg_tags[] = {"ID",NULL};
80 
81 const char *types[]          = {"HD","SQ","RG","PG","CO",NULL};
82 const char **optional_tags[] = {o_hd_tags,o_sq_tags,o_rg_tags,o_pg_tags,NULL,
83                                 NULL};
84 const char **required_tags[] = {r_hd_tags,r_sq_tags,r_rg_tags,r_pg_tags,NULL,
85                                 NULL};
86 const char **unique_tags[]   = {NULL,     u_sq_tags,u_rg_tags,NULL,NULL,NULL};
87 
88 
89 unsigned char bam_nt16_table[256] =
90 {
91 	15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
92 	15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
93 	15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
94 	 1, 2, 4, 8, 15,15,15,15, 15,15,15,15, 15, 0 /*=*/,15,15,
95 	15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
96 	15,15, 5, 6,  8,15, 7, 9, 15,10,15,15, 15,15,15,15,
97 	15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
98 	15,15, 5, 6,  8,15, 7, 9, 15,10,15,15, 15,15,15,15,
99 	15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
100 	15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
101 	15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
102 	15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
103 	15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
104 	15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
105 	15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
106 	15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15
107 };
108 
109 
110 
111 
112 /* @datastatic BamPHeaderTag **************************************************
113 **
114 ** BAM header tag
115 **
116 ** @attr value [char*] Value
117 ** @attr key [char[2]] Key
118 ** @attr Padding [char[6]] Padding for alignment
119 ******************************************************************************/
120 
121 typedef struct BamSHeaderTag
122 {
123     char *value;
124     char key[2];
125     char Padding[6];
126 } BamOHeaderTag;
127 
128 #define BamPHeaderTag BamOHeaderTag*
129 
130 
131 
132 
133 /* @datastatic BamPHeaderLine *************************************************
134 **
135 ** BAM header line
136 **
137 ** @attr tags [AjPList] List of tags
138 ** @attr type [char[2]] Type
139 ** @attr Padding [char[6]] Padding for alignment
140 ******************************************************************************/
141 
142 typedef struct BamSHeaderLine
143 {
144     AjPList tags;
145     char type[2];
146     char Padding[6];
147 } BamOHeaderLine;
148 
149 #define BamPHeaderLine BamOHeaderLine*
150 
151 const char *bam_flag2char_table = "pPuUrR12sfd\0\0\0\0\0";
152 
153 static AjPSeqBamBgzf  bamBgzfNew(void);
154 static void           bamCacheFree(AjPSeqBamBgzf fp);
155 static int            bamCacheLoadBlock(AjPSeqBamBgzf fp, ajlong block_address);
156 static int            bamDeflateBlock(AjPSeqBamBgzf fp, int block_length);
157 static void           bamDestroyHeaderHash(AjPSeqBamHeader header);
158 static unsigned char *bamGetAux(const AjPSeqBam b, const char tag[2]);
159 static int            bamHeaderCheck(const char *header);
160 static void           bamHeaderFree(AjPList *header);
161 static BamPHeaderTag  bamHeaderLineHasTag(const BamPHeaderLine hline,
162                                          const char *key);
163 static BamPHeaderLine bamHeaderLineParse(const char *headerLine);
164 static AjPTable	      bamHeaderLineToTablemap(const BamPHeaderLine hline);
165 static void           bamHeaderLineFree(BamPHeaderLine *hline);
166 static int            bamHeaderLineValidate(BamPHeaderLine hline);
167 static AjPList        bamHeaderParse2(const char *headerText);
168 static AjPTable       bamHeaderTotable(const AjPList _dict, const char type[2],
169                                       const char key_tag[2],
170                                       const char value_tag[2]);
171 static int            bamInflateBlock(AjPSeqBamBgzf fp, int block_length);
172 static int            bamReadBlock(AjPSeqBamBgzf fp);
173 static void           bamSwapEndianData(const AjPSeqBamCore c, int data_len,
174                                         unsigned char *data);
175 static BamPHeaderTag  bamTagNew(const char *name, const char *value_from,
176                                 const char *value_to);
177 
178 static const char*    bamNextline(char **lineptr, size_t *n, const char *text);
179 static int            bamTagExists(const char *tag, const char **tags);
180 
181 
182 
183 
184 /* @func ajSeqBamCalend *****************************************************
185 **
186 ** Calculate the rightmost coordinate of an alignment on the reference genome
187 **
188 ** @param [r] c [const AjPSeqBamCore] pointer to the AjPSeqBamCore structure
189 ** @param [r] cigar [const ajuint*] the corresponding CIGAR array
190 ** @return [ajuint] the rightmost coordinate, 0-based
191 **
192 ** @release 6.5.0
193 ******************************************************************************/
194 
ajSeqBamCalend(const AjPSeqBamCore c,const ajuint * cigar)195 ajuint ajSeqBamCalend(const AjPSeqBamCore c, const ajuint *cigar)
196 {
197 	ajuint k;
198 	ajuint end;
199 	ajint op;
200 
201 	end = c->pos;
202 
203 	for (k = 0; k < c->n_cigar; ++k)
204 	{
205 		op = cigar[k] & BAM_CIGAR_MASK;
206 		if (op == BAM_CMATCH || op == BAM_CDIFF || op == BAM_CEQUAL
207 			|| op == BAM_CDEL || op == BAM_CREF_SKIP)
208 			end += cigar[k] >> BAM_CIGAR_SHIFT;
209 	}
210 
211 	return end;
212 }
213 
214 
215 
216 
217 /* @func ajSeqBamReg2bin ******************************************************
218 **
219 ** Calculate the minimum bin that contains a region [beg,end).
220 **
221 ** @param [r] beg [ajuint] start of the region, 0-based
222 ** @param [r] end [ajuint] end of the region, 0-based
223 ** @return [int] bin
224 **
225 ******************************************************************************/
226 
ajSeqBamReg2bin(ajuint beg,ajuint end)227 int ajSeqBamReg2bin(ajuint beg, ajuint end)
228 {
229     --end;
230 
231     if(beg>>14 == end>>14)
232         return 4681 + (beg>>14);
233 
234     if(beg>>17 == end>>17)
235         return 585 + (beg>>17);
236 
237     if(beg>>20 == end>>20)
238         return 73 + (beg>>20);
239 
240     if(beg>>23 == end>>23)
241         return 9 + (beg>>23);
242 
243     if(beg>>26 == end>>26)
244         return 1 + (beg>>26);
245 
246     return 0;
247 }
248 
249 
250 
251 
252 /* #funcstatic bam_copy1 *****************************************************
253 **
254 ** Copy an alignment
255 **
256 ** #param [u] bdst [AjPSeqBam] destination alignment struct
257 ** #param [r] bsrc [const AjPSeqBam] source alignment struct
258 ** #return [AjPSeqBam] pointer to the destination alignment struct
259 **
260 ******************************************************************************/
261 /*
262 //static inline AjPSeqBam bam_copy1(AjPSeqBam bdst, const AjPSeqBam bsrc)
263 //{
264 //    unsigned char *data = NULL;
265 //    int m_data;
266 //
267 //    /# backup data and m_data #/
268 //    data = bdst->data;
269 //    m_data = bdst->m_data;
270 //
271 //
272 //    /# double the capacity #/
273 //    if (m_data < bsrc->m_data)
274 //    {
275 //        m_data = bsrc->m_data; kroundup32(m_data);
276 //        data = (unsigned char*)realloc(data, m_data);
277 //    }
278 //
279 //    /# copy var-len data #/
280 //    memcpy(data, bsrc->data, bsrc->data_len);
281 //
282 //    /# copy the rest #/
283 //    *bdst = *bsrc;
284 //
285 //    /# restore the backup #/
286 //    bdst->m_data = m_data;
287 //    bdst->data = data;
288 //
289 //    return bdst;
290 //}
291 */
292 
293 
294 
295 
296 /* @funcstatic bam_aux_type2size **********************************************
297 **
298 ** Returns size of the array elements in optional fields of alignment records
299 **
300 ** @param [r] x [int] type of the array elements
301 ** @return [int] size of the array elements
302 **
303 ******************************************************************************/
304 
bam_aux_type2size(int x)305 static inline int bam_aux_type2size(int x)
306 {
307 	if (x == 'C' || x == 'c' || x == 'A')
308 	    return 1;
309 	else if (x == 'S' || x == 's')
310 	    return 2;
311 	else if (x == 'I' || x == 'i' || x == 'f')
312 	    return 4;
313 	else
314 	    return 0;
315 }
316 
317 
318 
319 
320 /* @func ajSeqBamHeaderNew ****************************************************
321 **
322 ** Create an empty BAM header object
323 **
324 ** @return [AjPSeqBamHeader] BAM header object
325 **
326 **
327 ** @release 6.3.0
328 ******************************************************************************/
329 
ajSeqBamHeaderNew(void)330 AjPSeqBamHeader ajSeqBamHeaderNew(void)
331 {
332     AjPSeqBamHeader header=NULL;
333 
334     AJNEW0(header);
335 
336     return header;
337 }
338 
339 
340 
341 
342 /* @func ajSeqBamHeaderNewTextC ***********************************************
343 **
344 ** Create an empty BAM header object with given header text
345 **
346 ** @param [r]  txt [const char*] BAM/SAM header text string
347 ** @return [AjPSeqBamHeader] BAM header object
348 **
349 **
350 ** @release 6.3.0
351 ******************************************************************************/
352 
ajSeqBamHeaderNewTextC(const char * txt)353 AjPSeqBamHeader ajSeqBamHeaderNewTextC(const char* txt)
354 {
355     AjPSeqBamHeader ret;
356 
357     AJNEW0(ret);
358 
359     ret->l_text = strlen(txt);
360     ret->text = ajCharNewResC(txt, ret->l_text + 1);
361 
362     return ret;
363 }
364 
365 
366 
367 
368 /* @func ajSeqBamHeaderSetTextC ***********************************************
369 **
370 ** Set BAM header text
371 **
372 ** @param [u]  header [AjPSeqBamHeader] BAM header
373 ** @param [r]  txt [const char*] header text
374 ** @return [void]
375 **
376 **
377 ** @release 6.5.0
378 ******************************************************************************/
379 
ajSeqBamHeaderSetTextC(AjPSeqBamHeader header,const char * txt)380 void ajSeqBamHeaderSetTextC(AjPSeqBamHeader header, const char* txt)
381 {
382 
383     if(header->text)
384 	AJFREE(header->text);
385 
386     header->l_text = strlen(txt);
387     header->text = ajCharNewResC(txt, header->l_text + 1);
388 
389     return;
390 }
391 
392 
393 
394 
395 /* @func ajSeqBamHeaderNewN ***************************************************
396 **
397 ** Create an empty BAM header object with n reference sequences
398 **
399 ** @param [r] n [ajint] Number of target/reference sequences
400 ** @return [AjPSeqBamHeader] BAM header object
401 **
402 **
403 ** @release 6.5.0
404 ******************************************************************************/
405 
ajSeqBamHeaderNewN(ajint n)406 AjPSeqBamHeader ajSeqBamHeaderNewN(ajint n)
407 {
408     AjPSeqBamHeader ret;
409 
410     AJNEW0(ret);
411 
412     ret->n_targets = n;
413     AJCNEW0(ret->target_name, n);
414     AJCNEW0(ret->target_len, n);
415 
416     return ret;
417 }
418 
419 
420 
421 
422 /* @func ajSeqBamDel **********************************************************
423 **
424 ** Deletes a BAM alignment object
425 **
426 ** @param [d] Pbam [AjPSeqBam*] BAM alignment object
427 ** @return [void]
428 **
429 **
430 ** @release 6.5.0
431 ******************************************************************************/
432 
ajSeqBamDel(AjPSeqBam * Pbam)433 void ajSeqBamDel(AjPSeqBam *Pbam)
434 {
435     AjPSeqBam bam = NULL;
436 
437     bam = *Pbam;
438 
439     if(bam == 0)
440         return;
441 
442     AJFREE(bam->data);
443     AJFREE(*Pbam);
444 
445     return;
446 }
447 
448 
449 
450 
451 /* @func ajSeqBamHeaderDel ****************************************************
452 **
453 ** Destructor for a BAM header object
454 **
455 ** @param [d] Pheader [AjPSeqBamHeader*] BAM header object
456 ** @return [void]
457 **
458 **
459 ** @release 6.3.0
460 ******************************************************************************/
461 
ajSeqBamHeaderDel(AjPSeqBamHeader * Pheader)462 void ajSeqBamHeaderDel(AjPSeqBamHeader *Pheader)
463 {
464     ajint i;
465     AjPSeqBamHeader header = NULL;
466 
467     header = *Pheader;
468 
469     if(header == 0)
470         return;
471 
472     if(header->target_name)
473     {
474         for(i = 0; i < header->n_targets; ++i)
475             AJFREE(header->target_name[i]);
476 
477         AJFREE(header->target_name);
478         AJFREE(header->target_len);
479     }
480 
481     AJFREE(header->text);
482 
483     if(header->dict)
484         bamHeaderFree(&header->dict);
485 
486     if(header->rg2lib)
487         ajTableFree(&header->rg2lib);
488 
489     bamDestroyHeaderHash(header);
490 
491     AJFREE(*Pheader);
492 
493     return;
494 }
495 
496 
497 
498 
499 /* @func ajSeqBamHeaderWrite **************************************************
500 **
501 ** Write a header structure to BAM
502 **
503 ** @param [u] fp [AjPSeqBamBgzf] BAM file handler
504 ** @param [r] header [const AjPSeqBamHeader] Header structure
505 ** @return [int] Always 0 currently
506 **
507 **
508 ** @release 6.3.0
509 ******************************************************************************/
510 
ajSeqBamHeaderWrite(AjPSeqBamBgzf fp,const AjPSeqBamHeader header)511 int ajSeqBamHeaderWrite(AjPSeqBamBgzf fp, const AjPSeqBamHeader header)
512 {
513     char buf[4];
514     ajint i;
515     ajint name_len;
516     ajint x;
517     char *p = NULL;
518 
519     if(!bamInitialized)
520     {
521         bamInitialized = ajTrue;
522 	bamBigendian = ajUtilGetBigendian();
523     }
524 
525     /* write "BAM1" */
526     strncpy(buf, "BAM\001", 4);
527     ajSeqBamBgzfWrite(fp, buf, 4);
528 
529     /* write plain text and the number of reference sequences */
530 
531     if(bamBigendian)
532     {
533         x = header->l_text;
534         ajByteRevInt(&x);
535         ajSeqBamBgzfWrite(fp, &x, 4);
536 
537         if (header->l_text)
538             ajSeqBamBgzfWrite(fp, header->text, header->l_text);
539 
540         x = header->n_targets;
541         ajByteRevInt(&x);
542         ajSeqBamBgzfWrite(fp, &x, 4);
543     }
544     else
545     {
546         ajSeqBamBgzfWrite(fp, &header->l_text, 4);
547 
548         if(header->l_text)
549             ajSeqBamBgzfWrite(fp, header->text, header->l_text);
550 
551         ajSeqBamBgzfWrite(fp, &header->n_targets, 4);
552     }
553 
554     /* write sequence names and lengths */
555 
556     for(i = 0; i != header->n_targets; ++i)
557     {
558         p = header->target_name[i];
559         name_len = strlen(p) + 1;
560 
561         if(bamBigendian)
562         {
563             x = name_len;
564             ajByteRevInt(&x);
565             ajSeqBamBgzfWrite(fp, &x, 4);
566         }
567         else
568             ajSeqBamBgzfWrite(fp, &name_len, 4);
569 
570         ajSeqBamBgzfWrite(fp, p, name_len);
571 
572         if(bamBigendian)
573         {
574             x = header->target_len[i];
575             ajByteRevInt(&x);
576             ajSeqBamBgzfWrite(fp, &x, 4);
577         }
578         else
579             ajSeqBamBgzfWrite(fp, &header->target_len[i], 4);
580     }
581 
582     return 0;
583 }
584 
585 
586 
587 
588 /* @funcstatic bamSwapEndianData **********************************************
589 **
590 ** Swap bigendian data in a BAM core data buffer
591 **
592 ** @param [r] c [const AjPSeqBamCore] BAM core data
593 ** @param [r] data_len [int] Data length
594 ** @param [u] data [unsigned char*] Data buffer
595 ** @return [void]
596 **
597 **
598 ** @release 6.3.0
599 ******************************************************************************/
600 
bamSwapEndianData(const AjPSeqBamCore c,int data_len,unsigned char * data)601 static void bamSwapEndianData(const AjPSeqBamCore c, int data_len,
602                               unsigned char *data)
603 {
604     unsigned char *s = NULL;
605     ajuint i;
606     ajuint *cigar = NULL;
607     unsigned char type;
608 
609     cigar = (ajuint*)(data + c->l_qname);
610 
611     s = data + c->n_cigar*4 + c->l_qname + c->l_qseq + (c->l_qseq + 1)/2;
612 
613     for(i = 0; i < c->n_cigar; ++i)
614         ajByteRevUint(&cigar[i]);
615 
616     while(s < data + data_len)
617     {
618         s += 2; /* skip key */
619         type = toupper(*s); ++s; /* skip type */
620 
621         if(type == 'C' || type == 'A')
622             ++s;
623         else if(type == 'S')
624         {
625             ajByteRevShort((ajshort*)s);
626             s += 2;
627         }
628         else if(type == 'I' || type == 'F')
629         {
630             ajByteRevUint((ajuint*)s);
631             s += 4;
632         }
633         else if(type == 'D')
634         {
635             ajByteRevLong((ajlong*)s);
636             s += 8;
637         }
638         else if(type == 'Z' || type == 'H')
639         {
640             while(*s)
641                 ++s;
642 
643             ++s;
644         }
645         else if (type == 'B')
646         {
647             ajuint n;
648             ajint Bsize = bam_aux_type2size(*s);
649             memcpy(&n, s + 1, 4);
650 
651             if (1 == Bsize) {
652             } else if (2 == Bsize) {
653         	for (i = 0; i < n; i += 2)
654         	    ajByteRevShort((ajshort*)s + 5 + i);
655             } else if (4 == Bsize) {
656         	for (i = 0; i < n; i += 4)
657         	    ajByteRevUint((ajuint*)s + 5 + i);
658             }
659             ajByteRevUint((ajuint*)s+1);
660         }
661 
662     }
663 
664     return;
665 }
666 
667 
668 
669 
670 /* @func ajSeqBamRead *********************************************************
671 **
672 ** Read a BAM alignment record of a read(query sequence)
673 **
674 ** The file position indicator must be placed right before an
675 ** alignment. Upon success, this function will set the position
676 ** indicator to the start of the next alignment. This function is not
677 ** affected by the machine endianness.
678 **
679 ** @param [u] fp [AjPSeqBamBgzf] BAM file handler
680 ** @param [u] b [AjPSeqBam] read alignment; all members are updated.
681 ** @return [int] Number of bytes read from the file
682 **
683 ** @release 6.3.0
684 ** @@
685 ******************************************************************************/
686 
ajSeqBamRead(AjPSeqBamBgzf fp,AjPSeqBam b)687 int ajSeqBamRead(AjPSeqBamBgzf fp, AjPSeqBam b)
688 {
689     AjPSeqBamCore c = &b->core;
690     ajint block_len;
691     ajint ret;
692     ajint i;
693     ajuint x[8];
694     ajint oldsize;
695 
696     if(!bamInitialized)
697     {
698         bamInitialized = ajTrue;
699 	bamBigendian = ajUtilGetBigendian();
700     }
701 
702     assert(BAM_CORE_SIZE == 32);
703 
704     if((ret = ajSeqBamBgzfRead(fp, &block_len, 4)) != 4)
705     {
706         if(ret == 0)
707             return -1; /* normal end-of-file */
708 
709         return -2; /* truncated */
710     }
711 
712     if(block_len < (ajint) BAM_CORE_SIZE)
713         ajErr("block_len: %d core_size: %d", block_len, BAM_CORE_SIZE);
714 
715     if(ajSeqBamBgzfRead(fp, x, BAM_CORE_SIZE) != BAM_CORE_SIZE)
716         return -3;
717 
718     if(bamBigendian)
719     {
720         ajByteRevInt(&block_len);
721 
722         for(i = 0; i < 8; ++i)
723             ajByteRevUint(x + i);
724     }
725 
726     c->tid = x[0];
727     c->pos = x[1];
728     c->bin = x[2]>>16;
729     c->qual = x[2]>>8&0xff;
730     c->l_qname = x[2]&0xff;
731     c->flag = x[3]>>16;
732     c->n_cigar = x[3]&0xffff;
733     c->l_qseq = x[4];
734     c->mtid = x[5];
735     c->mpos = x[6];
736     c->isize = x[7];
737     b->data_len = block_len - BAM_CORE_SIZE;
738 
739     if(b->m_data < b->data_len)
740     {
741 	oldsize = b->m_data;
742         b->m_data = b->data_len;
743         kroundup32(b->m_data);
744         AJRESIZE0(b->data, oldsize, b->m_data);
745     }
746 
747     if(ajSeqBamBgzfRead(fp, b->data, b->data_len) != b->data_len)
748     {
749         ajErr("ajSeqBamBgzfRead len: %d failed", b->data_len);
750         return -4;
751     }
752 
753     b->l_aux = b->data_len - c->n_cigar * 4 - c->l_qname -
754         c->l_qseq - (c->l_qseq+1)/2;
755 
756     if(bamBigendian)
757         bamSwapEndianData(c, b->data_len, b->data);
758 
759     return 4 + block_len;
760 }
761 
762 
763 
764 
765 /* @func ajSeqBamWrite ********************************************************
766 **
767 ** Writes BAM alignment data to a BGZ file
768 **
769 ** @param [u] fp [AjPSeqBamBgzf]BAM file handler
770 ** @param [r] b [const AjPSeqBam] alignment to write
771 ** @return [int] number of bytes written to the file
772 **
773 **
774 ** @release 6.3.0
775 ******************************************************************************/
776 
ajSeqBamWrite(AjPSeqBamBgzf fp,const AjPSeqBam b)777 int ajSeqBamWrite(AjPSeqBamBgzf fp, const AjPSeqBam b)
778 {
779     const AjPSeqBamCore c;
780     int data_len;
781     unsigned char *data = NULL;
782 
783     ajuint x[8];
784     ajuint block_len;
785     ajuint y;
786     int i;
787 
788     c         = &b->core;
789     data_len  = b->data_len;
790     data      = b->data;
791     block_len = data_len + BAM_CORE_SIZE;
792 
793     if(!bamInitialized)
794     {
795         bamInitialized = ajTrue;
796 	bamBigendian = ajUtilGetBigendian();
797     }
798 
799     assert(BAM_CORE_SIZE == 32);
800 
801     x[0] = c->tid;
802     x[1] = c->pos;
803     x[2] = (ajuint)c->bin<<16 | c->qual<<8 | c->l_qname;
804     x[3] = (ajuint)c->flag<<16 | c->n_cigar;
805     x[4] = c->l_qseq;
806     x[5] = c->mtid;
807     x[6] = c->mpos;
808     x[7] = c->isize;
809 
810     if(bamBigendian)
811     {
812         for(i = 0; i < 8; ++i)
813             ajByteRevUint(x + i);
814 
815         y = block_len;
816         ajByteRevUint(&y);
817         ajSeqBamBgzfWrite(fp, &y, 4);
818         bamSwapEndianData(c, data_len, data);
819     }
820     else
821         ajSeqBamBgzfWrite(fp, &block_len, 4);
822 
823     ajSeqBamBgzfWrite(fp, x, BAM_CORE_SIZE);
824     ajSeqBamBgzfWrite(fp, data, data_len);
825 
826     if(bamBigendian)
827         bamSwapEndianData(c, data_len, data);
828 
829     return 4 + block_len;
830 }
831 
832 
833 
834 
835 /* @func ajSeqBamGetLibrary ***************************************************
836 **
837 ** Return the name of the library from the BAM file header RG tag
838 **
839 ** @param [u] h [AjPSeqBamHeader] BAM header
840 ** @param [r] b [const AjPSeqBam] BAM record
841 ** @return [const char*] Library name
842 **
843 **
844 ** @release 6.3.0
845 ******************************************************************************/
846 
ajSeqBamGetLibrary(AjPSeqBamHeader h,const AjPSeqBam b)847 const char* ajSeqBamGetLibrary(AjPSeqBamHeader h, const AjPSeqBam b)
848 {
849     const unsigned char *rg;
850 /*
851 ** FIXME: this should also check the LB tag associated with each alignment
852 */
853     if(h->dict == 0)
854         h->dict = bamHeaderParse2(h->text);
855 
856     if(h->rg2lib == 0)
857         h->rg2lib = bamHeaderTotable(h->dict, "RG", "ID", "LB");
858 
859     rg = bamGetAux(b, "RG");
860 
861     return (rg == 0) ? 0 : ajTableFetchC(h->rg2lib, (const char*)(rg + 1));
862 }
863 
864 
865 
866 
867 /* @datastatic BamPCache ******************************************************
868 **
869 ** BAM block cache
870 **
871 ** @alias BamSCache
872 ** @alias BamOCache
873 **
874 ** @attr block [unsigned char*] Block data
875 ** @attr end_offset [ajlong] End offset
876 ** @attr size [int]Block size
877 ** @attr padding [int] To alignment boundary
878 **
879 ******************************************************************************/
880 
881 typedef struct BamSCache
882 {
883     unsigned char *block;
884     ajlong end_offset;
885     int size;
886     int padding;
887 } BamOCache;
888 
889 #define BamPCache BamOCache*
890 
891 
892 /* @conststatic BAM variables *************************************************
893 **
894 ** Variables defined for BAM processing
895 **
896 ** @value AJSEQBAM_DEFAULT_BLOCK_SIZE [int] Default block size of 64k
897 ** @value AJSEQBAM_MAX_BLOCK_SIZE [int] Maximum block size also 64k
898 ** @value AJSEQBAM_BLOCK_FOOTER_LENGTH [int] Block footer length
899 ** @value AJSEQBAM_GZIP_ID1 [int] Gzip magic id 1
900 ** @value AJSEQBAM_GZIP_ID2 [int] Gzip magic id 2
901 ** @value AJSEQBAM_CM_DEFLATE [int] CM deflate
902 ** @value AJSEQBAM_FLG_FEXTRA [int] Fextra flag
903 ** @value AJSEQBAM_OS_UNKNOWN [int] Unkown OS value
904 ** @value AJSEQBAM_BGZF_ID1 [int] Bgfz magic id 1 'B'
905 ** @value AJSEQBAM_BGZF_ID2 [int] Bgfz magic id 2 'C'
906 ** @value AJSEQBAM_BGZF_LEN [int] Bgzf length
907 ** @value AJSEQBAM_BGZF_XLEN [int] Bgzf xlength (length plus 4)
908 ** @value AJSEQBAM_GZIP_WINDOW_BITS [int] Gzip window bits (-15 - no zlib header
909 ** @value AJSEQBAM_Z_DEFAULT_MEM_LEVEL [int] Z default memory level
910 ******************************************************************************/
911 
912 static const int AJSEQBAM_DEFAULT_BLOCK_SIZE = 64 * 1024;
913 static const int AJSEQBAM_MAX_BLOCK_SIZE = 64 * 1024;
914 
915 static const int AJSEQBAM_BLOCK_FOOTER_LENGTH = 8;
916 
917 static const int AJSEQBAM_GZIP_ID1 = 31;
918 static const int AJSEQBAM_GZIP_ID2 = 139;
919 static const int AJSEQBAM_CM_DEFLATE = 8;
920 static const int AJSEQBAM_FLG_FEXTRA = 4;
921 static const int AJSEQBAM_OS_UNKNOWN = 255;
922 static const int AJSEQBAM_BGZF_ID1 = 66;
923 static const int AJSEQBAM_BGZF_ID2 = 67;
924 static const int AJSEQBAM_BGZF_LEN = 2;
925 static const int AJSEQBAM_BGZF_XLEN = 6;
926 
927 static const int AJSEQBAM_GZIP_WINDOW_BITS = -15;
928 static const int AJSEQBAM_Z_DEFAULT_MEM_LEVEL = 8;
929 
930 
931 
932 
933 
934 static inline void packInt16(unsigned char* buffer, ajushort value);
935 
936 static inline void packInt32(unsigned char* buffer, ajuint value);
937 static inline int  unpackInt16(const unsigned char* buffer);
938 static inline int  BAMBGZFMIN(int x, int y);
939 
940 
941 
942 
943 /* @funcstatic packInt16 *****************************************************
944 **
945 ** Pack a 2-byte integer
946 **
947 ** @param [u] buffer [unsigned char*] Buffer of at least 2 bytes
948 ** @param [r] value [ajushort] Unsigned integer value
949 ** @return [void]
950 ******************************************************************************/
951 
packInt16(unsigned char * buffer,ajushort value)952 static inline void packInt16(unsigned char* buffer, ajushort value)
953 {
954     buffer[0] = (unsigned char) value;
955     buffer[1] = value >> 8;
956 }
957 
958 
959 
960 
961 /* @funcstatic unpackInt16 ****************************************************
962 **
963 ** Unpack a 2-byte integer
964 **
965 ** @param [r] buffer [const unsigned char*] Buffer of at least 2 bytes
966 ** @return [int] Integer value from first 2 bytes of buffer
967 **
968 ******************************************************************************/
969 
unpackInt16(const unsigned char * buffer)970 static inline int unpackInt16(const unsigned char* buffer)
971 {
972     return (buffer[0] | (buffer[1] << 8));
973 }
974 
975 
976 
977 
978 
979 /* @funcstatic packInt32 *****************************************************
980 **
981 ** Pack a 4-byte integer
982 **
983 ** @param [u] buffer [unsigned char*] Buffer of at least 4 bytes
984 ** @param [r] value [ajuint] Unsigned integer value
985 ** @return [void]
986 ******************************************************************************/
987 
packInt32(unsigned char * buffer,ajuint value)988 static inline void packInt32(unsigned char* buffer, ajuint value)
989 {
990     buffer[0] = value;
991     buffer[1] = value >> 8;
992     buffer[2] = value >> 16;
993     buffer[3] = value >> 24;
994 }
995 
996 
997 
998 
999 
1000 /* @funcstatic BAMBGZFMIN ****************************************************
1001 **
1002 ** Return minimum of two integers
1003 **
1004 ** @param [r] x [int] First integer
1005 ** @param [r] y [int] Second integer
1006 ** @return [int] Lowest integer value
1007 ******************************************************************************/
1008 
BAMBGZFMIN(int x,int y)1009 static inline int BAMBGZFMIN(int x, int y)
1010 {
1011     return (x < y) ? x : y;
1012 }
1013 
1014 
1015 
1016 
1017 
1018 /* @funcstatic bamReportError *************************************************
1019 **
1020 ** Save error message in the BGZ file error attribute
1021 **
1022 ** @param [u] fp [AjPSeqBamBgzf] BGZ file
1023 ** @param [r] message [const char*] message text
1024 ** @return [void]
1025 **
1026 **
1027 ** @release 6.3.0
1028 ******************************************************************************/
1029 
bamReportError(AjPSeqBamBgzf fp,const char * message)1030 static void bamReportError(AjPSeqBamBgzf fp, const char* message)
1031 {
1032     ajWarn("++bamReportError '%s'", message);
1033     fp->error = message;
1034 
1035     return;
1036 }
1037 
1038 
1039 
1040 
1041 /* @funcstatic bamBgzfNew *****************************************************
1042 **
1043 ** Initialize a BGZ file object
1044 **
1045 ** @return [AjPSeqBamBgzf] BGZ file object
1046 **
1047 **
1048 ** @release 6.3.0
1049 ******************************************************************************/
1050 
bamBgzfNew(void)1051 static AjPSeqBamBgzf bamBgzfNew(void)
1052 {
1053     AjPSeqBamBgzf fp;
1054 
1055     fp = calloc(1, sizeof(AjOSeqBamBgzf));
1056     fp->uncompressed_block_size = AJSEQBAM_MAX_BLOCK_SIZE;
1057     fp->uncompressed_block = malloc(AJSEQBAM_MAX_BLOCK_SIZE);
1058     fp->compressed_block_size = AJSEQBAM_MAX_BLOCK_SIZE;
1059     fp->compressed_block = malloc(AJSEQBAM_MAX_BLOCK_SIZE);
1060     fp->cache_size = 0;
1061     fp->cache = NULL;
1062 
1063     return fp;
1064 }
1065 
1066 
1067 
1068 
1069 /* @funcstatic bamBgzfOpenfdRead **********************************************
1070 **
1071 ** Open a BGZ file for reading
1072 **
1073 ** @param [r] fd [int] File descriptor of opened file
1074 ** @return [AjPSeqBamBgzf] BGZ file object
1075 **
1076 **
1077 ** @release 6.3.0
1078 ******************************************************************************/
1079 
bamBgzfOpenfdRead(int fd)1080 static AjPSeqBamBgzf bamBgzfOpenfdRead(int fd)
1081 {
1082     FILE* file;
1083     AjPSeqBamBgzf fp;
1084 
1085     file = ajSysFuncFdopen(fd, "r");
1086 
1087     if(file == 0)
1088         return 0;
1089 
1090     fp = bamBgzfNew();
1091     fp->file_descriptor = fd;
1092     fp->open_mode = 'r';
1093     fp->file = file;
1094     fp->cache = ajTableNew(512);
1095 
1096     fseek(fp->file, 0L, SEEK_SET);
1097 
1098     return fp;
1099 }
1100 
1101 
1102 
1103 
1104 /* @funcstatic bamBgzfOpenfdWrite *********************************************
1105 **
1106 ** Open a BGZ file for writing
1107 **
1108 ** @param [r] fd [int] File descriptor of opened file
1109 ** @param [r] is_uncompressed [char] If non-zero, file is uncompressed
1110 ** @return [AjPSeqBamBgzf] BGZ file object
1111 **
1112 **
1113 ** @release 6.3.0
1114 ******************************************************************************/
1115 
bamBgzfOpenfdWrite(int fd,char is_uncompressed)1116 static AjPSeqBamBgzf bamBgzfOpenfdWrite(int fd, char is_uncompressed)
1117 {
1118     FILE* file;
1119     AjPSeqBamBgzf fp;
1120 
1121     file = ajSysFuncFdopen(fd, "w");
1122 
1123     if(file == 0)
1124         return 0;
1125 
1126     fp = malloc(sizeof(AjOSeqBamBgzf));
1127     fp->file_descriptor = fd;
1128     fp->open_mode = 'w';
1129     fp->owned_file = 0;
1130     fp->is_uncompressed = is_uncompressed;
1131     fp->file = file;
1132     fp->uncompressed_block_size = AJSEQBAM_DEFAULT_BLOCK_SIZE;
1133     fp->uncompressed_block = NULL;
1134     fp->compressed_block_size = AJSEQBAM_MAX_BLOCK_SIZE;
1135     fp->compressed_block = malloc(AJSEQBAM_MAX_BLOCK_SIZE);
1136     fp->block_address = 0;
1137     fp->block_offset = 0;
1138     fp->block_length = 0;
1139     fp->error = NULL;
1140 
1141     return fp;
1142 }
1143 
1144 
1145 
1146 
1147 /* @func ajSeqBamBgzfOpenC ****************************************************
1148 **
1149 ** Open a named BGZ file for reading or writing
1150 **
1151 ** @param [r] path [const char*] File path
1152 ** @param [r] mode [const char*] File open mode 'r' 'R' 'w' or 'W'
1153 ** @return [AjPSeqBamBgzf] BGZ file object
1154 **
1155 **
1156 ** @release 6.3.0
1157 ******************************************************************************/
1158 
ajSeqBamBgzfOpenC(const char * path,const char * mode)1159 AjPSeqBamBgzf ajSeqBamBgzfOpenC(const char* path, const char*  mode)
1160 {
1161     AjPSeqBamBgzf fp = NULL;
1162     int fd;
1163     int oflag;
1164 
1165     /* The reading mode is preferred. */
1166     if(mode[0] == 'r' || mode[0] == 'R')
1167     {
1168         oflag = O_RDONLY;
1169 #ifdef WIN32
1170         oflag |= O_BINARY;
1171 #endif /* WIN32 */
1172         fd = open(path, oflag);
1173 
1174         if(fd == -1)
1175             return 0;
1176 
1177         fp = bamBgzfOpenfdRead(fd);
1178     }
1179     else if(mode[0] == 'w' || mode[0] == 'W')
1180     {
1181         oflag = O_WRONLY | O_CREAT | O_TRUNC;
1182 #ifdef WIN32
1183         oflag |= O_BINARY;
1184 #endif /* WIN32 */
1185         fd = open(path, oflag, 0666);
1186 
1187         if(fd == -1)
1188             return 0;
1189 
1190         fp = bamBgzfOpenfdWrite(fd, strstr(mode, "u")? 1 : 0);
1191     }
1192 
1193     if(fp != NULL)
1194         fp->owned_file = 1;
1195 
1196     return fp;
1197 }
1198 
1199 
1200 
1201 
1202 /* @func ajSeqBamBgzfOpenfd ***************************************************
1203 **
1204 ** Open a BGZip BAM file by file descriptor
1205 **
1206 ** @param [r] fd [int] open file descriptor
1207 ** @param [r] mode [const char*] File open mode 'r' 'R' 'w' or 'W'
1208 ** @return [AjPSeqBamBgzf] BGZip BAM file
1209 **
1210 **
1211 ** @release 6.3.0
1212 ******************************************************************************/
1213 
ajSeqBamBgzfOpenfd(int fd,const char * mode)1214 AjPSeqBamBgzf ajSeqBamBgzfOpenfd(int fd, const char * mode)
1215 {
1216     if(fd == -1)
1217         return 0;
1218 
1219     if (mode[0] == 'r' || mode[0] == 'R')
1220         return bamBgzfOpenfdRead(fd);
1221     else if (mode[0] == 'w' || mode[0] == 'W')
1222         return bamBgzfOpenfdWrite(fd, strstr(mode, "u")? 1 : 0);
1223 
1224     return NULL;
1225 }
1226 
1227 
1228 
1229 
1230 /* @func ajSeqBamBgzfSetInfile ************************************************
1231 **
1232 ** Sets data structure to read from an open BGZip BAM file
1233 **
1234 ** @param [u] gzfile [AjPSeqBamBgzf] BGZip BAM file
1235 ** @param [u] inf [AjPFile] Open input file object
1236 ** @return [AjBool] True on success
1237 ** @release 6.5.0
1238 ******************************************************************************/
1239 
ajSeqBamBgzfSetInfile(AjPSeqBamBgzf gzfile,AjPFile inf)1240 AjBool ajSeqBamBgzfSetInfile(AjPSeqBamBgzf gzfile, AjPFile inf)
1241 {
1242     if(!inf)
1243         return ajFalse;
1244 
1245     gzfile->file_descriptor = fileno(ajFileGetFileptr(inf));
1246     gzfile->open_mode = 'r';
1247     gzfile->owned_file = 0;
1248     gzfile->is_uncompressed = '\0';
1249     gzfile->file = ajFileGetFileptr(inf);
1250     gzfile->uncompressed_block_size = AJSEQBAM_DEFAULT_BLOCK_SIZE;
1251     gzfile->uncompressed_block = NULL;
1252     gzfile->compressed_block_size = AJSEQBAM_MAX_BLOCK_SIZE;
1253     gzfile->compressed_block = malloc(AJSEQBAM_MAX_BLOCK_SIZE);
1254     gzfile->block_address = 0;
1255     gzfile->block_offset = 0;
1256     gzfile->block_length = 0;
1257     gzfile->error = NULL;
1258 
1259     return ajTrue;
1260 }
1261 
1262 
1263 
1264 
1265 /* @func ajSeqBamBgzfSetOutfile ***********************************************
1266 **
1267 ** Sets data structure to write to an open BGZip BAM file
1268 **
1269 ** @param [u] gzfile [AjPSeqBamBgzf] BGZip BAM file
1270 ** @param [u] outf [AjPFile] Open output file object
1271 ** @return [AjBool] True on success
1272 ** @release 6.5.0
1273 ******************************************************************************/
1274 
ajSeqBamBgzfSetOutfile(AjPSeqBamBgzf gzfile,AjPFile outf)1275 AjBool ajSeqBamBgzfSetOutfile(AjPSeqBamBgzf gzfile, AjPFile outf)
1276 {
1277     if(!outf)
1278         return ajFalse;
1279 
1280     gzfile->file_descriptor = fileno(ajFileGetFileptr(outf));
1281     gzfile->open_mode = 'w';
1282     gzfile->owned_file = 0;
1283     gzfile->is_uncompressed = '\0';
1284     gzfile->file = ajFileGetFileptr(outf);
1285     gzfile->uncompressed_block_size = AJSEQBAM_DEFAULT_BLOCK_SIZE;
1286     gzfile->uncompressed_block = NULL;
1287     gzfile->compressed_block_size = AJSEQBAM_MAX_BLOCK_SIZE;
1288     gzfile->compressed_block = malloc(AJSEQBAM_MAX_BLOCK_SIZE);
1289     gzfile->block_address = 0;
1290     gzfile->block_offset = 0;
1291     gzfile->block_length = 0;
1292     gzfile->error = NULL;
1293 
1294     return ajTrue;
1295 }
1296 
1297 
1298 
1299 
1300 /* @func ajSeqBamBgzfNew *****************************************************
1301 **
1302 ** Constructs a BGZip BAM file by file
1303 **
1304 ** @param [u] file [FILE*] open file
1305 ** @param [r] mode [const char*] File open mode 'r' 'R' 'w' or 'W'
1306 ** @return [AjPSeqBamBgzf] BGZip BAM file
1307 **
1308 **
1309 ** @release 6.5.0
1310 ******************************************************************************/
1311 
ajSeqBamBgzfNew(FILE * file,const char * mode)1312 AjPSeqBamBgzf ajSeqBamBgzfNew(FILE* file, const char*  mode)
1313 {
1314     AjPSeqBamBgzf fp;
1315 
1316     if(file == 0)
1317         return 0;
1318 
1319     fp = bamBgzfNew();
1320     fp->file_descriptor = fileno(file);
1321 
1322     if(mode[0] == 'r' || mode[0] == 'R')
1323     {
1324 	fp->open_mode = 'r';
1325 	fp->cache = ajTableNew(512);
1326     }
1327     else
1328 	fp->open_mode = 'w';
1329 
1330     fp->file = file;
1331     fseek(fp->file, 0L, SEEK_SET);
1332 
1333     return fp;
1334 }
1335 
1336 
1337 
1338 
1339 /* @funcstatic bamDeflateBlock ************************************************
1340 **
1341 ** Deflate an uncompressed block
1342 **
1343 ** @param [u] fp  [AjPSeqBamBgzf] BGZ file
1344 ** @param [r] block_length [int] Uncompressed block length
1345 ** @return [int] Length of compressed block
1346 **
1347 **
1348 ** @release 6.3.0
1349 ******************************************************************************/
1350 
bamDeflateBlock(AjPSeqBamBgzf fp,int block_length)1351 static int bamDeflateBlock(AjPSeqBamBgzf fp, int block_length)
1352 {
1353     /*
1354     ** Deflate the block in fp->uncompressed_block into fp->compressed_block.
1355     ** Also adds an extra field that stores the compressed block length.
1356     */
1357 
1358     char *buffer = NULL;
1359     int buffer_size;
1360     int input_length;
1361     int compressed_length;
1362     int compress_level;
1363     int status;
1364     z_stream zs;
1365     ajuint crc;
1366     int remaining;
1367 
1368     buffer      = fp->compressed_block;
1369     buffer_size = fp->compressed_block_size;
1370 
1371     /* Init gzip header */
1372     buffer[0] = AJSEQBAM_GZIP_ID1;
1373     buffer[1] = AJSEQBAM_GZIP_ID2;
1374     buffer[2] = AJSEQBAM_CM_DEFLATE;
1375     buffer[3] = AJSEQBAM_FLG_FEXTRA;
1376     buffer[4] = 0; /* mtime */
1377     buffer[5] = 0;
1378     buffer[6] = 0;
1379     buffer[7] = 0;
1380     buffer[8] = 0;
1381     buffer[9] = AJSEQBAM_OS_UNKNOWN;
1382     buffer[10] = AJSEQBAM_BGZF_XLEN;
1383     buffer[11] = 0;
1384     buffer[12] = AJSEQBAM_BGZF_ID1;
1385     buffer[13] = AJSEQBAM_BGZF_ID2;
1386     buffer[14] = AJSEQBAM_BGZF_LEN;
1387     buffer[15] = 0;
1388     buffer[16] = 0; /* placeholder for block length */
1389     buffer[17] = 0;
1390 
1391     /* loop to retry for blocks that do not compress enough */
1392     input_length = block_length;
1393     compressed_length = 0;
1394 
1395     while(1)
1396     {
1397 	compress_level = fp->is_uncompressed ? Z_NO_COMPRESSION :
1398 		Z_DEFAULT_COMPRESSION;
1399         zs.zalloc = NULL;
1400         zs.zfree  = NULL;
1401         zs.next_in   = fp->uncompressed_block;
1402         zs.avail_in  = input_length;
1403         zs.next_out  = (void*)&buffer[AJSEQBAM_BLOCK_HEADER_LENGTH];
1404         zs.avail_out = buffer_size - AJSEQBAM_BLOCK_HEADER_LENGTH -
1405             AJSEQBAM_BLOCK_FOOTER_LENGTH;
1406 
1407         status = deflateInit2(&zs, compress_level, Z_DEFLATED,
1408                               AJSEQBAM_GZIP_WINDOW_BITS,
1409                               AJSEQBAM_Z_DEFAULT_MEM_LEVEL,
1410                               Z_DEFAULT_STRATEGY);
1411 
1412         if(status != Z_OK)
1413         {
1414             bamReportError(fp, "deflate init failed");
1415             return -1;
1416         }
1417 
1418         status = deflate(&zs, Z_FINISH);
1419 
1420         if(status != Z_STREAM_END)
1421         {
1422             deflateEnd(&zs);
1423 
1424             if(status == Z_OK)
1425             {
1426                 /*
1427                 ** Not enough space in buffer.
1428                 ** Can happen in the rare case the input doesn't
1429                 ** compress enough.
1430                 ** Reduce the amount of input until it fits.
1431                 */
1432                 input_length -= 1024;
1433 
1434                 if(input_length <= 0)
1435                 {
1436                     /* should never happen */
1437                     bamReportError(fp, "input reduction failed");
1438                     return -1;
1439                 }
1440 
1441                 continue;
1442             }
1443 
1444             bamReportError(fp, "deflate failed");
1445             return -1;
1446         }
1447 
1448         status = deflateEnd(&zs);
1449 
1450         if(status != Z_OK)
1451         {
1452             bamReportError(fp, "deflate end failed");
1453             return -1;
1454         }
1455 
1456         compressed_length = zs.total_out;
1457         compressed_length += AJSEQBAM_BLOCK_HEADER_LENGTH +
1458             AJSEQBAM_BLOCK_FOOTER_LENGTH;
1459 
1460         if(compressed_length > AJSEQBAM_MAX_BLOCK_SIZE)
1461         {
1462             /* should never happen */
1463             bamReportError(fp, "deflate overflow");
1464             return -1;
1465         }
1466 
1467         break;
1468     }
1469 
1470     packInt16((unsigned char*)&buffer[16], compressed_length-1);
1471     crc = crc32(0L, NULL, 0L);
1472     crc = crc32(crc, fp->uncompressed_block, input_length);
1473     packInt32((unsigned char*)&buffer[compressed_length-8], crc);
1474     packInt32((unsigned char*)&buffer[compressed_length-4], input_length);
1475 
1476     remaining = block_length - input_length;
1477 
1478     if(remaining > 0)
1479     {
1480         if(remaining > input_length)
1481         {
1482             /* should never happen (check so we can use memcpy) */
1483             bamReportError(fp, "remainder too large");
1484             return -1;
1485         }
1486 
1487         memcpy((unsigned char*)fp->uncompressed_block,
1488                (unsigned char*)fp->uncompressed_block + input_length,
1489                remaining);
1490     }
1491 
1492     fp->block_offset = remaining;
1493 
1494     return compressed_length;
1495 }
1496 
1497 
1498 
1499 
1500 /* @funcstatic bamInflateBlock ************************************************
1501 **
1502 ** Uncompress a compressed block
1503 **
1504 ** @param [u] fp [AjPSeqBamBgzf] BGZ file object
1505 ** @param [r] block_length  [int] Compressed block length
1506 ** @return [int] Uncompressed block length
1507 **
1508 **
1509 ** @release 6.3.0
1510 ******************************************************************************/
1511 
bamInflateBlock(AjPSeqBamBgzf fp,int block_length)1512 static int bamInflateBlock(AjPSeqBamBgzf fp, int block_length)
1513 {
1514     /* Inflate the block in fp->compressed_block into fp->uncompressed_block */
1515     int status;
1516     z_stream zs;
1517 
1518     zs.zalloc = NULL;
1519     zs.zfree = NULL;
1520     zs.next_in = (unsigned char*)fp->compressed_block + 18;
1521     zs.avail_in = block_length - 16;
1522     zs.next_out = fp->uncompressed_block;
1523     zs.avail_out = fp->uncompressed_block_size;
1524 
1525     status = inflateInit2(&zs, AJSEQBAM_GZIP_WINDOW_BITS);
1526 
1527     if(status != Z_OK)
1528     {
1529         bamReportError(fp, "inflate init failed");
1530         return -1;
1531     }
1532 
1533     status = inflate(&zs, Z_FINISH);
1534 
1535     if(status != Z_STREAM_END)
1536     {
1537         inflateEnd(&zs);
1538         bamReportError(fp, "inflate failed");
1539         return -1;
1540     }
1541 
1542     status = inflateEnd(&zs);
1543 
1544     if(status != Z_OK)
1545     {
1546         bamReportError(fp, "inflate failed");
1547         return -1;
1548     }
1549 
1550     return zs.total_out;
1551 }
1552 
1553 
1554 
1555 
1556 /* @funcstatic bamHeaderCheck *************************************************
1557 **
1558 ** Check a BAM file header
1559 **
1560 ** @param [r] header [const char*] Header buffer
1561 ** @return [int] 0 on success
1562 **
1563 **
1564 ** @release 6.3.0
1565 ******************************************************************************/
1566 
bamHeaderCheck(const char * header)1567 static int bamHeaderCheck(const char *header)
1568 {
1569     return (header[0] == AJSEQBAM_GZIP_ID1 &&
1570             header[1] == (char) AJSEQBAM_GZIP_ID2 &&
1571             header[2] == Z_DEFLATED &&
1572             (header[3] & AJSEQBAM_FLG_FEXTRA) != 0 &&
1573             unpackInt16((const unsigned char*)&header[10]) ==
1574             AJSEQBAM_BGZF_XLEN &&
1575             header[12] == AJSEQBAM_BGZF_ID1 &&
1576             header[13] == AJSEQBAM_BGZF_ID2 &&
1577             unpackInt16((const unsigned char*)&header[14]) ==
1578             AJSEQBAM_BGZF_LEN);
1579 }
1580 
1581 
1582 
1583 
1584 /* @funcstatic bamCacheFree ***************************************************
1585 **
1586 ** Free BAM cache data
1587 **
1588 ** @param [u] fp [AjPSeqBamBgzf] BGZ file object
1589 ** @return [void]
1590 **
1591 **
1592 ** @release 6.3.0
1593 ******************************************************************************/
1594 
bamCacheFree(AjPSeqBamBgzf fp)1595 static void bamCacheFree(AjPSeqBamBgzf fp)
1596 {
1597 
1598     /* if fp->open_mode is not 'r' then fp->cache is expected to be null */
1599 
1600     if(!fp->cache)
1601         return;
1602 
1603     ajTableDel(&fp->cache);
1604 
1605     return;
1606 }
1607 
1608 
1609 
1610 
1611 /* @funcstatic bamCacheLoadBlock **********************************************
1612 **
1613 ** Load a block from cache
1614 **
1615 ** @param [u] fp [AjPSeqBamBgzf] BGZ file object
1616 ** @param [r] block_address [ajlong] Block offset in file
1617 ** @return [int] Size of block
1618 **
1619 **
1620 ** @release 6.3.0
1621 ******************************************************************************/
1622 
bamCacheLoadBlock(AjPSeqBamBgzf fp,ajlong block_address)1623 static int bamCacheLoadBlock(AjPSeqBamBgzf fp, ajlong block_address)
1624 {
1625 	const BamPCache p;
1626 
1627 	p = ajTableFetchV(fp->cache, &block_address);
1628 
1629 	if(!p)
1630             return 0;
1631 
1632 	if(fp->block_length != 0)
1633             fp->block_offset = 0;
1634 
1635 	fp->block_address = block_address;
1636 	fp->block_length = p->size;
1637 	memcpy(fp->uncompressed_block, p->block, AJSEQBAM_MAX_BLOCK_SIZE);
1638 	fseek(fp->file, p->end_offset, SEEK_SET);
1639 
1640 	return p->size;
1641 }
1642 
1643 
1644 
1645 
1646 /* @funcstatic bamCacheBlock **************************************************
1647 **
1648 ** add a block to the cache
1649 ** remove an old block if it exceeds maximum
1650 **
1651 ** @param [u] fp [AjPSeqBamBgzf] BGZ file
1652 ** @param [r] size [int] Cache size of data from current position
1653 ** @return [void]
1654 **
1655 **
1656 ** @release 6.3.0
1657 ******************************************************************************/
1658 
bamCacheBlock(AjPSeqBamBgzf fp,int size)1659 static void bamCacheBlock(AjPSeqBamBgzf fp, int size)
1660 {
1661     BamPCache p;
1662     BamPCache oldp;
1663 
1664     if(AJSEQBAM_MAX_BLOCK_SIZE >= fp->cache_size)
1665         return;
1666 
1667     if((ajint) ajTableGetLength(fp->cache) * AJSEQBAM_MAX_BLOCK_SIZE >
1668        fp->cache_size)
1669     {
1670         /*
1671         ** A better way would be to remove the oldest block in the
1672         ** cache, but here a random one is removed for simplicity. This
1673         ** should not have a big impact on performance.
1674         */
1675 
1676         /* some way to remove a value - or half the values - from the table */
1677     }
1678 
1679     AJNEW0(p);
1680     p->size = fp->block_length;
1681     p->end_offset = fp->block_address + size;
1682     p->block = malloc(AJSEQBAM_MAX_BLOCK_SIZE);
1683     memcpy(p->block, fp->uncompressed_block, AJSEQBAM_MAX_BLOCK_SIZE);
1684     oldp = ajTablePut(fp->cache, &fp->block_address, p);
1685 
1686     if(oldp)
1687     {
1688         AJFREE(oldp->block);
1689         AJFREE(oldp);
1690     }
1691 
1692     return;
1693 }
1694 
1695 
1696 
1697 
1698 /* @funcstatic bamReadBlock ***************************************************
1699 **
1700 ** Read the next block of data from a BGZ file
1701 **
1702 ** @param [u] fp [AjPSeqBamBgzf] BGZ file handler
1703 ** @return [int] Zeo on success
1704 **
1705 **
1706 ** @release 6.3.0
1707 ******************************************************************************/
1708 
bamReadBlock(AjPSeqBamBgzf fp)1709 static int bamReadBlock(AjPSeqBamBgzf fp)
1710 {
1711     char header[AJSEQBAM_BLOCK_HEADER_LENGTH];
1712     int size = 0;
1713     ajlong block_address;
1714     int count;
1715     int block_length;
1716     char *compressed_block;
1717     int remaining;
1718 
1719     block_address = ftell(fp->file);
1720 
1721     if(bamCacheLoadBlock(fp, block_address))
1722         return 0;
1723 
1724     count = fread(header, 1, sizeof(header), fp->file);
1725 
1726     if(count == 0)
1727     {
1728         fp->block_length = 0;
1729         return 0;
1730     }
1731 
1732     size = count;
1733 
1734     if ((unsigned int) count != sizeof(header))
1735     {
1736         bamReportError(fp, "read failed");
1737         return -1;
1738     }
1739 
1740     if(!bamHeaderCheck(header))
1741     {
1742       /*bamReportError(fp, "invalid block header");*/
1743         return -1;
1744     }
1745 
1746     block_length = unpackInt16((unsigned char*)&header[16]) + 1;
1747     compressed_block = (char*) fp->compressed_block;
1748     memcpy(compressed_block, header, AJSEQBAM_BLOCK_HEADER_LENGTH);
1749     remaining = block_length - AJSEQBAM_BLOCK_HEADER_LENGTH;
1750     count = fread(&compressed_block[AJSEQBAM_BLOCK_HEADER_LENGTH], 1,
1751                   remaining, fp->file);
1752 
1753     if(count != remaining)
1754     {
1755         bamReportError(fp, "read failed");
1756         return -1;
1757     }
1758 
1759     size += count;
1760     count = bamInflateBlock(fp, block_length);
1761 
1762     if(count < 0)
1763         return -1;
1764 
1765     if(fp->block_length != 0)
1766     {
1767         /* Do not reset offset if this read follows a seek. */
1768         fp->block_offset = 0;
1769     }
1770 
1771     fp->block_address = block_address;
1772     fp->block_length = count;
1773     bamCacheBlock(fp, size);
1774 
1775     return 0;
1776 }
1777 
1778 
1779 
1780 
1781 /* @func ajSeqBamBgzfRead *****************************************************
1782 **
1783 ** Read a BAM file data record of a given length
1784 **
1785 ** @param [u] fp [AjPSeqBamBgzf] BGZ file object
1786 ** @param [u] data [void*] Buffer
1787 ** @param [r] length [int] length of buffer
1788 ** @return [int] Number of bytes read
1789 **               -1 on error
1790 **
1791 ** @release 6.3.0
1792 ******************************************************************************/
1793 
ajSeqBamBgzfRead(AjPSeqBamBgzf fp,void * data,int length)1794 int ajSeqBamBgzfRead(AjPSeqBamBgzf fp, void *data, int length)
1795 {
1796     int bytes_read;
1797     char *output;
1798     int available;
1799     int copy_length;
1800     char *buffer;
1801 
1802     if(length <= 0)
1803         return 0;
1804 
1805     if(fp->open_mode != 'r')
1806     {
1807         bamReportError(fp, "file not open for reading");
1808         return -1;
1809     }
1810 
1811     bytes_read = 0;
1812     output = data;
1813 
1814     while(bytes_read < length)
1815     {
1816         available = fp->block_length - fp->block_offset;
1817 
1818         if(available <= 0)
1819         {
1820             if(bamReadBlock(fp) != 0)
1821             {
1822                 ajDebug("bamReadBlock failed\n");
1823                 return -1;
1824             }
1825 
1826             available = fp->block_length - fp->block_offset;
1827 
1828             if(available <= 0)
1829                 break;
1830         }
1831 
1832         copy_length = BAMBGZFMIN(length-bytes_read, available);
1833         buffer = fp->uncompressed_block;
1834         memcpy(output, buffer + fp->block_offset, copy_length);
1835         fp->block_offset += copy_length;
1836         output += copy_length;
1837         bytes_read += copy_length;
1838     }
1839 
1840     if(fp->block_offset == fp->block_length)
1841     {
1842         fp->block_address = ftell(fp->file);
1843         fp->block_offset = 0;
1844         fp->block_length = 0;
1845     }
1846 
1847     return bytes_read;
1848 }
1849 
1850 
1851 
1852 
1853 /* @func ajSeqBamBgzfFlush ****************************************************
1854 **
1855 ** Flush block to output file
1856 **
1857 ** @param [u] fp [AjPSeqBamBgzf] Output file
1858 ** @return [int] 0 on success, -1 on failure
1859 **
1860 ** @release 6.3.0
1861 ******************************************************************************/
1862 
ajSeqBamBgzfFlush(AjPSeqBamBgzf fp)1863 int ajSeqBamBgzfFlush(AjPSeqBamBgzf fp)
1864 {
1865     int count;
1866     int block_length;
1867 
1868     while (fp->block_offset > 0)
1869     {
1870         block_length = bamDeflateBlock(fp, fp->block_offset);
1871 
1872         if (block_length < 0)
1873         {
1874             return -1;
1875         }
1876 
1877         count = fwrite(fp->compressed_block, 1, block_length, fp->file);
1878 
1879         if (count != block_length)
1880         {
1881             bamReportError(fp, "write failed");
1882             return -1;
1883         }
1884 
1885         fp->block_address += block_length;
1886     }
1887 
1888     return 0;
1889 }
1890 
1891 
1892 
1893 
1894 /* @func ajSeqBamBgzfWrite ****************************************************
1895 **
1896 ** Write length bytes from data to the file.
1897 **
1898 ** @param [u] fp [AjPSeqBamBgzf] BAM file
1899 ** @param [r] data [const void*] data buffer
1900 ** @param [r] length [int] Length of data buffer to be written
1901 ** @return [int] Returns the number of bytes written.
1902 **               Returns -1 on error.
1903 **
1904 ** @release 6.3.0
1905 ******************************************************************************/
1906 
ajSeqBamBgzfWrite(AjPSeqBamBgzf fp,const void * data,int length)1907 int ajSeqBamBgzfWrite(AjPSeqBamBgzf fp, const void* data, int length)
1908 {
1909     const char *input;
1910     int block_length;
1911     int bytes_written;
1912     int copy_length;
1913     char *buffer;
1914 
1915     if(fp->open_mode != 'w')
1916     {
1917         bamReportError(fp, "file not open for writing");
1918         return -1;
1919     }
1920 
1921     if(fp->uncompressed_block == NULL)
1922         fp->uncompressed_block = malloc(fp->uncompressed_block_size);
1923 
1924     input = data;
1925     block_length = fp->uncompressed_block_size;
1926     bytes_written = 0;
1927 
1928     while(bytes_written < length)
1929     {
1930         copy_length = BAMBGZFMIN(block_length - fp->block_offset,
1931                                  length - bytes_written);
1932 
1933         buffer = fp->uncompressed_block;
1934         memcpy(buffer + fp->block_offset, input, copy_length);
1935         fp->block_offset += copy_length;
1936         input += copy_length;
1937         bytes_written += copy_length;
1938 
1939         if(fp->block_offset == block_length)
1940         {
1941             if(ajSeqBamBgzfFlush(fp) != 0)
1942                 break;
1943         }
1944     }
1945 
1946     return bytes_written;
1947 }
1948 
1949 
1950 
1951 
1952 /* @func ajSeqBamBgzfClose ****************************************************
1953 **
1954 ** Close a BGZ file and free all associated resources.
1955 **
1956 ** Does not close the underlying file descriptor if created with bgzf_fdopen.
1957 **
1958 ** @param [u] fp [AjPSeqBamBgzf] BGZ file
1959 ** @return [int] Returns zero on success, -1 on error.
1960 **
1961 **
1962 ** @release 6.3.0
1963 ******************************************************************************/
1964 
ajSeqBamBgzfClose(AjPSeqBamBgzf fp)1965 int ajSeqBamBgzfClose(AjPSeqBamBgzf fp)
1966 {
1967     int block_length;
1968 
1969     if(!fp) return 0;
1970 
1971     if(fp->open_mode == 'w')
1972     {
1973         if(ajSeqBamBgzfFlush(fp) != 0)
1974             return -1;
1975         /* add an empty last block */
1976         block_length = bamDeflateBlock(fp, 0);
1977         fwrite(fp->compressed_block, 1, block_length, fp->file);
1978 
1979         if(fflush(fp->file) != 0)
1980         {
1981             bamReportError(fp, "flush failed");
1982             return -1;
1983         }
1984     }
1985 
1986     if(fp->owned_file)
1987     {
1988         if(fclose(fp->file) != 0)
1989             return -1;
1990     }
1991 
1992     free(fp->uncompressed_block);
1993     free(fp->compressed_block);
1994     bamCacheFree(fp);
1995     free(fp);
1996 
1997     return 0;
1998 }
1999 
2000 
2001 
2002 
2003 /* @func ajSeqBamBgzfEof ******************************************************
2004 **
2005 ** Check for end of file marker for a BGzip BAM file
2006 **
2007 ** @param [u] fp [AjPSeqBamBgzf] Undocumented
2008 ** @return [int] 1 if we are at the end of file mark
2009 **               0 if we are higher in the file
2010 **               -1 for an error
2011 **
2012 **
2013 ** @release 6.3.0
2014 ******************************************************************************/
2015 
ajSeqBamBgzfEof(AjPSeqBamBgzf fp)2016 int ajSeqBamBgzfEof(AjPSeqBamBgzf fp)
2017 {
2018     static unsigned char magic[28] = "\037\213\010\4\0\0\0\0\0\377\6\0"
2019         "\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0";
2020     unsigned char buf[28];
2021     ajint offset;
2022     int status;
2023 
2024     offset = ftell(fp->file);
2025 
2026     status = fseek(fp->file, -28, SEEK_END);
2027     if(status != 0)
2028     {
2029         ajDebug("seqBamBgzfEof seek end-28 failed errno %d: %s\n",
2030                 errno, strerror(errno));
2031         return -1;
2032     }
2033 
2034     /* Ubuntu warns if retval of fread is not tested */
2035     if(!fread(buf, 1, 28, fp->file))
2036     {
2037         ajDebug("seqBamBgzfEof fread failed errno %d: %s\n",
2038                 errno, strerror(errno));
2039     }
2040 
2041     status = fseek(fp->file, offset, SEEK_SET); /* back where we started */
2042     if(status != 0)
2043     {
2044         ajDebug("seqBamBgzfEof seek %d failed errno %d: %s\n",
2045                 offset,  errno, strerror(errno));
2046         return -1;
2047     }
2048 
2049     return (memcmp(magic, buf, 28) == 0) ? 1 : 0;
2050 }
2051 
2052 
2053 
2054 
2055 /* @func ajSeqBamHeaderRead ***************************************************
2056 **
2057 ** Read header section of BAM files
2058 **
2059 ** @param [u] gzfile [AjPSeqBamBgzf] BAM file handler
2060 ** @return [AjPSeqBamHeader] header object
2061 **
2062 ** @release 6.5.0
2063 ** @@
2064 ******************************************************************************/
2065 
ajSeqBamHeaderRead(AjPSeqBamBgzf gzfile)2066 AjPSeqBamHeader ajSeqBamHeaderRead(AjPSeqBamBgzf gzfile)
2067 {
2068     AjPSeqBamHeader header=NULL;
2069     char bambuf [4] = "   ";
2070     char* targetname;
2071     ajint status;
2072     ajuint i;
2073     ajuint ilen;
2074     ajuint ntargets;
2075     ajuint targetlen;
2076     ajuint namelen = 0;
2077 
2078     static AjBool bigendian = ajFalse;
2079 
2080     bigendian = ajUtilGetBigendian();
2081 
2082     status = ajSeqBamBgzfEof(gzfile);
2083     ajDebug("called ajSeqBamBgzfEof status %d errno %d ESPIPE %d\n",
2084             status, errno, ESPIPE);
2085 
2086     if (status < 0 && (errno == ESPIPE))
2087     {
2088 	/*
2089 	 ** If the file is a pipe, checking the EOF marker will *always* fail
2090 	 ** with ESPIPE.  Suppress the error message in this case.
2091 	 */
2092     }
2093     else if (status < 0) /* allow 0 - older BAM files have no EOF block */
2094     {
2095 	/* failed and not a PIPE */
2096 	return NULL;
2097     }
2098 
2099     status = ajSeqBamBgzfRead(gzfile, bambuf, 4);
2100 
2101     ajDebug("called ajSeqBamBgzfRead status %d\n", status);
2102 
2103     if(status < 0)
2104     {
2105 	return NULL;
2106     }
2107 
2108     if (strncmp(bambuf, "BAM\001", 4))
2109     {
2110 	return NULL;
2111     }
2112 
2113     /* BAM header */
2114 
2115     /* read plain text and the number of reference sequences */
2116     status = ajSeqBamBgzfRead(gzfile, &ilen, 4);
2117 
2118     if(status != 4)
2119 	ajErr("seqReadBam failed to read reference length");
2120 
2121     if(bigendian)
2122 	ajByteRevUint(&ilen);
2123 
2124     header = ajSeqBamHeaderNew();
2125 
2126     header->text = ajCharNewRes(ilen+1);
2127     ajSeqBamBgzfRead(gzfile, header->text, ilen);
2128     ajDebug("header plain text %u '%s'\n", ilen, header->text);
2129 
2130     header->dict = bamHeaderParse2(header->text);
2131     header->rg2lib = bamHeaderTotable(header->dict, "RG", "ID", "LB");
2132 
2133     ajSeqBamBgzfRead(gzfile, &ntargets, 4);
2134 
2135     if(bigendian)
2136 	ajByteRevUint(&ntargets);
2137 
2138     ajDebug("bam_header_read: # bam targets = %u\n", ntargets);
2139     header->n_targets = ntargets;
2140 
2141     AJCNEW0(header->target_name, header->n_targets);
2142     AJCNEW0(header->target_len, header->n_targets);
2143 
2144     /* read reference sequence names and lengths */
2145 
2146     for(i=0; i < ntargets; i++)
2147     {
2148 	ajSeqBamBgzfRead(gzfile, &namelen, 4);
2149 
2150 	if(bigendian)
2151 	    ajByteRevUint(&namelen);
2152 
2153 	targetname = ajCharNewRes(namelen+1);
2154 	ajSeqBamBgzfRead(gzfile, targetname, namelen);
2155 	ajSeqBamBgzfRead(gzfile, &targetlen, 4);
2156 
2157 	if(bigendian)
2158 	    ajByteRevUint(&targetlen);
2159 
2160 	header->target_name[i] = targetname;
2161 	header->target_len[i] = targetlen;
2162 
2163 	ajDebug("bam target[%u] %u '%s'\n", i, targetlen, targetname);
2164     }
2165 
2166     return header;
2167 
2168 }
2169 
2170 
2171 
2172 
2173 /* @funcstatic bamDestroyHeaderHash *******************************************
2174 **
2175 ** Free the hash table in a BAM header object
2176 **
2177 ** @param [u] header [AjPSeqBamHeader] Header object
2178 ** @return [void]
2179 **
2180 **
2181 ** @release 6.3.0
2182 ******************************************************************************/
2183 
bamDestroyHeaderHash(AjPSeqBamHeader header)2184 static void bamDestroyHeaderHash(AjPSeqBamHeader header)
2185 {
2186     if(header->hash)
2187         ajTableFree(&header->hash);
2188 
2189     return;
2190 }
2191 
2192 
2193 
2194 
2195 /* @funcstatic bamHeaderFree **************************************************
2196 **
2197 ** Destructor for a BAM header list
2198 **
2199 ** @param [d] header [AjPList*] Header list object
2200 ** @return [void]
2201 **
2202 **
2203 ** @release 6.3.0
2204 ******************************************************************************/
2205 
bamHeaderFree(AjPList * header)2206 static void bamHeaderFree(AjPList *header)
2207 {
2208     BamPHeaderLine hline = NULL;
2209 
2210     while(ajListGetLength(*header))
2211     {
2212         ajListPop(*header, (void**)&hline);
2213 
2214         if(hline)
2215             bamHeaderLineFree(&hline);
2216     }
2217 
2218     ajListFree(header);
2219 
2220     return;
2221 }
2222 
2223 
2224 
2225 
2226 /* @funcstatic bamHeaderLineFree **********************************************
2227 **
2228 ** Free a BAM header line
2229 **
2230 ** @param [d] Phline [BamPHeaderLine*] Header line object
2231 ** @return [void]
2232 **
2233 **
2234 ** @release 6.3.0
2235 ******************************************************************************/
2236 
bamHeaderLineFree(BamPHeaderLine * Phline)2237 static void bamHeaderLineFree(BamPHeaderLine *Phline)
2238 {
2239     BamPHeaderLine hline = *Phline;
2240     AjPList tags = hline->tags;
2241     BamPHeaderTag htag = NULL;
2242 
2243     while(ajListGetLength(tags))
2244     {
2245         ajListPop(tags, (void**)&htag);
2246         AJFREE(htag->value);
2247         AJFREE(htag);
2248     }
2249 
2250     ajListFree(&tags);
2251     AJFREE(*Phline);
2252 
2253     return;
2254 }
2255 
2256 
2257 
2258 
2259 /* @funcstatic bamHeaderParse2 ************************************************
2260 **
2261 ** Parse BAM file header text
2262 **
2263 ** @param [r] headerText [const char*] Header text
2264 ** @return [AjPList] list of header records
2265 **
2266 **
2267 ** @release 6.3.0
2268 ******************************************************************************/
2269 
bamHeaderParse2(const char * headerText)2270 static AjPList bamHeaderParse2(const char *headerText)
2271 {
2272     AjPList hlines = NULL;
2273     BamPHeaderLine hline;
2274     const char *text;
2275     char *buf=NULL;
2276     size_t nbuf = 0;
2277 
2278     if(!headerText)
2279         return 0;
2280 
2281     text = headerText;
2282     hlines = ajListNew();
2283 
2284     while((text=bamNextline(&buf, &nbuf, text)))
2285     {
2286         hline = bamHeaderLineParse(buf);
2287 
2288         if(hline && bamHeaderLineValidate(hline))
2289             ajListPushAppend(hlines, hline);
2290         else
2291         {
2292             if(hline)
2293                 bamHeaderLineFree(&hline);
2294 
2295             bamHeaderFree(&hlines);
2296             AJFREE(buf);
2297 
2298             return NULL;
2299         }
2300     }
2301 
2302     AJFREE(buf);
2303 
2304     return hlines;
2305 }
2306 
2307 
2308 
2309 
2310 /* @funcstatic bamHeaderTotable ***********************************************
2311 **
2312 ** Extract table data from a list of BAM header records
2313 **
2314 ** @param [r] dict [const AjPList] List of header records
2315 ** @param [r] type [const char[2]] Type
2316 ** @param [r] key_tag [const char[2]] Key tag
2317 ** @param [r] value_tag [const char[2]] Value tag
2318 ** @return [AjPTable] Table of BAM header tags and values
2319 **
2320 **
2321 ** @release 6.3.0
2322 ******************************************************************************/
2323 
bamHeaderTotable(const AjPList dict,const char type[2],const char key_tag[2],const char value_tag[2])2324 static AjPTable bamHeaderTotable(const AjPList dict, const char type[2],
2325                                  const char key_tag[2],
2326                                  const char value_tag[2])
2327 {
2328     AjPTable tbl = NULL;
2329     BamPHeaderLine hline = NULL;
2330     AjIList k;
2331     BamPHeaderTag key;
2332     BamPHeaderTag value;
2333 
2334     tbl = ajTablecharNew(512);
2335 
2336     if(dict == 0)       /* return an empty (not null) hash table */
2337         return tbl;
2338 
2339     k = ajListIterNewread(dict);
2340 
2341     while(!ajListIterDone(k))
2342     {
2343         hline = ajListIterGet(k);
2344 
2345         if(hline->type[0]!=type[0] || hline->type[1]!=type[1])
2346             continue;
2347 
2348         key   = bamHeaderLineHasTag(hline,key_tag);
2349         value = bamHeaderLineHasTag(hline,value_tag);
2350 
2351         if(!key || !value)
2352             continue;
2353 
2354         if(ajTablePut(tbl, key->value, value->value))
2355             ajWarn("[bamHeaderTotable] The key %s not unique.\n",
2356                    key->value);
2357     }
2358 
2359     ajListIterDel(&k);
2360 
2361     return tbl;
2362 }
2363 
2364 
2365 
2366 
2367 /* @funcstatic bamNextline ****************************************************
2368 **
2369 ** Mimics the behaviour of getline, except it returns pointer to the
2370 ** next chunk of the text or NULL if everything has been read.
2371 **
2372 ** The lineptr should be freed by the caller. The newline character is
2373 ** stripped.
2374 **
2375 ** @param [u] lineptr [char**] Buffer
2376 ** @param [u] n [size_t*] Buffer current size, can be extended
2377 ** @param [r] text [const char*] Text starting at current position
2378 ** @return [const char*] Next chunk of text, NULL when all is done.
2379 **
2380 **
2381 ** @release 6.3.0
2382 ******************************************************************************/
2383 
bamNextline(char ** lineptr,size_t * n,const char * text)2384 static const char* bamNextline(char **lineptr, size_t *n, const char *text)
2385 {
2386     ajuint len;
2387     const char *to = NULL;
2388 
2389     to = text;
2390 
2391     if(!*to)
2392         return NULL;
2393 
2394     while(*to && *to!='\n' && *to!='\r')
2395         to++;
2396 
2397     len = to - text + 1;
2398 
2399     if(*to)
2400     {
2401         /* Advance the pointer for the next call */
2402         if(*to=='\n')
2403             to++;
2404         else if(*to=='\r' && *(to+1)=='\n')
2405             to+=2;
2406     }
2407 
2408     if(!len)
2409         return to;
2410 
2411     if(!*lineptr)
2412     {
2413         *lineptr = malloc(len);
2414         *n = len;
2415     }
2416     else if(*n<len)
2417     {
2418         *lineptr = realloc(*lineptr, len);
2419         *n = len;
2420     }
2421 
2422     if(!*lineptr)
2423         ajDie("[bamNextline] Insufficient memory!\n");
2424 
2425     memcpy(*lineptr,text,len);
2426     (*lineptr)[len-1] = 0;
2427 
2428     return to;
2429 }
2430 
2431 
2432 
2433 
2434 /* @func ajSeqBamHeaderLineParse **********************************************
2435 **
2436 ** Parse a header line and creates a table with tag-value pairs
2437 ** in the header line
2438 **
2439 ** @param [r] headerLine [const char*] Header line
2440 ** @return [AjPTable] Table with tag-value pairs
2441 **
2442 **
2443 ** @release 6.3.0
2444 ******************************************************************************/
2445 
ajSeqBamHeaderLineParse(const char * headerLine)2446 AjPTable ajSeqBamHeaderLineParse(const char *headerLine)
2447 {
2448     BamPHeaderLine hline=NULL;
2449     AjPTable ret=NULL;
2450 
2451     hline = bamHeaderLineParse(headerLine);
2452 
2453     if(!hline)
2454 	return NULL;
2455 
2456     bamHeaderLineValidate(hline);
2457 
2458     ret = bamHeaderLineToTablemap(hline);
2459 
2460     if(!ret)
2461 	return NULL;
2462 
2463     ajTablePut(ret,
2464 	    ajCharNewC("type"),
2465 	    ajCharNewResLenC(hline->type,3,2));
2466 
2467     bamHeaderLineFree(&hline);
2468 
2469     return ret;
2470 }
2471 
2472 
2473 
2474 
2475 /* @funcstatic bamHeaderLineToTablemap ****************************************
2476 **
2477 ** Looks for a tag key and returns the tags and values in a
2478 ** table
2479 **
2480 ** @param [r] hline [const BamPHeaderLine] Header line
2481 ** @return [AjPTable] Table of tags and values
2482 **
2483 **
2484 ** @release 6.5.0
2485 ******************************************************************************/
2486 
bamHeaderLineToTablemap(const BamPHeaderLine hline)2487 static AjPTable bamHeaderLineToTablemap(const BamPHeaderLine hline)
2488 {
2489     AjPList tags = hline->tags;
2490     BamPHeaderTag tag = NULL;
2491     AjPTable hlinemap = NULL;
2492     AjIList iter = NULL;
2493 
2494     iter = ajListIterNewread(tags);
2495     hlinemap = ajTablecharNew(16);
2496     ajTableSetDestroyvalue(hlinemap, &ajMemFree);
2497 
2498     while(!ajListIterDone(iter))
2499     {
2500         tag = ajListIterGet(iter);
2501 
2502         ajTablePut(hlinemap, ajCharNewResLenC(tag->key,3,2),
2503         	   ajCharNewC(tag->value));
2504     }
2505 
2506     ajListIterDel(&iter);
2507 
2508     return hlinemap;
2509 }
2510 
2511 
2512 
2513 
2514 /* @funcstatic bamHeaderLineParse *********************************************
2515 **
2516 ** Parse a header line and creates a header line object
2517 **
2518 ** @param [r] headerLine [const char*] Header line
2519 ** @return [BamPHeaderLine] Header line object
2520 **
2521 **
2522 ** @release 6.3.0
2523 ******************************************************************************/
2524 
bamHeaderLineParse(const char * headerLine)2525 static BamPHeaderLine bamHeaderLineParse(const char *headerLine)
2526 {
2527     BamPHeaderLine hline;
2528     BamPHeaderTag tag;
2529     const char *from, *to;
2530     int itype;
2531 
2532     from = headerLine;
2533 
2534     if(*from != '@')
2535     {
2536         ajWarn("[bamHeaderLineParse] expected '@', got [%s]\n", headerLine);
2537         return 0;
2538     }
2539 
2540     to = ++from;
2541 
2542     while(*to && *to!='\t')
2543         to++;
2544 
2545     if(to-from != 2)
2546     {
2547         ajWarn("[bamHeaderLineParse] expected '@XY', got [%s]\n", headerLine);
2548         return 0;
2549     }
2550 
2551     AJNEW(hline);
2552     hline->type[0] = from[0];
2553     hline->type[1] = from[1];
2554     hline->tags = NULL;
2555 
2556     itype = bamTagExists(hline->type, types);
2557 
2558     from = to;
2559 
2560     while(*to && *to=='\t')
2561         to++;
2562 
2563     if(to-from != 1)
2564     {
2565         ajWarn("[bamHeaderLineParse] multiple tabs on line [%s] (%d)\n",
2566                headerLine,(int)(to-from));
2567         return 0;
2568     }
2569 
2570     from = to;
2571 
2572     while(*from)
2573     {
2574         while(*to && *to!='\t')
2575             to++;
2576 
2577         if(!required_tags[itype] && !optional_tags[itype])
2578             tag = bamTagNew("  ",from,to-1);
2579         else
2580             tag = bamTagNew(from,from+3,to-1);
2581 
2582         if(bamHeaderLineHasTag(hline,tag->key))
2583             ajWarn("The tag '%c%c' is present (at least) twice on line [%s]\n",
2584                    tag->key[0],tag->key[1], headerLine);
2585 
2586         if(!hline->tags)
2587             hline->tags = ajListNew();
2588 
2589         ajListPushAppend(hline->tags, tag);
2590 
2591         from = to;
2592 
2593         while(*to && *to=='\t')
2594             to++;
2595 
2596         if(*to && to-from != 1)
2597         {
2598             ajWarn("[bamHeaderLineParse] multiple tabs on line [%s] (%d)\n",
2599                    headerLine,(int)(to-from));
2600             return 0;
2601         }
2602 
2603         from = to;
2604     }
2605 
2606     return hline;
2607 }
2608 
2609 
2610 
2611 
2612 /* @funcstatic bamHeaderLineValidate ******************************************
2613 **
2614 ** Validate a BAM header record
2615 **
2616 ** Must be of an existing type, all tags must be recognised and all
2617 ** required tags must be present
2618 **
2619 ** @param [u] hline [BamPHeaderLine] Header line object
2620 ** @return [int] 1 on success, 0 and  a message on failure
2621 **
2622 **
2623 ** @release 6.3.0
2624 ******************************************************************************/
2625 
bamHeaderLineValidate(BamPHeaderLine hline)2626 static int bamHeaderLineValidate(BamPHeaderLine hline)
2627 {
2628     AjPList tags = NULL;
2629     BamPHeaderTag tag = NULL;
2630     int itype, itag;
2631     AjIList iter = NULL;
2632 
2633     /* Is the type correct? */
2634     itype = bamTagExists(hline->type, types);
2635 
2636     if(itype==-1)
2637     {
2638         ajWarn("The type [%c%c] not recognised.\n",
2639                hline->type[0],hline->type[1]);
2640         return 0;
2641     }
2642 
2643     /* Has all required tags? */
2644     itag = 0;
2645 
2646     while(required_tags[itype] && required_tags[itype][itag])
2647     {
2648         if(!bamHeaderLineHasTag(hline,required_tags[itype][itag]))
2649         {
2650             ajWarn("The tag [%c%c] required for [%c%c] not present.\n",
2651                    required_tags[itype][itag][0],
2652                    required_tags[itype][itag][1],
2653                    hline->type[0],hline->type[1]);
2654 
2655             return 0;
2656         }
2657 
2658         itag++;
2659     }
2660 
2661     /* Are all tags recognised? */
2662     tags = hline->tags;
2663     iter = ajListIterNewread(tags);
2664 
2665     while(!ajListIterDone(iter))
2666     {
2667         tag = ajListIterGet(iter);
2668 
2669         if (!bamTagExists(tag->key,required_tags[itype]) &&
2670             !bamTagExists(tag->key,optional_tags[itype]))
2671         {
2672             ajWarn("Unknown tag [%c%c] for [%c%c].\n",
2673                    tag->key[0],tag->key[1], hline->type[0],hline->type[1]);
2674 
2675             ajListIterDel(&iter);
2676 
2677             return 0;
2678         }
2679     }
2680 
2681     ajListIterDel(&iter);
2682 
2683     return 1;
2684 }
2685 
2686 
2687 
2688 
2689 /* @funcstatic bamHeaderLineHasTag ********************************************
2690 **
2691 ** Looks for a tag key and returns the tag and value as a
2692 ** BamPheaderTag object if found
2693 **
2694 ** @param [r] hline [const BamPHeaderLine] Header line
2695 ** @param [r] key [const char*] Key to look for
2696 ** @return [BamPHeaderTag] Header tag if found, NULL if not
2697 **
2698 **
2699 ** @release 6.3.0
2700 ******************************************************************************/
2701 
bamHeaderLineHasTag(const BamPHeaderLine hline,const char * key)2702 static BamPHeaderTag bamHeaderLineHasTag(const BamPHeaderLine hline,
2703                                          const char *key)
2704 {
2705     AjPList tags = hline->tags;
2706     BamPHeaderTag tag = NULL;
2707     BamPHeaderTag ret = NULL;
2708     AjIList iter = NULL;
2709 
2710     iter = ajListIterNewread(tags);
2711 
2712     while(!ajListIterDone(iter))
2713     {
2714         tag = ajListIterGet(iter);
2715 
2716         if(tag->key[0]==key[0] && tag->key[1]==key[1])
2717         {
2718             ret = tag;
2719             break;
2720         }
2721     }
2722 
2723     ajListIterDel(&iter);
2724 
2725     return ret;
2726 }
2727 
2728 
2729 
2730 
2731 /* @funcstatic bamTagExists ***************************************************
2732 **
2733 ** Look for a tag "XY" in a predefined const char *[] array.
2734 **
2735 ** @param [r] tag [const char*] query tag name
2736 ** @param [r] tags [const char**] array of tage names
2737 ** @return [int] tag number on success, -1 on failure
2738 **
2739 **
2740 ** @release 6.3.0
2741 ******************************************************************************/
2742 
bamTagExists(const char * tag,const char ** tags)2743 static int bamTagExists(const char *tag, const char **tags)
2744 {
2745     int itag=0;
2746 
2747     if(!tags)
2748         return -1;
2749 
2750     while(tags[itag])
2751     {
2752         if(tags[itag][0]==tag[0] && tags[itag][1]==tag[1])
2753             return itag;
2754 
2755         itag++;
2756     }
2757 
2758     return -1;
2759 }
2760 
2761 
2762 
2763 
2764 /* @funcstatic bamTagNew ******************************************************
2765 **
2766 **  name points to "XY", value_from points to the first character of
2767 **  the value string and value_to points to the last character of the
2768 **  value string.
2769 **
2770 ** @param [r] name [const char*] tag name
2771 ** @param [r] value_from [const char*] first character of value
2772 ** @param [r] value_to [const char*] last character of value
2773 ** @return [BamPHeaderTag] Header tag object
2774 **
2775 **
2776 ** @release 6.3.0
2777 ******************************************************************************/
2778 
bamTagNew(const char * name,const char * value_from,const char * value_to)2779 static BamPHeaderTag bamTagNew(const char *name, const char *value_from,
2780                                const char *value_to)
2781 {
2782     BamPHeaderTag tag = NULL;
2783     int len;
2784 
2785     AJNEW0(tag);
2786     len = value_to-value_from+1;
2787 
2788     tag->key[0] = name[0];
2789     tag->key[1] = name[1];
2790     tag->value = malloc(len+1);
2791     memcpy(tag->value,value_from,len+1);
2792     tag->value[len] = 0;
2793 
2794     return tag;
2795 }
2796 
2797 
2798 
2799 
2800 #define MBAMSKIPTAG(s) do {                                             \
2801         int type = toupper(*(s));                                       \
2802         ++(s);                                                          \
2803         if (type == 'C' || type == 'A')                                 \
2804             ++(s);                                                      \
2805         else if (type == 'S')                                           \
2806             (s) += 2;                                                   \
2807         else if (type == 'I' || type == 'F')                            \
2808             (s) += 4;                                                   \
2809         else if (type == 'D')                                           \
2810             (s) += 8;                                                   \
2811         else if (type == 'Z' || type == 'H')                            \
2812         {                                                               \
2813             while(*(s))                                                 \
2814                 ++(s);                                                  \
2815             ++(s);                                                      \
2816         }                                                               \
2817     } while(0)
2818 
2819 
2820 
2821 
2822 
2823 /* @func ajSeqBamAuxAppend ****************************************************
2824 **
2825 ** Append auxiliary data tag to a BAM record
2826 **
2827 ** @param [u] b [AjPSeqBam] BAM record
2828 ** @param [r] tag [const char[2]] Tag name
2829 ** @param [r] type [char] Tag type
2830 ** @param [r] len [int] data length
2831 ** @param [r] data [const unsigned char*] Tag type
2832 ** @return [void]
2833 **
2834 **
2835 ** @release 6.5.0
2836 ******************************************************************************/
2837 
ajSeqBamAuxAppend(AjPSeqBam b,const char tag[2],char type,int len,const unsigned char * data)2838 void ajSeqBamAuxAppend(AjPSeqBam b, const char tag[2],
2839 		       char type, int len, const unsigned char* data)
2840 {
2841     int ori_len = b->data_len;
2842 
2843     b->data_len += 3 + len;
2844     b->l_aux += 3 + len;
2845 
2846     if (b->m_data < b->data_len) {
2847 	b->m_data = b->data_len;
2848 	kroundup32(b->m_data);
2849 	b->data = (unsigned char*)realloc(b->data, b->m_data);
2850     }
2851     b->data[ori_len] = tag[0]; b->data[ori_len + 1] = tag[1];
2852     b->data[ori_len + 2] = type;
2853     memcpy(b->data + ori_len + 3, data, len);
2854 }
2855 
2856 
2857 
2858 
2859 /* @funcstatic bamGetAux ******************************************************
2860 **
2861 ** Get auxiliary data tag from a BAM record
2862 **
2863 ** @param [r] b [const AjPSeqBam] BAM record
2864 ** @param [r] tag [const char[2]] Tag name
2865 ** @return [unsigned char*] Tag value
2866 **
2867 **
2868 ** @release 6.3.0
2869 ******************************************************************************/
2870 
bamGetAux(const AjPSeqBam b,const char tag[2])2871 static unsigned char* bamGetAux(const AjPSeqBam b, const char tag[2])
2872 {
2873     unsigned char *s;
2874     int y;
2875     int x;
2876 
2877     y = tag[0]<<8 | tag[1];
2878 
2879     s = MAJSEQBAMAUX(b);
2880 
2881     while(s < b->data + b->data_len)
2882     {
2883         x = (int)s[0]<<8 | s[1];
2884         s += 2;
2885 
2886         if(x == y)
2887             return s;
2888 
2889         MBAMSKIPTAG(s);
2890     }
2891 
2892     return 0;
2893 }
2894 
2895 
2896 
2897 
2898 /* @func ajSeqBamBgzfSeek *****************************************************
2899 **
2900 ** Seek function for a BAM file
2901 **
2902 ** @param [u] fp [AjPSeqBamBgzf] BGZ file object
2903 ** @param [r] pos [ajlong] Buffer
2904 ** @param [r] where [int] length of buffer
2905 ** @return [ajlong] Number of bytes read,
2906 **                  -1 on error
2907 **
2908 ** @release 6.5.0
2909 ******************************************************************************/
2910 
ajSeqBamBgzfSeek(AjPSeqBamBgzf fp,ajlong pos,int where)2911 ajlong ajSeqBamBgzfSeek(AjPSeqBamBgzf fp, ajlong pos, int where)
2912 {
2913     int block_offset;
2914     ajlong block_address;
2915 
2916     if (fp->open_mode != 'r')
2917     {
2918 	ajErr("file not open for read");
2919 	return -1;
2920     }
2921 
2922     if (where != SEEK_SET)
2923     {
2924 	ajErr("unimplemented seek option");
2925 	return -1;
2926     }
2927 
2928     block_offset = pos & 0xFFFF;
2929     block_address = (pos >> 16) & 0xFFFFFFFFFFFFLL;
2930 
2931     if (fseek(fp->file, block_address, SEEK_SET) != 0)
2932     {
2933 	ajErr("seek failed");
2934 	return -1;
2935     }
2936 
2937     fp->block_length = 0;  /* indicates current block is not loaded */
2938     fp->block_address = block_address;
2939     fp->block_offset = block_offset;
2940 
2941     return 0;
2942 }
2943 
2944 
2945 
2946 
2947 /* @func ajSeqBamValidate *****************************************************
2948 **
2949 ** Simple validation of a BAM record
2950 **
2951 ** Other fields could also be checked, especially the auxiliary data
2952 **
2953 ** @param [r] header [const AjPSeqBamHeader] header object
2954 ** @param [r] b [const AjPSeqBam] BAM alignment record
2955 ** @return [int] 1 on success, 0 otherwise
2956 **
2957 ** @release 6.5.0
2958 ******************************************************************************/
2959 
ajSeqBamValidate(const AjPSeqBamHeader header,const AjPSeqBam b)2960 int ajSeqBamValidate(const AjPSeqBamHeader header, const AjPSeqBam b)
2961 {
2962     char *s;
2963 
2964     if (b->core.tid < -1 || b->core.mtid < -1)
2965 	return 0;
2966 
2967     if (header && (b->core.tid >= header->n_targets
2968 	    || b->core.mtid >= header->n_targets))
2969 	return 0;
2970 
2971     if (b->data_len < b->core.l_qname)
2972 	return 0;
2973 
2974     s = memchr(MAJSEQBAMQNAME(b), '\0', b->core.l_qname);
2975 
2976     if (s != &MAJSEQBAMQNAME(b)[b->core.l_qname-1])
2977 	return 0;
2978 
2979     return 1;
2980 }
2981 
2982 
2983 
2984 
2985 /* @func ajSeqBamHeaderGetRefseqTags ******************************************
2986 **
2987 ** Return reference sequence tags, in a table which includes sub-tables for
2988 ** each individual reference sequence.
2989 ** Each sub-table including tag value pairs for a reference sequence.
2990 **
2991 ** @param [r] header [const AjPSeqBamHeader] BAM header
2992 ** @return [AjPTable] Table of tables for reference sequence tags
2993 **
2994 **
2995 ** @release 6.5.0
2996 ******************************************************************************/
2997 
ajSeqBamHeaderGetRefseqTags(const AjPSeqBamHeader header)2998 AjPTable ajSeqBamHeaderGetRefseqTags(const AjPSeqBamHeader header)
2999 {
3000     AjPTable ret = NULL;
3001     BamPHeaderLine hline = NULL;
3002     AjIList k = NULL;
3003     BamPHeaderTag uri = NULL;
3004     BamPHeaderTag md5 = NULL;
3005     BamPHeaderTag species = NULL;
3006     BamPHeaderTag seqname = NULL;
3007     BamPHeaderTag assemid = NULL;
3008     AjPTable tags = NULL;
3009     const AjPList dict = NULL;
3010 
3011     dict = header->dict;
3012 
3013     if(dict == NULL)
3014 	return ret;
3015 
3016     ret = ajTablecharNewConst(16);
3017 
3018     k = ajListIterNewread(dict);
3019 
3020     while(!ajListIterDone(k))
3021     {
3022 	hline = ajListIterGet(k);
3023 
3024 	if(hline->type[0] != 'S' || hline->type[1] != 'Q')
3025 	    continue;
3026 
3027 	seqname = bamHeaderLineHasTag(hline,"SN");
3028 
3029 	if(!seqname)
3030 	{
3031 	    ajErr("ajSeqBamHeaderGetRefseqTags: missing SN tag"
3032 		  " in SQ header line");
3033 	    return NULL;
3034 	}
3035 
3036 	tags = ajTablecharNew(8);
3037 
3038 	assemid = bamHeaderLineHasTag(hline,"AS");
3039 	if(assemid)
3040 	    ajTablePut(tags, ajCharNewC(assemid->key), assemid->value);
3041 
3042 	uri   = bamHeaderLineHasTag(hline,"UR");
3043 	if(uri)
3044 	    ajTablePut(tags, ajCharNewC(uri->key), uri->value);
3045 
3046 	md5 = bamHeaderLineHasTag(hline,"M5");
3047 	if(md5)
3048 	    ajTablePut(tags, ajCharNewC(md5->key), md5->value);
3049 
3050 	species = bamHeaderLineHasTag(hline,"SP");
3051 	if(species)
3052 	    ajTablePut(tags, ajCharNewC(species->key), species->value);
3053 
3054 	ajTablePut(ret, seqname->value, tags);
3055 
3056     }
3057 
3058     ajListIterDel(&k);
3059 
3060     return ret;
3061 }
3062 
3063 
3064 
3065 
3066 /* @func ajSeqBamHeaderGetReadgroupTags****************************************
3067 **
3068 ** Return read group tags, in a table which includes sub-tables for
3069 ** each individual read group.
3070 ** Each sub-table including tag value pairs for a read group.
3071 **
3072 ** @param [r] header [const AjPSeqBamHeader] BAM header
3073 ** @return [AjPTable] Table of tables for read-group tags
3074 **
3075 **
3076 ** @release 6.5.0
3077 ******************************************************************************/
3078 
ajSeqBamHeaderGetReadgroupTags(const AjPSeqBamHeader header)3079 AjPTable ajSeqBamHeaderGetReadgroupTags(const AjPSeqBamHeader header)
3080 {
3081     AjPTable ret = NULL;
3082     BamPHeaderLine hline = NULL;
3083     AjIList k = NULL;
3084     BamPHeaderTag tag = NULL;
3085     AjPTable tags = NULL;
3086     AjIList iter = NULL;
3087     const AjPList dict = NULL;
3088     char* id = NULL;
3089 
3090     dict = header->dict;
3091 
3092     if(dict == NULL)
3093 	return ret;
3094 
3095     ret = ajTablecharNewConst(16);
3096 
3097     k = ajListIterNewread(dict);
3098 
3099     while(!ajListIterDone(k))
3100     {
3101 	hline = ajListIterGet(k);
3102 
3103 	if(hline->type[0] != 'R' || hline->type[1] != 'G') /* RG header line */
3104 	    continue;
3105 
3106 	tags = ajTablecharNew(8);
3107 
3108 	iter = ajListIterNewread(hline->tags);
3109 
3110 	while(!ajListIterDone(iter))
3111 	{
3112 	    tag = ajListIterGet(iter);
3113 
3114 	    if(tag->key[0]=='I' && tag->key[1]=='D') /* ID tag for RG line */
3115 	    {
3116 		id = tag->value;
3117 		continue;
3118 	    }
3119 	    ajTablePut(tags, ajCharNewResLenC(tag->key, 3, 2), tag->value);
3120 	}
3121 
3122 	ajListIterDel(&iter);
3123 
3124 	if(!id)
3125 	{
3126 	    ajErr("ajSeqBamHeaderGetReadgroupTags: missing ID tag"
3127 		    " in RG header line");
3128 	    return NULL;
3129 	}
3130 
3131 	ajTablePut(ret, id, tags);
3132 
3133     }
3134 
3135     ajListIterDel(&k);
3136 
3137     return ret;
3138 }
3139 
3140 
3141 
3142 
3143 /* @func ajSeqBamHeaderGetSortorder *******************************************
3144 **
3145 ** Return sort-order defined in header
3146 **
3147 ** @param [r] header [const AjPSeqBamHeader] BAM header
3148 ** @return [const char*] sort order for the BAM file
3149 **
3150 **
3151 ** @release 6.5.0
3152 ******************************************************************************/
3153 
ajSeqBamHeaderGetSortorder(const AjPSeqBamHeader header)3154 const char* ajSeqBamHeaderGetSortorder(const AjPSeqBamHeader header)
3155 {
3156     const char* ret = NULL;
3157     BamPHeaderLine hline = NULL;
3158     AjIList k = NULL;
3159     BamPHeaderTag so = NULL;
3160 
3161     if(!header || !header->dict)
3162 	return NULL;
3163 
3164     k = ajListIterNewread(header->dict);
3165 
3166     while(!ajListIterDone(k))
3167     {
3168 	hline = ajListIterGet(k);
3169 
3170 	if(hline->type[0] != 'H' || hline->type[1] != 'D')
3171 	    continue;
3172 
3173 	so = bamHeaderLineHasTag(hline,"SO");
3174 
3175 	if(so)
3176 	    ret = so->value;
3177 
3178 	break;
3179 
3180     }
3181 
3182     ajListIterDel(&k);
3183 
3184     return ret;
3185 }
3186