1 /*
2 * Copyright (c) 2013 Genome Research Ltd.
3 * Author(s): James Bonfield, Rob Davies
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above
12 * copyright notice, this list of conditions and the following
13 * disclaimer in the documentation and/or other materials provided
14 * with the distribution.
15 *
16 * 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
17 * Institute nor the names of its contributors may be used to endorse
18 * or promote products derived from this software without specific
19 * prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS
22 * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
24 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH
25 * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 */
33
34 /*
35 * Author: James Bonfield, Wellcome Trust Sanger Institute. 2013
36 */
37
38 #ifdef HAVE_CONFIG_H
39 #include "io_lib_config.h"
40 #endif
41
42 #include <string.h>
43 #include <assert.h>
44
45 #include "io_lib/sam_header.h"
46 #include "io_lib/string_alloc.h"
47
48 #ifdef SAMTOOLS
49 #define sam_hdr_parse sam_hdr_parse_
50 #endif
51
sam_hdr_error(char * msg,char * line,int len,int lno)52 static void sam_hdr_error(char *msg, char *line, int len, int lno) {
53 int j;
54
55 for (j = 0; j < len && line[j] != '\n'; j++)
56 ;
57 fprintf(stderr, "%s at line %d: \"%.*s\"\n", msg, lno, j, line);
58 }
59
sam_hdr_dump(SAM_hdr * hdr)60 void sam_hdr_dump(SAM_hdr *hdr) {
61 HashIter *iter = HashTableIterCreate();
62 HashItem *hi;
63 int i;
64
65 printf("===DUMP===\n");
66 while ((hi = HashTableIterNext(hdr->h, iter))) {
67 SAM_hdr_type *t1, *t2;
68
69 t1 = t2 = hi->data.p;
70 printf("Type %.2s, count %d\n", hi->key, t1->prev->order+1);
71
72 do {
73 SAM_hdr_tag *tag;
74 printf(">>>%d ", t1->order);
75 for (tag = t1->tag; tag; tag=tag->next) {
76 printf("\"%.2s\":\"%.*s\"\t",
77 tag->str, tag->len-3, tag->str+3);
78 }
79 putchar('\n');
80 t1 = t1->next;
81 } while (t1 != t2);
82 }
83
84 /* Dump out PG chains */
85 printf("\n@PG chains:\n");
86 for (i = 0; i < hdr->npg_end; i++) {
87 int j;
88 printf(" %d:", i);
89 for (j = hdr->pg_end[i]; j != -1; j = hdr->pg[j].prev_id) {
90 printf("%s%d(%.*s)",
91 j == hdr->pg_end[i] ? " " : "->",
92 j, hdr->pg[j].name_len, hdr->pg[j].name);
93 }
94 printf("\n");
95 }
96
97 puts("===END DUMP===");
98
99 HashTableIterDestroy(iter);
100 }
101
102 /* Updates the HashTables in the SAM_hdr structure.
103 *
104 * Returns 0 on success;
105 * -1 on failure
106 */
sam_hdr_update_hashes(SAM_hdr * sh,const char * type,SAM_hdr_type * h_type)107 static int sam_hdr_update_hashes(SAM_hdr *sh,
108 const char *type,
109 SAM_hdr_type *h_type) {
110 /* Add to reference hash? */
111 if (type[0] == 'S' && type[1] == 'Q') {
112 SAM_hdr_tag *tag;
113 int nref = sh->nref;
114
115 sh->ref = realloc(sh->ref, (sh->nref+1)*sizeof(*sh->ref));
116 if (!sh->ref)
117 return -1;
118
119 tag = h_type->tag;
120 sh->ref[nref].name = NULL;
121 sh->ref[nref].len = 0;
122 sh->ref[nref].ty = h_type;
123 sh->ref[nref].tag = tag;
124
125 while (tag) {
126 if (tag->str[0] == 'S' && tag->str[1] == 'N') {
127 if (!(sh->ref[nref].name = malloc(tag->len)))
128 return -1;
129 strncpy(sh->ref[nref].name, tag->str+3, tag->len-3);
130 sh->ref[nref].name[tag->len-3] = 0;
131 } else if (tag->str[0] == 'L' && tag->str[1] == 'N') {
132 sh->ref[nref].len = atoi(tag->str+3);
133 }
134 tag = tag->next;
135 }
136
137 if (sh->ref[nref].name) {
138 HashData hd;
139 hd.i = nref;
140 if (!HashTableAdd(sh->ref_hash, sh->ref[nref].name, 0, hd, NULL))
141 return -1;
142 }
143
144 sh->nref++;
145 }
146
147 /* Add to read-group hash? */
148 if (type[0] == 'R' && type[1] == 'G') {
149 SAM_hdr_tag *tag;
150 int nrg = sh->nrg;
151
152 sh->rg = realloc(sh->rg, (sh->nrg+1)*sizeof(*sh->rg));
153 if (!sh->rg)
154 return -1;
155
156 tag = h_type->tag;
157 sh->rg[nrg].name = NULL;
158 sh->rg[nrg].name_len = 0;
159 sh->rg[nrg].ty = h_type;
160 sh->rg[nrg].tag = tag;
161 sh->rg[nrg].id = nrg;
162
163 while (tag) {
164 if (tag->str[0] == 'I' && tag->str[1] == 'D') {
165 if (!(sh->rg[nrg].name = malloc(tag->len)))
166 return -1;
167 strncpy(sh->rg[nrg].name, tag->str+3, tag->len-3);
168 sh->rg[nrg].name[tag->len-3] = 0;
169 sh->rg[nrg].name_len = strlen(sh->rg[nrg].name);
170 }
171 tag = tag->next;
172 }
173
174 if (sh->rg[nrg].name) {
175 HashData hd;
176 hd.i = nrg;
177 if (!HashTableAdd(sh->rg_hash, sh->rg[nrg].name, 0, hd, NULL))
178 return -1;
179 }
180
181 sh->nrg++;
182 }
183
184 /* Add to program hash? */
185 if (type[0] == 'P' && type[1] == 'G') {
186 SAM_hdr_tag *tag;
187 int npg = sh->npg;
188
189 sh->pg = realloc(sh->pg, (sh->npg+1)*sizeof(*sh->pg));
190 if (!sh->pg)
191 return -1;
192
193 tag = h_type->tag;
194 sh->pg[npg].name = NULL;
195 sh->pg[npg].name_len = 0;
196 sh->pg[npg].ty = h_type;
197 sh->pg[npg].tag = tag;
198 sh->pg[npg].id = npg;
199 sh->pg[npg].prev_id = -1;
200
201 while (tag) {
202 if (tag->str[0] == 'I' && tag->str[1] == 'D') {
203 if (!(sh->pg[npg].name = malloc(tag->len)))
204 return -1;
205 strncpy(sh->pg[npg].name, tag->str+3, tag->len-3);
206 sh->pg[npg].name[tag->len-3] = 0;
207 sh->pg[npg].name_len = strlen(sh->pg[npg].name);
208 } else if (tag->str[0] == 'P' && tag->str[1] == 'P') {
209 // Resolve later if needed
210 HashItem *hi = HashTableSearch(sh->pg_hash,
211 tag->str+3,
212 tag->len-3);
213 if (hi) {
214 sh->pg[npg].prev_id = sh->pg[hi->data.i].id;
215
216 /* Unmark previous entry as a PG termination */
217 if (sh->npg_end > 0 &&
218 sh->pg_end[sh->npg_end-1] == hi->data.i) {
219 sh->npg_end--;
220 } else {
221 int i;
222 for (i = 0; i < sh->npg_end; i++) {
223 if (sh->pg_end[i] == hi->data.i) {
224 memmove(&sh->pg_end[i], &sh->pg_end[i+1],
225 (sh->npg_end-i-1)*sizeof(*sh->pg_end));
226 sh->npg_end--;
227 }
228 }
229 }
230 } else {
231 sh->pg[npg].prev_id = -1;
232 }
233 }
234 tag = tag->next;
235 }
236
237 if (sh->pg[npg].name) {
238 HashData hd;
239 hd.i = npg;
240 if (!HashTableAdd(sh->pg_hash, sh->pg[npg].name, 0, hd, NULL))
241 return -1;
242 }
243
244 /* Add to npg_end[] array. Remove later if we find a PP line */
245 if (sh->npg_end >= sh->npg_end_alloc) {
246 sh->npg_end_alloc = sh->npg_end_alloc
247 ? sh->npg_end_alloc*2
248 : 4;
249 sh->pg_end = realloc(sh->pg_end,
250 sh->npg_end_alloc * sizeof(int));
251 if (!sh->pg_end)
252 return -1;
253 }
254 sh->pg_end[sh->npg_end++] = npg;
255
256 sh->npg++;
257 }
258
259 return 0;
260 }
261
262 /*
263 * Appends a formatted line to an existing SAM header.
264 * Line is a full SAM header record, eg "@SQ\tSN:foo\tLN:100", with
265 * optional new-line. If it contains more than 1 line then multiple lines
266 * will be added in order.
267 *
268 * Input text is of maximum length len or as terminated earlier by a NUL.
269 * Len may be 0 if unknown, in which case lines must be NUL-terminated.
270 *
271 * Returns 0 on success
272 * -1 on failure
273 */
sam_hdr_add_lines(SAM_hdr * sh,const char * lines,int len)274 int sam_hdr_add_lines(SAM_hdr *sh, const char *lines, int len) {
275 int i, lno, text_offset;
276 HashItem *hi;
277 HashData hd;
278 char *hdr;
279
280 if (!len)
281 len = strlen(lines);
282
283 text_offset = DSTRING_LEN(sh->text);
284 dstring_nappend(sh->text, lines, len);
285 hdr = DSTRING_STR(sh->text) + text_offset;
286
287 for (i = 0, lno = 1; i < len && hdr[i] != '\0'; i++, lno++) {
288 char *type;
289 int l_start = i, new;
290 SAM_hdr_type *h_type;
291 SAM_hdr_tag *h_tag, *last;
292
293 if (hdr[i] != '@') {
294 int j;
295 for (j = i; j < len && hdr[j] != '\0' && hdr[j] != '\n'; j++)
296 ;
297 sam_hdr_error("Header line does not start with '@'",
298 &hdr[l_start], len - l_start, lno);
299 return -1;
300 }
301
302 type = &hdr[i+1];
303 if (len - i < 3 ||
304 type[0] < 'A' || type[0] > 'z' ||
305 type[1] < 'A' || type[1] > 'z') {
306 sam_hdr_error("Header line does not have a two character key",
307 &hdr[l_start], len - l_start, lno);
308 return -1;
309 }
310
311 i += 3;
312 if (i >= len || hdr[i] == '\n')
313 continue;
314
315 // Add the header line type
316 if (!(h_type = pool_alloc(sh->type_pool)))
317 return -1;
318 hd.p = h_type;
319 if (!(hi = HashTableAdd(sh->h, type, 2, hd, &new)))
320 return -1;
321
322 // Form the ring, either with self or other lines of this type
323 if (!new) {
324 SAM_hdr_type *t = hi->data.p, *p;
325 p = t->prev;
326
327 assert(p->next == t);
328 p->next = h_type;
329 h_type->prev = p;
330
331 t->prev = h_type;
332 h_type->next = t;
333 h_type->order = p->order+1;
334 } else {
335 h_type->prev = h_type->next = h_type;
336 h_type->order = 0;
337 if (!(type[0] == 'H' && type[1] == 'D')) {
338 // Whenever new type is added to hashtable record in
339 // type_order unless HD, which should always be first.
340 dstring_nappend(sh->type_order, type, 2);
341 sh->ntypes++;
342 }
343 }
344
345 // Parse the tags on this line
346 last = NULL;
347 if (type[0] == 'C' && type[1] == 'O') {
348 int j;
349 if (hdr[i] != '\t') {
350 sam_hdr_error("Missing tab",
351 &hdr[l_start], len - l_start, lno);
352 return -1;
353 }
354
355 for (j = ++i; j < len && hdr[j] != '\0' && hdr[j] != '\n'; j++)
356 ;
357
358 if (!(h_type->tag = h_tag = pool_alloc(sh->tag_pool)))
359 return -1;
360 h_tag->str = string_ndup(sh->str_pool, &hdr[i], j-i);
361 h_tag->len = j-i;
362 h_tag->next = NULL;
363 if (!h_tag->str)
364 return -1;
365
366 i = j;
367
368 } else {
369 do {
370 int j;
371 if (hdr[i] != '\t') {
372 sam_hdr_error("Missing tab",
373 &hdr[l_start], len - l_start, lno);
374 return -1;
375 }
376
377 for (j = ++i;
378 j < len && hdr[j] != '\0' &&
379 hdr[j] != '\n' && hdr[j] != '\t';
380 j++)
381 ;
382
383 if (!(h_tag = pool_alloc(sh->tag_pool)))
384 return -1;
385 h_tag->str = string_ndup(sh->str_pool, &hdr[i], j-i);
386 h_tag->len = j-i;
387 h_tag->next = NULL;
388 if (!h_tag->str)
389 return -1;
390
391 if (h_tag->len < 3 || h_tag->str[2] != ':') {
392 sam_hdr_error("Malformed key:value pair",
393 &hdr[l_start], len - l_start, lno);
394 return -1;
395 }
396
397 if (last)
398 last->next = h_tag;
399 else
400 h_type->tag = h_tag;
401
402 last = h_tag;
403 i = j;
404 } while (i < len && hdr[i] != '\0' && hdr[i] != '\n');
405 }
406
407 /* Update RG/SQ hashes */
408 if (-1 == sam_hdr_update_hashes(sh, type, h_type))
409 return -1;
410 }
411
412 return 0;
413 }
414
415 /*
416 * Adds a single line to a SAM header.
417 * Specify type and one or more key,value pairs, ending with the NULL key.
418 * Eg. sam_hdr_add(h, "SQ", "ID", "foo", "LN", "100", NULL).
419 *
420 * Returns index for specific entry on success (eg 2nd SQ, 4th RG)
421 * -1 on failure
422 */
sam_hdr_add(SAM_hdr * sh,const char * type,...)423 int sam_hdr_add(SAM_hdr *sh, const char *type, ...) {
424 va_list args;
425 va_start(args, type);
426 return sam_hdr_vadd(sh, type, args, NULL);
427 }
428
sam_hdr_vadd(SAM_hdr * sh,const char * type,va_list ap,...)429 int sam_hdr_vadd(SAM_hdr *sh, const char *type, va_list ap, ...) {
430 va_list args;
431 HashItem *hi;
432 HashData hd;
433 SAM_hdr_type *h_type;
434 SAM_hdr_tag *h_tag, *last;
435 int new;
436
437 #if defined(HAVE_VA_COPY)
438 va_list ap_local;
439 #endif
440
441 if (-1 == dstring_append_char(sh->text, '@'))
442 return -1;
443 if (-1 == dstring_nappend(sh->text, type, 2))
444 return -1;
445
446 if (!(h_type = pool_alloc(sh->type_pool)))
447 return -1;
448 hd.p = h_type;
449 if (!(hi = HashTableAdd(sh->h, (char *)type, 2, hd, &new)))
450 return -1;
451
452 // Form the ring, either with self or other lines of this type
453 if (!new) {
454 SAM_hdr_type *t = hi->data.p, *p;
455 p = t->prev;
456
457 assert(p->next = t);
458 p->next = h_type;
459 h_type->prev = p;
460
461 t->prev = h_type;
462 h_type->next = t;
463 h_type->order = p->order + 1;
464 } else {
465 if (!(type[0] == 'H' && type[1] == 'D')) {
466 // Whenever new type is added to hashtable record in
467 // type_order unless HD, which should always be first.
468 dstring_nappend(sh->type_order, type, 2);
469 sh->ntypes++;
470 }
471 h_type->prev = h_type->next = h_type;
472 h_type->order = 0;
473 }
474
475 last = NULL;
476
477 // Any ... varargs
478 va_start(args, ap);
479 for (;;) {
480 char *k, *v;
481 int idx;
482
483 if (!(k = (char *)va_arg(args, char *)))
484 break;
485 v = va_arg(args, char *);
486
487 if (-1 == dstring_append_char(sh->text, '\t'))
488 return -1;
489
490 if (!(h_tag = pool_alloc(sh->tag_pool)))
491 return -1;
492 idx = DSTRING_LEN(sh->text);
493
494 if (-1 == dstring_append(sh->text, k))
495 return -1;
496 if (-1 == dstring_append_char(sh->text, ':'))
497 return -1;
498 if (-1 == dstring_append(sh->text, v))
499 return -1;
500
501 h_tag->len = DSTRING_LEN(sh->text) - idx;
502 h_tag->str = string_ndup(sh->str_pool,
503 DSTRING_STR(sh->text) + idx,
504 h_tag->len);
505 h_tag->next = NULL;
506 if (!h_tag->str)
507 return -1;
508
509 if (last)
510 last->next = h_tag;
511 else
512 h_type->tag = h_tag;
513
514 last = h_tag;
515 }
516 va_end(args);
517
518 #if defined(HAVE_VA_COPY)
519 va_copy(ap_local, ap);
520 # define ap ap_local
521 #endif
522
523 // Plus the specified va_list params
524 for (;;) {
525 char *k, *v;
526 int idx;
527
528 if (!(k = (char *)va_arg(ap, char *)))
529 break;
530 v = va_arg(ap, char *);
531
532 if (-1 == dstring_append_char(sh->text, '\t'))
533 return -1;
534
535 if (!(h_tag = pool_alloc(sh->tag_pool)))
536 return -1;
537 idx = DSTRING_LEN(sh->text);
538
539 if (-1 == dstring_append(sh->text, k))
540 return -1;
541 if (-1 == dstring_append_char(sh->text, ':'))
542 return -1;
543 if (-1 == dstring_append(sh->text, v))
544 return -1;
545
546 h_tag->len = DSTRING_LEN(sh->text) - idx;
547 h_tag->str = string_ndup(sh->str_pool,
548 DSTRING_STR(sh->text) + idx,
549 h_tag->len);
550 h_tag->next = NULL;
551 if (!h_tag->str)
552 return -1;
553
554 if (last)
555 last->next = h_tag;
556 else
557 h_type->tag = h_tag;
558
559 last = h_tag;
560 }
561 va_end(ap);
562
563 if (-1 == dstring_append_char(sh->text, '\n'))
564 return -1;
565
566 if (-1 == sam_hdr_update_hashes(sh, type, h_type))
567 return -1;
568
569 return h_type->order;
570 }
571
572 /*
573 * Returns the first header item matching 'type'. If ID is non-NULL it checks
574 * for the tag ID: and compares against the specified ID.
575 *
576 * Returns NULL if no type/ID is found
577 */
sam_hdr_find(SAM_hdr * hdr,char * type,char * ID_key,char * ID_value)578 SAM_hdr_type *sam_hdr_find(SAM_hdr *hdr, char *type,
579 char *ID_key, char *ID_value) {
580 HashItem *hi;
581 SAM_hdr_type *t1, *t2;
582
583 /* Special case for types we have prebuilt hashes on */
584 if (ID_key) {
585 if (type[0] == 'S' && type[1] == 'Q' &&
586 ID_key[0] == 'S' && ID_key[1] == 'N') {
587 hi = HashTableSearch(hdr->ref_hash, ID_value, strlen(ID_value));
588 return hi ? hdr->ref[hi->data.i].ty : NULL;
589 }
590
591 if (type[0] == 'R' && type[1] == 'G' &&
592 ID_key[0] == 'I' && ID_key[1] == 'D') {
593 hi = HashTableSearch(hdr->rg_hash, ID_value, strlen(ID_value));
594 return hi ? hdr->rg[hi->data.i].ty : NULL;
595 }
596
597 if (type[0] == 'P' && type[1] == 'G' &&
598 ID_key[0] == 'I' && ID_key[1] == 'D') {
599 hi = HashTableSearch(hdr->pg_hash, ID_value, strlen(ID_value));
600 return hi ? hdr->pg[hi->data.i].ty : NULL;
601 }
602 }
603
604 if (!(hi = HashTableSearch(hdr->h, type, 2)))
605 return NULL;
606
607 if (!ID_key)
608 return hi->data.p;
609
610 t1 = t2 = hi->data.p;
611 do {
612 SAM_hdr_tag *tag;
613 for (tag = t1->tag; tag; tag = tag->next) {
614 if (tag->str[0] == ID_key[0] && tag->str[1] == ID_key[1]) {
615 char *cp1 = tag->str+3;
616 char *cp2 = ID_value;
617 while (*cp1 && *cp1 == *cp2)
618 cp1++, cp2++;
619 if (*cp2 || *cp1)
620 continue;
621 return t1;
622 }
623 }
624 t1 = t1->next;
625 } while (t1 != t2);
626
627 return NULL;
628 }
629
630 /*
631 * As per SAM_hdr_type, but returns a complete line of formatted text
632 * for a specific head type/ID combination. If ID is NULL then it returns
633 * the first line of the specified type.
634 *
635 * The returned string is malloced and should be freed by the calling
636 * function with free().
637 *
638 * Returns NULL if no type/ID is found.
639 */
sam_hdr_find_line(SAM_hdr * hdr,char * type,char * ID_key,char * ID_value)640 char *sam_hdr_find_line(SAM_hdr *hdr, char *type,
641 char *ID_key, char *ID_value) {
642 SAM_hdr_type *ty = sam_hdr_find(hdr, type, ID_key, ID_value);
643 dstring_t *ds;
644 SAM_hdr_tag *tag;
645 char *str = dstring_str(hdr->text);
646 int r = 0;
647
648 if (!ty)
649 return NULL;
650
651 if (NULL == (ds = dstring_create(NULL)))
652 return NULL;
653
654 // Paste together the line from the hashed copy
655 r |= dstring_append_char(ds, '@');
656 r |= dstring_append(ds, type);
657 for (tag = ty->tag; tag; tag = tag->next) {
658 r |= dstring_append_char(ds, '\t');
659 r |= dstring_nappend(ds, tag->str, tag->len);
660 }
661
662 if (r) {
663 dstring_destroy(ds);
664 return NULL;
665 }
666
667 // Steal the dstring copy and return that.
668 str = DSTRING_STR(ds);
669 DSTRING_STR(ds) = NULL;
670 dstring_destroy(ds);
671
672 return str;
673 }
674
675
676 /*
677 * Looks for a specific key in a single sam header line.
678 * If prev is non-NULL it also fills this out with the previous tag, to
679 * permit use in key removal. *prev is set to NULL when the tag is the first
680 * key in the list. When a tag isn't found, prev (if non NULL) will be the last
681 * tag in the existing list.
682 *
683 * Returns the tag pointer on success
684 * NULL on failure
685 */
sam_hdr_find_key(SAM_hdr * sh,SAM_hdr_type * type,char * key,SAM_hdr_tag ** prev)686 SAM_hdr_tag *sam_hdr_find_key(SAM_hdr *sh,
687 SAM_hdr_type *type,
688 char *key,
689 SAM_hdr_tag **prev) {
690 SAM_hdr_tag *tag, *p = NULL;
691
692 for (tag = type->tag; tag; p = tag, tag = tag->next) {
693 if (tag->str[0] == key[0] && tag->str[1] == key[1]) {
694 if (prev)
695 *prev = p;
696 return tag;
697 }
698 }
699
700 if (prev)
701 *prev = p;
702
703 return NULL;
704 }
705
706
707 /*
708 * Adds or updates tag key,value pairs in a header line.
709 * Eg for adding M5 tags to @SQ lines or updating sort order for the
710 * @HD line (although use the sam_hdr_sort_order() function for
711 * HD manipulation, which is a wrapper around this funuction).
712 *
713 * Specify multiple key,value pairs ending in NULL.
714 *
715 * Returns 0 on success
716 * -1 on failure
717 */
sam_hdr_update(SAM_hdr * hdr,SAM_hdr_type * type,...)718 int sam_hdr_update(SAM_hdr *hdr, SAM_hdr_type *type, ...) {
719 va_list ap;
720
721 va_start(ap, type);
722
723 for (;;) {
724 char *k, *v;
725 int idx;
726 SAM_hdr_tag *tag, *prev;
727
728 if (!(k = (char *)va_arg(ap, char *)))
729 break;
730 v = va_arg(ap, char *);
731
732 tag = sam_hdr_find_key(hdr, type, k, &prev);
733 if (!tag) {
734 if (!(tag = pool_alloc(hdr->tag_pool)))
735 return -1;
736 if (prev)
737 prev->next = tag;
738 else
739 type->tag = tag;
740
741 tag->next = NULL;
742 }
743
744 idx = DSTRING_LEN(hdr->text);
745 if (0 != dstring_appendf(hdr->text, "%2.2s:%s", k, v))
746 return -1;
747 tag->len = DSTRING_LEN(hdr->text) - idx;
748 tag->str = string_ndup(hdr->str_pool,
749 DSTRING_STR(hdr->text) + idx,
750 tag->len);
751 if (!tag->str)
752 return -1;
753 }
754
755 va_end(ap);
756
757 return 0;
758 }
759
760
761 /*
762 * Returns the sort order:
763 */
sam_hdr_sort_order(SAM_hdr * hdr)764 enum sam_sort_order sam_hdr_sort_order(SAM_hdr *hdr) {
765 return hdr->sort_order;
766 }
767
sam_hdr_parse_sort_order(SAM_hdr * hdr)768 static enum sam_sort_order sam_hdr_parse_sort_order(SAM_hdr *hdr) {
769 HashItem *hi;
770 enum sam_sort_order so;
771
772 so = ORDER_UNKNOWN;
773 if ((hi = HashTableSearch(hdr->h, "HD", 2))) {
774 SAM_hdr_type *ty = hi->data.p;
775 SAM_hdr_tag *tag;
776 for (tag = ty->tag; tag; tag = tag->next) {
777 if (tag->str[0] == 'S' && tag->str[1] == 'O') {
778 if (strcmp(tag->str+3, "unsorted") == 0)
779 so = ORDER_UNSORTED;
780 else if (strcmp(tag->str+3, "queryname") == 0)
781 so = ORDER_NAME;
782 else if (strcmp(tag->str+3, "coordinate") == 0)
783 so = ORDER_COORD;
784 else if (strcmp(tag->str+3, "unknown") == 0)
785 so = ORDER_UNKNOWN;
786 else
787 fprintf(stderr, "Unknown sort order field: %s\n",
788 tag->str+3);
789 }
790 }
791 }
792
793 return so;
794 }
795
796
797 /*
798 * Reconstructs the dstring from the header hash table.
799 * Returns 0 on success
800 * -1 on failure
801 */
sam_hdr_rebuild(SAM_hdr * hdr)802 int sam_hdr_rebuild(SAM_hdr *hdr) {
803 /* Order: HD then others */
804 HashItem *hi;
805 HashIter *iter = HashTableIterCreate();
806 dstring_t *ds = dstring_create(NULL);
807
808 char *type_order = DSTRING_STR(hdr->type_order);
809 int i, ntypes = hdr->ntypes;
810
811 if (!iter || !ds)
812 return -1;
813
814 if ((hi = HashTableSearch(hdr->h, "HD", 2))) {
815 SAM_hdr_type *ty = hi->data.p;
816 SAM_hdr_tag *tag;
817 if (-1 == dstring_append(ds, "@HD"))
818 return -1;
819 for (tag = ty->tag; tag; tag = tag->next) {
820 if (-1 == dstring_append_char(ds, '\t'))
821 return -1;
822 if (-1 == dstring_nappend(ds, tag->str, tag->len))
823 return -1;
824 }
825 if (-1 == dstring_append_char(ds, '\n'))
826 return -1;
827 }
828
829 for (i = 0; i < ntypes; i++) {
830 // output types according to type_order
831 if ((hi = HashTableSearch(hdr->h, type_order+i*2, 2))) {
832 SAM_hdr_type *t1, *t2;
833 if (hi->key[0] == 'H' && hi->key[1] == 'D')
834 continue;
835 t1 = t2 = hi->data.p;
836 do {
837 SAM_hdr_tag *tag;
838 if (-1 == dstring_append_char(ds, '@'))
839 return -1;
840 if (-1 == dstring_nappend(ds, hi->key, 2))
841 return -1;
842 for (tag = t1->tag; tag; tag=tag->next) {
843 if (-1 == dstring_append_char(ds, '\t'))
844 return -1;
845 if (-1 == dstring_nappend(ds, tag->str, tag->len))
846 return -1;
847 }
848 if (-1 == dstring_append_char(ds, '\n'))
849 return -1;
850 t1 = t1->next;
851 } while (t1 != t2);
852 }
853 }
854
855 // The above loop will have found all header entries as we cannot add to
856 // hdr->h without also adding to the type_order / ntypes.
857
858 HashTableIterDestroy(iter);
859
860 dstring_destroy(hdr->text);
861 hdr->text = ds;
862
863 return 0;
864 }
865
866
867 /*
868 * Creates an empty SAM header, ready to be populated.
869 *
870 * Returns a SAM_hdr struct on success (free with sam_hdr_free())
871 * NULL on failure
872 */
sam_hdr_new()873 SAM_hdr *sam_hdr_new() {
874 SAM_hdr *sh = calloc(1, sizeof(*sh));
875
876 if (!sh)
877 return NULL;
878
879 sh->ntypes = 0;
880 if (!(sh->type_order = dstring_create(NULL)))
881 goto err;
882
883 sh->h = HashTableCreate(16, HASH_FUNC_HSIEH |
884 HASH_DYNAMIC_SIZE);
885 if (!sh->h)
886 goto err;
887
888 sh->ID_cnt = 1;
889 sh->ref_count = 1;
890
891 sh->nref = 0;
892 sh->ref = NULL;
893 if (!(sh->ref_hash = HashTableCreate(16, HASH_FUNC_HSIEH |
894 HASH_DYNAMIC_SIZE |
895 HASH_NONVOLATILE_KEYS)))
896 goto err;
897
898 sh->nrg = 0;
899 sh->rg = NULL;
900 if (!(sh->rg_hash = HashTableCreate(16, HASH_FUNC_HSIEH |
901 HASH_DYNAMIC_SIZE |
902 HASH_NONVOLATILE_KEYS)))
903 goto err;
904
905 sh->npg = 0;
906 sh->pg = NULL;
907 sh->npg_end = sh->npg_end_alloc = 0;
908 sh->pg_end = NULL;
909 if (!(sh->pg_hash = HashTableCreate(16, HASH_FUNC_HSIEH |
910 HASH_DYNAMIC_SIZE |
911 HASH_NONVOLATILE_KEYS)))
912 goto err;
913
914 if (!(sh->text = dstring_create(NULL)))
915 goto err;
916
917 if (!(sh->tag_pool = pool_create(sizeof(SAM_hdr_tag))))
918 goto err;
919
920 if (!(sh->type_pool = pool_create(sizeof(SAM_hdr_type))))
921 goto err;
922
923 if (!(sh->str_pool = string_pool_create(8192)))
924 goto err;
925
926 return sh;
927
928 err:
929 if (sh->type_order)
930 dstring_destroy(sh->type_order);
931
932 if (sh->h)
933 HashTableDestroy(sh->h, 0);
934
935 if (sh->tag_pool)
936 pool_destroy(sh->tag_pool);
937
938 if (sh->type_pool)
939 pool_destroy(sh->type_pool);
940
941 if (sh->str_pool)
942 string_pool_destroy(sh->str_pool);
943
944 free(sh);
945
946 return NULL;
947 }
948
949
950 /*
951 * Tokenises a SAM header into a hash table.
952 * Also extracts a few bits on specific data types, such as @RG lines.
953 *
954 * Returns a SAM_hdr struct on success (free with sam_hdr_free())
955 * NULL on failure
956 */
sam_hdr_parse(const char * hdr,int len)957 SAM_hdr *sam_hdr_parse(const char *hdr, int len) {
958 /* Make an empty SAM_hdr */
959 SAM_hdr *sh;
960
961 sh = sam_hdr_new();
962 if (NULL == sh) return NULL;
963
964 if (NULL == hdr) return sh; // empty header is permitted
965
966 /* Parse the header, line by line */
967 if (-1 == sam_hdr_add_lines(sh, hdr, len)) {
968 sam_hdr_free(sh);
969 return NULL;
970 }
971
972 /* Obtain sort order */
973 sh->sort_order = sam_hdr_parse_sort_order(sh);
974
975 //sam_hdr_dump(sh);
976 //sam_hdr_add(sh, "RG", "ID", "foo", "SM", "bar", NULL);
977 //sam_hdr_rebuild(sh);
978 //printf(">>%s<<", DSTRING_STR(sh->text));
979
980 //parse_references(sh);
981 //parse_read_groups(sh);
982
983 sam_hdr_link_pg(sh);
984 //sam_hdr_dump(sh);
985
986 return sh;
987 }
988
989 /*
990 * Produces a duplicate copy of hdr and returns it.
991 * Returns NULL on failure
992 */
sam_hdr_dup(SAM_hdr * hdr)993 SAM_hdr *sam_hdr_dup(SAM_hdr *hdr) {
994 if (-1 == sam_hdr_rebuild(hdr))
995 return NULL;
996
997 return sam_hdr_parse(sam_hdr_str(hdr), sam_hdr_length(hdr));
998 }
999
1000 /*! Increments a reference count on hdr.
1001 *
1002 * This permits multiple files to share the same header, all calling
1003 * sam_hdr_free when done, without causing errors for other open files.
1004 */
sam_hdr_incr_ref(SAM_hdr * hdr)1005 void sam_hdr_incr_ref(SAM_hdr *hdr) {
1006 hdr->ref_count++;
1007 }
1008
1009 /*! Increments a reference count on hdr.
1010 *
1011 * This permits multiple files to share the same header, all calling
1012 * sam_hdr_free when done, without causing errors for other open files.
1013 *
1014 * If the reference count hits zero then the header is automatically
1015 * freed. This makes it a synonym for sam_hdr_free().
1016 */
sam_hdr_decr_ref(SAM_hdr * hdr)1017 void sam_hdr_decr_ref(SAM_hdr *hdr) {
1018 sam_hdr_free(hdr);
1019 }
1020
1021 /*! Deallocates all storage used by a SAM_hdr struct.
1022 *
1023 * This also decrements the header reference count. If after decrementing
1024 * it is still non-zero then the header is assumed to be in use by another
1025 * caller and the free is not done.
1026 *
1027 * This is a synonym for sam_hdr_dec_ref().
1028 */
sam_hdr_free(SAM_hdr * hdr)1029 void sam_hdr_free(SAM_hdr *hdr) {
1030 if (!hdr)
1031 return;
1032
1033 if (--hdr->ref_count > 0)
1034 return;
1035
1036 if (hdr->type_order)
1037 dstring_destroy(hdr->type_order);
1038
1039 if (hdr->text)
1040 dstring_destroy(hdr->text);
1041
1042 if (hdr->h)
1043 HashTableDestroy(hdr->h, 0);
1044
1045 if (hdr->ref_hash)
1046 HashTableDestroy(hdr->ref_hash, 0);
1047
1048 if (hdr->ref) {
1049 int i;
1050 for (i = 0; i < hdr->nref; i++)
1051 if (hdr->ref[i].name)
1052 free(hdr->ref[i].name);
1053 free(hdr->ref);
1054 }
1055
1056 if (hdr->rg_hash)
1057 HashTableDestroy(hdr->rg_hash, 0);
1058
1059 if (hdr->rg) {
1060 int i;
1061 for (i = 0; i < hdr->nrg; i++)
1062 if (hdr->rg[i].name)
1063 free(hdr->rg[i].name);
1064 free(hdr->rg);
1065 }
1066
1067 if (hdr->pg_hash)
1068 HashTableDestroy(hdr->pg_hash, 0);
1069
1070 if (hdr->pg) {
1071 int i;
1072 for (i = 0; i < hdr->npg; i++)
1073 if (hdr->pg[i].name)
1074 free(hdr->pg[i].name);
1075 free(hdr->pg);
1076 }
1077
1078 if (hdr->pg_end)
1079 free(hdr->pg_end);
1080
1081 if (hdr->type_pool)
1082 pool_destroy(hdr->type_pool);
1083
1084 if (hdr->tag_pool)
1085 pool_destroy(hdr->tag_pool);
1086
1087 if (hdr->str_pool)
1088 string_pool_destroy(hdr->str_pool);
1089
1090 free(hdr);
1091 }
1092
sam_hdr_length(SAM_hdr * hdr)1093 int sam_hdr_length(SAM_hdr *hdr) {
1094 return dstring_length(hdr->text);
1095 }
1096
sam_hdr_str(SAM_hdr * hdr)1097 char *sam_hdr_str(SAM_hdr *hdr) {
1098 return dstring_str(hdr->text);
1099 }
1100
1101 /*
1102 * Looks up a reference sequence by name and returns the numerical ID.
1103 * Returns -1 if unknown reference.
1104 */
sam_hdr_name2ref(SAM_hdr * hdr,char * ref)1105 int sam_hdr_name2ref(SAM_hdr *hdr, char *ref) {
1106 HashItem *hi = HashTableSearch(hdr->ref_hash, ref, strlen(ref));
1107 return hi ? hi->data.i : -1;
1108 }
1109
1110 /*
1111 * Looks up a read-group by name and returns a pointer to the start of the
1112 * associated tag list.
1113 *
1114 * Returns NULL on failure
1115 */
sam_hdr_find_rg(SAM_hdr * hdr,char * rg)1116 SAM_RG *sam_hdr_find_rg(SAM_hdr *hdr, char *rg) {
1117 HashItem *hi = HashTableSearch(hdr->rg_hash, rg, 0);
1118 return hi ? &hdr->rg[hi->data.i] : NULL;
1119 }
1120
1121
1122 /*
1123 * Fixes any PP links in @PG headers.
1124 * If the entries are in order then this doesn't need doing, but incase
1125 * our header is out of order this goes through the sh->pg[] array
1126 * setting the prev_id field.
1127 *
1128 * Note we can have multiple complete chains. This code should identify the
1129 * tails of these chains as these are the entries we have to link to in
1130 * subsequent PP records.
1131 *
1132 * Returns 0 on sucess
1133 * -1 on failure (indicating broken PG/PP records)
1134 */
sam_hdr_link_pg(SAM_hdr * hdr)1135 int sam_hdr_link_pg(SAM_hdr *hdr) {
1136 int i, j, ret = 0;
1137
1138 hdr->npg_end_alloc = hdr->npg;
1139 hdr->pg_end = realloc(hdr->pg_end, hdr->npg * sizeof(*hdr->pg_end));
1140 if (!hdr->pg_end)
1141 return -1;
1142
1143 for (i = 0; i < hdr->npg; i++)
1144 hdr->pg_end[i] = i;
1145
1146 for (i = 0; i < hdr->npg; i++) {
1147 HashItem *hi;
1148 SAM_hdr_tag *tag;
1149
1150 for (tag = hdr->pg[i].tag; tag; tag = tag->next) {
1151 if (tag->str[0] == 'P' && tag->str[1] == 'P')
1152 break;
1153 }
1154 if (!tag) {
1155 /* Chain start points */
1156 continue;
1157 }
1158
1159 hi = HashTableSearch(hdr->pg_hash, tag->str+3, tag->len-3);
1160 if (!hi) {
1161 ret = -1;
1162 continue;
1163 }
1164
1165 hdr->pg[i].prev_id = hdr->pg[hi->data.i].id;
1166 hdr->pg_end[hi->data.i] = -1;
1167 }
1168
1169 for (i = j = 0; i < hdr->npg; i++) {
1170 if (hdr->pg_end[i] != -1)
1171 hdr->pg_end[j++] = hdr->pg_end[i];
1172 }
1173 hdr->npg_end = j;
1174
1175 return ret;
1176 }
1177
1178 /*
1179 * Returns a unique ID from a base name.
1180 *
1181 * The value returned is valid until the next call to
1182 * this function.
1183 */
sam_hdr_PG_ID(SAM_hdr * sh,const char * name)1184 const char *sam_hdr_PG_ID(SAM_hdr *sh, const char *name) {
1185 if (!(HashTableSearch(sh->pg_hash, (char *)name, 0)))
1186 return name;
1187
1188 do {
1189 sprintf(sh->ID_buf, "%.1000s.%d", name, sh->ID_cnt++);
1190 } while (HashTableSearch(sh->pg_hash, sh->ID_buf, 0));
1191
1192 return sh->ID_buf;
1193 }
1194
1195 /*
1196 * Add an @PG line.
1197 *
1198 * If we wish complete control over this use sam_hdr_add() directly. This
1199 * function uses that, but attempts to do a lot of tedious house work for
1200 * you too.
1201 *
1202 * - It will generate a suitable ID if the supplied one clashes.
1203 * - It will generate multiple @PG records if we have multiple PG chains.
1204 *
1205 * Call it as per sam_hdr_add() with a series of key,value pairs ending
1206 * in NULL.
1207 *
1208 * Returns 0 on success
1209 * -1 on failure
1210 */
sam_hdr_add_PG(SAM_hdr * sh,const char * name,...)1211 int sam_hdr_add_PG(SAM_hdr *sh, const char *name, ...) {
1212 va_list args;
1213 va_start(args, name);
1214
1215 if (sh->npg_end) {
1216 /* Copy ends array to avoid us looping while modifying it */
1217 int *end = malloc(sh->npg_end * sizeof(int));
1218 int i, nends = sh->npg_end;
1219
1220 if (!end)
1221 return -1;
1222
1223 memcpy(end, sh->pg_end, nends * sizeof(*end));
1224
1225 for (i = 0; i < nends; i++) {
1226 if (-1 == sam_hdr_vadd(sh, "PG", args,
1227 "ID", sam_hdr_PG_ID(sh, name),
1228 "PN", name,
1229 "PP", sh->pg[end[i]].name,
1230 NULL)) {
1231 free(end);
1232 return -1;
1233 }
1234 }
1235
1236 free(end);
1237 } else {
1238 if (-1 == sam_hdr_vadd(sh, "PG", args,
1239 "ID", sam_hdr_PG_ID(sh, name),
1240 "PN", name,
1241 NULL))
1242 return -1;
1243 }
1244
1245 //sam_hdr_dump(sh);
1246
1247 return 0;
1248 }
1249
1250 /*
1251 * A function to help with construction of CL tags in @PG records.
1252 * Takes an argc, argv pair and returns a single space-separated string.
1253 * This string should be deallocated by the calling function.
1254 *
1255 * Returns malloced char * on success
1256 * NULL on failure
1257 */
stringify_argv(int argc,char * argv[])1258 char *stringify_argv(int argc, char *argv[]) {
1259 char *str, *cp;
1260 size_t nbytes = 1;
1261 int i, j;
1262
1263 /* Allocate */
1264 for (i = 0; i < argc; i++) {
1265 nbytes += strlen(argv[i]) + 1;
1266 }
1267 if (!(str = malloc(nbytes)))
1268 return NULL;
1269
1270 /* Copy */
1271 cp = str;
1272 for (i = 0; i < argc; i++) {
1273 j = 0;
1274 while (argv[i][j]) {
1275 if (argv[i][j] == '\t')
1276 *cp++ = ' ';
1277 else
1278 *cp++ = argv[i][j];
1279 j++;
1280 }
1281 *cp++ = ' ';
1282 }
1283 *cp++ = 0;
1284
1285 return str;
1286 }
1287