1 /*
2 Copyright (c) 2013 Genome Research Ltd.
3 Author: James Bonfield <jkb@sanger.ac.uk>
4
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
7
8 1. Redistributions of source code must retain the above copyright notice,
9 this list of conditions and the following disclaimer.
10
11 2. Redistributions in binary form must reproduce the above copyright notice,
12 this list of conditions and the following disclaimer in the documentation
13 and/or other materials provided with the distribution.
14
15 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16 Institute nor the names of its contributors may be used to endorse or promote
17 products derived from this software without specific prior written permission.
18
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31 #include <config.h>
32
33 #include <string.h>
34 #include <assert.h>
35
36 #include "hts_internal.h"
37 #include "cram/sam_header.h"
38 #include "cram/string_alloc.h"
39
sam_hdr_error(char * msg,char * line,int len,int lno)40 static void sam_hdr_error(char *msg, char *line, int len, int lno) {
41 int j;
42
43 for (j = 0; j < len && line[j] != '\n'; j++)
44 ;
45 hts_log_error("%s at line %d: \"%.*s\"", msg, lno, j, line);
46 }
47
sam_hdr_dump(SAM_hdr * hdr)48 void sam_hdr_dump(SAM_hdr *hdr) {
49 khint_t k;
50 int i;
51
52 printf("===DUMP===\n");
53 for (k = kh_begin(hdr->h); k != kh_end(hdr->h); k++) {
54 SAM_hdr_type *t1, *t2;
55 char c[2];
56
57 if (!kh_exist(hdr->h, k))
58 continue;
59
60 t1 = t2 = kh_val(hdr->h, k);
61 c[0] = kh_key(hdr->h, k)>>8;
62 c[1] = kh_key(hdr->h, k)&0xff;
63 printf("Type %.2s, count %d\n", c, t1->prev->order+1);
64
65 do {
66 SAM_hdr_tag *tag;
67 printf(">>>%d ", t1->order);
68 for (tag = t1->tag; tag; tag=tag->next) {
69 printf("\"%.2s\":\"%.*s\"\t",
70 tag->str, tag->len-3, tag->str+3);
71 }
72 putchar('\n');
73 t1 = t1->next;
74 } while (t1 != t2);
75 }
76
77 /* Dump out PG chains */
78 printf("\n@PG chains:\n");
79 for (i = 0; i < hdr->npg_end; i++) {
80 int j;
81 printf(" %d:", i);
82 for (j = hdr->pg_end[i]; j != -1; j = hdr->pg[j].prev_id) {
83 printf("%s%d(%.*s)",
84 j == hdr->pg_end[i] ? " " : "->",
85 j, hdr->pg[j].name_len, hdr->pg[j].name);
86 }
87 printf("\n");
88 }
89
90 puts("===END DUMP===");
91 }
92
93 /* Updates the hash tables in the SAM_hdr structure.
94 *
95 * Returns 0 on success;
96 * -1 on failure
97 */
sam_hdr_update_hashes(SAM_hdr * sh,int type,SAM_hdr_type * h_type)98 static int sam_hdr_update_hashes(SAM_hdr *sh,
99 int type,
100 SAM_hdr_type *h_type) {
101 /* Add to reference hash? */
102 if ((type>>8) == 'S' && (type&0xff) == 'Q') {
103 SAM_hdr_tag *tag;
104 SAM_SQ *new_ref;
105 int nref = sh->nref;
106
107 new_ref = realloc(sh->ref, (sh->nref+1)*sizeof(*sh->ref));
108 if (!new_ref)
109 return -1;
110 sh->ref = new_ref;
111
112 tag = h_type->tag;
113 sh->ref[nref].name = NULL;
114 sh->ref[nref].len = 0;
115 sh->ref[nref].ty = h_type;
116 sh->ref[nref].tag = tag;
117
118 while (tag) {
119 if (tag->str[0] == 'S' && tag->str[1] == 'N') {
120 if (!(sh->ref[nref].name = malloc(tag->len)))
121 return -1;
122 strncpy(sh->ref[nref].name, tag->str+3, tag->len-3);
123 sh->ref[nref].name[tag->len-3] = 0;
124 } else if (tag->str[0] == 'L' && tag->str[1] == 'N') {
125 sh->ref[nref].len = atoi(tag->str+3);
126 }
127 tag = tag->next;
128 }
129
130 if (sh->ref[nref].name) {
131 khint_t k;
132 int r;
133 k = kh_put(m_s2i, sh->ref_hash, sh->ref[nref].name, &r);
134 if (-1 == r) return -1;
135 kh_val(sh->ref_hash, k) = nref;
136 } else {
137 return -1; // SN should be present, according to spec.
138 }
139
140 sh->nref++;
141 }
142
143 /* Add to read-group hash? */
144 if ((type>>8) == 'R' && (type&0xff) == 'G') {
145 SAM_hdr_tag *tag;
146 SAM_RG *new_rg;
147 int nrg = sh->nrg;
148
149 new_rg = realloc(sh->rg, (sh->nrg+1)*sizeof(*sh->rg));
150 if (!new_rg)
151 return -1;
152 sh->rg = new_rg;
153
154 tag = h_type->tag;
155 sh->rg[nrg].name = NULL;
156 sh->rg[nrg].name_len = 0;
157 sh->rg[nrg].ty = h_type;
158 sh->rg[nrg].tag = tag;
159 sh->rg[nrg].id = nrg;
160
161 while (tag) {
162 if (tag->str[0] == 'I' && tag->str[1] == 'D') {
163 if (!(sh->rg[nrg].name = malloc(tag->len)))
164 return -1;
165 strncpy(sh->rg[nrg].name, tag->str+3, tag->len-3);
166 sh->rg[nrg].name[tag->len-3] = 0;
167 sh->rg[nrg].name_len = strlen(sh->rg[nrg].name);
168 }
169 tag = tag->next;
170 }
171
172 if (sh->rg[nrg].name) {
173 khint_t k;
174 int r;
175 k = kh_put(m_s2i, sh->rg_hash, sh->rg[nrg].name, &r);
176 if (-1 == r) return -1;
177 kh_val(sh->rg_hash, k) = nrg;
178 } else {
179 return -1; // ID should be present, according to spec.
180 }
181
182 sh->nrg++;
183 }
184
185 /* Add to program hash? */
186 if ((type>>8) == 'P' && (type&0xff) == 'G') {
187 SAM_hdr_tag *tag;
188 SAM_PG *new_pg;
189 int npg = sh->npg;
190
191 new_pg = realloc(sh->pg, (sh->npg+1)*sizeof(*sh->pg));
192 if (!new_pg)
193 return -1;
194 sh->pg = new_pg;
195
196 tag = h_type->tag;
197 sh->pg[npg].name = NULL;
198 sh->pg[npg].name_len = 0;
199 sh->pg[npg].ty = h_type;
200 sh->pg[npg].tag = tag;
201 sh->pg[npg].id = npg;
202 sh->pg[npg].prev_id = -1;
203
204 while (tag) {
205 if (tag->str[0] == 'I' && tag->str[1] == 'D') {
206 if (!(sh->pg[npg].name = malloc(tag->len)))
207 return -1;
208 strncpy(sh->pg[npg].name, tag->str+3, tag->len-3);
209 sh->pg[npg].name[tag->len-3] = 0;
210 sh->pg[npg].name_len = strlen(sh->pg[npg].name);
211 } else if (tag->str[0] == 'P' && tag->str[1] == 'P') {
212 // Resolve later if needed
213 khint_t k;
214 char tmp = tag->str[tag->len]; tag->str[tag->len] = 0;
215 k = kh_get(m_s2i, sh->pg_hash, tag->str+3);
216 tag->str[tag->len] = tmp;
217
218 if (k != kh_end(sh->pg_hash)) {
219 int p_id = kh_val(sh->pg_hash, k);
220 sh->pg[npg].prev_id = sh->pg[p_id].id;
221
222 /* Unmark previous entry as a PG termination */
223 if (sh->npg_end > 0 &&
224 sh->pg_end[sh->npg_end-1] == p_id) {
225 sh->npg_end--;
226 } else {
227 int i;
228 for (i = 0; i < sh->npg_end; i++) {
229 if (sh->pg_end[i] == p_id) {
230 memmove(&sh->pg_end[i], &sh->pg_end[i+1],
231 (sh->npg_end-i-1)*sizeof(*sh->pg_end));
232 sh->npg_end--;
233 }
234 }
235 }
236 } else {
237 sh->pg[npg].prev_id = -1;
238 }
239 }
240 tag = tag->next;
241 }
242
243 if (sh->pg[npg].name) {
244 khint_t k;
245 int r;
246 k = kh_put(m_s2i, sh->pg_hash, sh->pg[npg].name, &r);
247 if (-1 == r) return -1;
248 kh_val(sh->pg_hash, k) = npg;
249 } else {
250 return -1; // ID should be present, according to spec.
251 }
252
253 /* Add to npg_end[] array. Remove later if we find a PP line */
254 if (sh->npg_end >= sh->npg_end_alloc) {
255 int *new_pg_end;
256 int new_alloc = sh->npg_end_alloc ? sh->npg_end_alloc*2 : 4;
257
258 new_pg_end = realloc(sh->pg_end, new_alloc * sizeof(int));
259 if (!new_pg_end)
260 return -1;
261 sh->npg_end_alloc = new_alloc;
262 sh->pg_end = new_pg_end;
263 }
264 sh->pg_end[sh->npg_end++] = npg;
265
266 sh->npg++;
267 }
268
269 return 0;
270 }
271
272 /*
273 * Appends a formatted line to an existing SAM header.
274 * Line is a full SAM header record, eg "@SQ\tSN:foo\tLN:100", with
275 * optional new-line. If it contains more than 1 line then multiple lines
276 * will be added in order.
277 *
278 * Input text is of maximum length len or as terminated earlier by a NUL.
279 * Len may be 0 if unknown, in which case lines must be NUL-terminated.
280 *
281 * Returns 0 on success
282 * -1 on failure
283 */
sam_hdr_add_lines(SAM_hdr * sh,const char * lines,int len)284 int sam_hdr_add_lines(SAM_hdr *sh, const char *lines, int len) {
285 int i, lno, text_offset;
286 char *hdr;
287
288 if (!len)
289 len = strlen(lines);
290
291 text_offset = ks_len(&sh->text);
292 if (EOF == kputsn(lines, len, &sh->text))
293 return -1;
294 hdr = ks_str(&sh->text) + text_offset;
295
296 for (i = 0, lno = 1; i < len && hdr[i] != '\0'; i++, lno++) {
297 khint32_t type;
298 khint_t k;
299
300 int l_start = i, new;
301 SAM_hdr_type *h_type;
302 SAM_hdr_tag *h_tag, *last;
303
304 if (hdr[i] != '@') {
305 int j;
306 for (j = i; j < len && hdr[j] != '\0' && hdr[j] != '\n'; j++)
307 ;
308 sam_hdr_error("Header line does not start with '@'",
309 &hdr[l_start], len - l_start, lno);
310 return -1;
311 }
312
313 type = (hdr[i+1]<<8) | hdr[i+2];
314 if (hdr[i+1] < 'A' || hdr[i+1] > 'z' ||
315 hdr[i+2] < 'A' || hdr[i+2] > 'z') {
316 sam_hdr_error("Header line does not have a two character key",
317 &hdr[l_start], len - l_start, lno);
318 return -1;
319 }
320
321 i += 3;
322 if (hdr[i] == '\n')
323 continue;
324
325 // Add the header line type
326 if (!(h_type = pool_alloc(sh->type_pool)))
327 return -1;
328 if (-1 == (k = kh_put(sam_hdr, sh->h, type, &new)))
329 return -1;
330
331 // Form the ring, either with self or other lines of this type
332 if (!new) {
333 SAM_hdr_type *t = kh_val(sh->h, k), *p;
334 p = t->prev;
335
336 assert(p->next == t);
337 p->next = h_type;
338 h_type->prev = p;
339
340 t->prev = h_type;
341 h_type->next = t;
342 h_type->order = p->order+1;
343 } else {
344 kh_val(sh->h, k) = h_type;
345 h_type->prev = h_type->next = h_type;
346 h_type->order = 0;
347 }
348
349 // Parse the tags on this line
350 last = NULL;
351 if ((type>>8) == 'C' && (type&0xff) == 'O') {
352 int j;
353 if (hdr[i] != '\t') {
354 sam_hdr_error("Missing tab",
355 &hdr[l_start], len - l_start, lno);
356 return -1;
357 }
358
359 for (j = ++i; j < len && hdr[j] != '\0' && hdr[j] != '\n'; j++)
360 ;
361
362 if (!(h_type->tag = h_tag = pool_alloc(sh->tag_pool)))
363 return -1;
364 h_tag->str = string_ndup(sh->str_pool, &hdr[i], j-i);
365 h_tag->len = j-i;
366 h_tag->next = NULL;
367 if (!h_tag->str)
368 return -1;
369
370 i = j;
371
372 } else {
373 do {
374 int j;
375 if (hdr[i] != '\t') {
376 sam_hdr_error("Missing tab",
377 &hdr[l_start], len - l_start, lno);
378 return -1;
379 }
380
381 for (j = ++i; j < len && hdr[j] != '\0' && hdr[j] != '\n' && hdr[j] != '\t'; j++)
382 ;
383
384 if (!(h_tag = pool_alloc(sh->tag_pool)))
385 return -1;
386 h_tag->str = string_ndup(sh->str_pool, &hdr[i], j-i);
387 h_tag->len = j-i;
388 h_tag->next = NULL;
389 if (!h_tag->str)
390 return -1;
391
392 if (h_tag->len < 3 || h_tag->str[2] != ':') {
393 sam_hdr_error("Malformed key:value pair",
394 &hdr[l_start], len - l_start, lno);
395 return -1;
396 }
397
398 if (last)
399 last->next = h_tag;
400 else
401 h_type->tag = h_tag;
402
403 last = h_tag;
404 i = j;
405 } while (i < len && hdr[i] != '\0' && hdr[i] != '\n');
406 }
407
408 /* Update RG/SQ hashes */
409 if (-1 == sam_hdr_update_hashes(sh, type, h_type))
410 return -1;
411 }
412
413 return 0;
414 }
415
416 /*
417 * Adds a single line to a SAM header.
418 * Specify type and one or more key,value pairs, ending with the NULL key.
419 * Eg. sam_hdr_add(h, "SQ", "ID", "foo", "LN", "100", NULL).
420 *
421 * Returns index for specific entry on success (eg 2nd SQ, 4th RG)
422 * -1 on failure
423 */
sam_hdr_add(SAM_hdr * sh,const char * type,...)424 int sam_hdr_add(SAM_hdr *sh, const char *type, ...) {
425 va_list args;
426 va_start(args, type);
427 return sam_hdr_vadd(sh, type, args, NULL);
428 }
429
430 /*
431 * sam_hdr_add with a va_list interface.
432 *
433 * Note: this function invokes va_arg at least once, making the value
434 * of ap indeterminate after the return. The caller should call
435 * va_start/va_end before/after calling this function or use va_copy.
436 */
sam_hdr_vadd(SAM_hdr * sh,const char * type,va_list ap,...)437 int sam_hdr_vadd(SAM_hdr *sh, const char *type, va_list ap, ...) {
438 va_list args;
439 SAM_hdr_type *h_type;
440 SAM_hdr_tag *h_tag, *last;
441 int new;
442 khint32_t type_i = (type[0]<<8) | type[1], k;
443
444 if (EOF == kputc_('@', &sh->text))
445 return -1;
446 if (EOF == kputsn(type, 2, &sh->text))
447 return -1;
448
449 if (!(h_type = pool_alloc(sh->type_pool)))
450 return -1;
451 if (-1 == (k = kh_put(sam_hdr, sh->h, type_i, &new)))
452 return -1;
453
454 // Form the ring, either with self or other lines of this type
455 if (!new) {
456 SAM_hdr_type *t = kh_val(sh->h, k), *p;
457 p = t->prev;
458
459 assert(p->next == t);
460 p->next = h_type;
461 h_type->prev = p;
462
463 t->prev = h_type;
464 h_type->next = t;
465 h_type->order = p->order + 1;
466 } else {
467 kh_val(sh->h, k) = h_type;
468 h_type->prev = h_type->next = h_type;
469 h_type->order = 0;
470 }
471
472 last = NULL;
473
474 // Any ... varargs
475 va_start(args, ap);
476 for (;;) {
477 char *k, *v;
478 int idx;
479
480 if (!(k = (char *)va_arg(args, char *)))
481 break;
482 v = va_arg(args, char *);
483
484 if (EOF == kputc_('\t', &sh->text))
485 return -1;
486
487 if (!(h_tag = pool_alloc(sh->tag_pool)))
488 return -1;
489 idx = ks_len(&sh->text);
490
491 if (EOF == kputs(k, &sh->text))
492 return -1;
493 if (EOF == kputc_(':', &sh->text))
494 return -1;
495 if (EOF == kputs(v, &sh->text))
496 return -1;
497
498 h_tag->len = ks_len(&sh->text) - idx;
499 h_tag->str = string_ndup(sh->str_pool,
500 ks_str(&sh->text) + idx,
501 h_tag->len);
502 h_tag->next = NULL;
503 if (!h_tag->str)
504 return -1;
505
506 if (last)
507 last->next = h_tag;
508 else
509 h_type->tag = h_tag;
510
511 last = h_tag;
512 }
513 va_end(args);
514
515 // Plus the specified va_list params
516 for (;;) {
517 char *k, *v;
518 int idx;
519
520 if (!(k = (char *)va_arg(ap, char *)))
521 break;
522 v = va_arg(ap, char *);
523
524 if (EOF == kputc_('\t', &sh->text))
525 return -1;
526
527 if (!(h_tag = pool_alloc(sh->tag_pool)))
528 return -1;
529 idx = ks_len(&sh->text);
530
531 if (EOF == kputs(k, &sh->text))
532 return -1;
533 if (EOF == kputc_(':', &sh->text))
534 return -1;
535 if (EOF == kputs(v, &sh->text))
536 return -1;
537
538 h_tag->len = ks_len(&sh->text) - idx;
539 h_tag->str = string_ndup(sh->str_pool,
540 ks_str(&sh->text) + idx,
541 h_tag->len);
542 h_tag->next = NULL;
543 if (!h_tag->str)
544 return -1;
545
546 if (last)
547 last->next = h_tag;
548 else
549 h_type->tag = h_tag;
550
551 last = h_tag;
552 }
553 va_end(ap);
554
555 if (EOF == kputc('\n', &sh->text))
556 return -1;
557
558 int itype = (type[0]<<8) | type[1];
559 if (-1 == sam_hdr_update_hashes(sh, itype, h_type))
560 return -1;
561
562 return h_type->order;
563 }
564
565 /*
566 * Returns the first header item matching 'type'. If ID is non-NULL it checks
567 * for the tag ID: and compares against the specified ID.
568 *
569 * Returns NULL if no type/ID is found
570 */
sam_hdr_find(SAM_hdr * hdr,char * type,char * ID_key,char * ID_value)571 SAM_hdr_type *sam_hdr_find(SAM_hdr *hdr, char *type,
572 char *ID_key, char *ID_value) {
573 SAM_hdr_type *t1, *t2;
574 int itype = (type[0]<<8)|(type[1]);
575 khint_t k;
576
577 /* Special case for types we have prebuilt hashes on */
578 if (ID_key) {
579 if (type[0] == 'S' && type[1] == 'Q' &&
580 ID_key[0] == 'S' && ID_key[1] == 'N') {
581 k = kh_get(m_s2i, hdr->ref_hash, ID_value);
582 return k != kh_end(hdr->ref_hash)
583 ? hdr->ref[kh_val(hdr->ref_hash, k)].ty
584 : NULL;
585 }
586
587 if (type[0] == 'R' && type[1] == 'G' &&
588 ID_key[0] == 'I' && ID_key[1] == 'D') {
589 k = kh_get(m_s2i, hdr->rg_hash, ID_value);
590 return k != kh_end(hdr->rg_hash)
591 ? hdr->rg[kh_val(hdr->rg_hash, k)].ty
592 : NULL;
593 }
594
595 if (type[0] == 'P' && type[1] == 'G' &&
596 ID_key[0] == 'I' && ID_key[1] == 'D') {
597 k = kh_get(m_s2i, hdr->pg_hash, ID_value);
598 return k != kh_end(hdr->pg_hash)
599 ? hdr->pg[kh_val(hdr->pg_hash, k)].ty
600 : NULL;
601 }
602 }
603
604 k = kh_get(sam_hdr, hdr->h, itype);
605 if (k == kh_end(hdr->h))
606 return NULL;
607
608 if (!ID_key)
609 return kh_val(hdr->h, k);
610
611 t1 = t2 = kh_val(hdr->h, k);
612 do {
613 SAM_hdr_tag *tag;
614 for (tag = t1->tag; tag; tag = tag->next) {
615 if (tag->str[0] == ID_key[0] && tag->str[1] == ID_key[1]) {
616 char *cp1 = tag->str+3;
617 char *cp2 = ID_value;
618 while (*cp1 && *cp1 == *cp2)
619 cp1++, cp2++;
620 if (*cp2 || *cp1)
621 continue;
622 return t1;
623 }
624 }
625 t1 = t1->next;
626 } while (t1 != t2);
627
628 return NULL;
629 }
630
631 /*
632 * As per SAM_hdr_type, but returns a complete line of formatted text
633 * for a specific head type/ID combination. If ID is NULL then it returns
634 * the first line of the specified type.
635 *
636 * The returned string is malloced and should be freed by the calling
637 * function with free().
638 *
639 * Returns NULL if no type/ID is found.
640 */
sam_hdr_find_line(SAM_hdr * hdr,char * type,char * ID_key,char * ID_value)641 char *sam_hdr_find_line(SAM_hdr *hdr, char *type,
642 char *ID_key, char *ID_value) {
643 SAM_hdr_type *ty = sam_hdr_find(hdr, type, ID_key, ID_value);
644 kstring_t ks = KS_INITIALIZER;
645 SAM_hdr_tag *tag;
646 int r = 0;
647
648 if (!ty)
649 return NULL;
650
651 // Paste together the line from the hashed copy
652 r |= (kputc_('@', &ks) == EOF);
653 r |= (kputs(type, &ks) == EOF);
654 for (tag = ty->tag; tag; tag = tag->next) {
655 r |= (kputc_('\t', &ks) == EOF);
656 r |= (kputsn(tag->str, tag->len, &ks) == EOF);
657 }
658
659 if (r) {
660 KS_FREE(&ks);
661 return NULL;
662 }
663
664 return ks_str(&ks);
665 }
666
667
668 /*
669 * Looks for a specific key in a single sam header line.
670 * If prev is non-NULL it also fills this out with the previous tag, to
671 * permit use in key removal. *prev is set to NULL when the tag is the first
672 * key in the list. When a tag isn't found, prev (if non NULL) will be the last
673 * tag in the existing list.
674 *
675 * Returns the tag pointer on success
676 * NULL on failure
677 */
sam_hdr_find_key(SAM_hdr * sh,SAM_hdr_type * type,char * key,SAM_hdr_tag ** prev)678 SAM_hdr_tag *sam_hdr_find_key(SAM_hdr *sh,
679 SAM_hdr_type *type,
680 char *key,
681 SAM_hdr_tag **prev) {
682 SAM_hdr_tag *tag, *p = NULL;
683
684 for (tag = type->tag; tag; p = tag, tag = tag->next) {
685 if (tag->str[0] == key[0] && tag->str[1] == key[1]) {
686 if (prev)
687 *prev = p;
688 return tag;
689 }
690 }
691
692 if (prev)
693 *prev = p;
694
695 return NULL;
696 }
697
698
699 /*
700 * Adds or updates tag key,value pairs in a header line.
701 * Eg for adding M5 tags to @SQ lines or updating sort order for the
702 * @HD line (although use the sam_hdr_sort_order() function for
703 * HD manipulation, which is a wrapper around this funuction).
704 *
705 * Specify multiple key,value pairs ending in NULL.
706 *
707 * Returns 0 on success
708 * -1 on failure
709 */
sam_hdr_update(SAM_hdr * hdr,SAM_hdr_type * type,...)710 int sam_hdr_update(SAM_hdr *hdr, SAM_hdr_type *type, ...) {
711 va_list ap;
712
713 va_start(ap, type);
714
715 for (;;) {
716 char *k, *v;
717 int idx;
718 SAM_hdr_tag *tag, *prev;
719
720 if (!(k = (char *)va_arg(ap, char *)))
721 break;
722 v = va_arg(ap, char *);
723
724 tag = sam_hdr_find_key(hdr, type, k, &prev);
725 if (!tag) {
726 if (!(tag = pool_alloc(hdr->tag_pool)))
727 return -1;
728 if (prev)
729 prev->next = tag;
730 else
731 type->tag = tag;
732
733 tag->next = NULL;
734 }
735
736 idx = ks_len(&hdr->text);
737 if (ksprintf(&hdr->text, "%2.2s:%s", k, v) < 0)
738 return -1;
739 tag->len = ks_len(&hdr->text) - idx;
740 tag->str = string_ndup(hdr->str_pool,
741 ks_str(&hdr->text) + idx,
742 tag->len);
743 if (!tag->str)
744 return -1;
745 }
746
747 va_end(ap);
748
749 return 0;
750 }
751
752 #define K(a) (((a)[0]<<8)|((a)[1]))
753
754 /*
755 * Returns the sort order:
756 */
sam_hdr_sort_order(SAM_hdr * hdr)757 enum sam_sort_order sam_hdr_sort_order(SAM_hdr *hdr) {
758 return hdr->sort_order;
759 }
760
sam_hdr_parse_sort_order(SAM_hdr * hdr)761 static enum sam_sort_order sam_hdr_parse_sort_order(SAM_hdr *hdr) {
762 khint_t k;
763 enum sam_sort_order so;
764
765 so = ORDER_UNKNOWN;
766 k = kh_get(sam_hdr, hdr->h, K("HD"));
767 if (k != kh_end(hdr->h)) {
768 SAM_hdr_type *ty = kh_val(hdr->h, k);
769 SAM_hdr_tag *tag;
770 for (tag = ty->tag; tag; tag = tag->next) {
771 if (tag->str[0] == 'S' && tag->str[1] == 'O') {
772 if (strcmp(tag->str+3, "unsorted") == 0)
773 so = ORDER_UNSORTED;
774 else if (strcmp(tag->str+3, "queryname") == 0)
775 so = ORDER_NAME;
776 else if (strcmp(tag->str+3, "coordinate") == 0)
777 so = ORDER_COORD;
778 else if (strcmp(tag->str+3, "unknown") != 0)
779 hts_log_error("Unknown sort order field: %s", tag->str+3);
780 }
781 }
782 }
783
784 return so;
785 }
786
787
788 /*
789 * Reconstructs the kstring from the header hash table.
790 * Returns 0 on success
791 * -1 on failure
792 */
sam_hdr_rebuild(SAM_hdr * hdr)793 int sam_hdr_rebuild(SAM_hdr *hdr) {
794 /* Order: HD then others */
795 kstring_t ks = KS_INITIALIZER;
796 khint_t k;
797
798
799 k = kh_get(sam_hdr, hdr->h, K("HD"));
800 if (k != kh_end(hdr->h)) {
801 SAM_hdr_type *ty = kh_val(hdr->h, k);
802 SAM_hdr_tag *tag;
803 if (EOF == kputs("@HD", &ks))
804 return -1;
805 for (tag = ty->tag; tag; tag = tag->next) {
806 if (EOF == kputc_('\t', &ks))
807 return -1;
808 if (EOF == kputsn_(tag->str, tag->len, &ks))
809 return -1;
810 }
811 if (EOF == kputc('\n', &ks))
812 return -1;
813 }
814
815 for (k = kh_begin(hdr->h); k != kh_end(hdr->h); k++) {
816 SAM_hdr_type *t1, *t2;
817
818 if (!kh_exist(hdr->h, k))
819 continue;
820
821 if (kh_key(hdr->h, k) == K("HD"))
822 continue;
823
824 t1 = t2 = kh_val(hdr->h, k);
825 do {
826 SAM_hdr_tag *tag;
827 char c[2];
828
829 if (EOF == kputc_('@', &ks))
830 return -1;
831 c[0] = kh_key(hdr->h, k)>>8;
832 c[1] = kh_key(hdr->h, k)&0xff;
833 if (EOF == kputsn_(c, 2, &ks))
834 return -1;
835 for (tag = t1->tag; tag; tag=tag->next) {
836 if (EOF == kputc_('\t', &ks))
837 return -1;
838 if (EOF == kputsn_(tag->str, tag->len, &ks))
839 return -1;
840 }
841 if (EOF == kputc('\n', &ks))
842 return -1;
843 t1 = t1->next;
844 } while (t1 != t2);
845 }
846
847 if (ks_str(&hdr->text))
848 KS_FREE(&hdr->text);
849
850 hdr->text = ks;
851
852 return 0;
853 }
854
855
856 /*
857 * Creates an empty SAM header, ready to be populated.
858 *
859 * Returns a SAM_hdr struct on success (free with sam_hdr_free())
860 * NULL on failure
861 */
sam_hdr_new()862 SAM_hdr *sam_hdr_new() {
863 SAM_hdr *sh = calloc(1, sizeof(*sh));
864
865 if (!sh)
866 return NULL;
867
868 sh->h = kh_init(sam_hdr);
869 if (!sh->h)
870 goto err;
871
872 sh->ID_cnt = 1;
873 sh->ref_count = 1;
874
875 sh->nref = 0;
876 sh->ref = NULL;
877 if (!(sh->ref_hash = kh_init(m_s2i)))
878 goto err;
879
880 sh->nrg = 0;
881 sh->rg = NULL;
882 if (!(sh->rg_hash = kh_init(m_s2i)))
883 goto err;
884
885 sh->npg = 0;
886 sh->pg = NULL;
887 sh->npg_end = sh->npg_end_alloc = 0;
888 sh->pg_end = NULL;
889 if (!(sh->pg_hash = kh_init(m_s2i)))
890 goto err;
891
892 KS_INIT(&sh->text);
893
894 if (!(sh->tag_pool = pool_create(sizeof(SAM_hdr_tag))))
895 goto err;
896
897 if (!(sh->type_pool = pool_create(sizeof(SAM_hdr_type))))
898 goto err;
899
900 if (!(sh->str_pool = string_pool_create(8192)))
901 goto err;
902
903 return sh;
904
905 err:
906 if (sh->h)
907 kh_destroy(sam_hdr, sh->h);
908
909 if (sh->tag_pool)
910 pool_destroy(sh->tag_pool);
911
912 if (sh->type_pool)
913 pool_destroy(sh->type_pool);
914
915 if (sh->str_pool)
916 string_pool_destroy(sh->str_pool);
917
918 free(sh);
919
920 return NULL;
921 }
922
923
924 /*
925 * Tokenises a SAM header into a hash table.
926 * Also extracts a few bits on specific data types, such as @RG lines.
927 *
928 * Returns a SAM_hdr struct on success (free with sam_hdr_free())
929 * NULL on failure
930 */
sam_hdr_parse_(const char * hdr,int len)931 SAM_hdr *sam_hdr_parse_(const char *hdr, int len) {
932 /* Make an empty SAM_hdr */
933 SAM_hdr *sh;
934
935 sh = sam_hdr_new();
936 if (NULL == sh) return NULL;
937
938 if (NULL == hdr) return sh; // empty header is permitted
939
940 /* Parse the header, line by line */
941 if (-1 == sam_hdr_add_lines(sh, hdr, len)) {
942 sam_hdr_free(sh);
943 return NULL;
944 }
945
946 /* Obtain sort order */
947 sh->sort_order = sam_hdr_parse_sort_order(sh);
948
949 //sam_hdr_dump(sh);
950 //sam_hdr_add(sh, "RG", "ID", "foo", "SM", "bar", NULL);
951 //sam_hdr_rebuild(sh);
952 //printf(">>%s<<", ks_str(sh->text));
953
954 //parse_references(sh);
955 //parse_read_groups(sh);
956
957 sam_hdr_link_pg(sh);
958 //sam_hdr_dump(sh);
959
960 return sh;
961 }
962
963 /*
964 * Produces a duplicate copy of hdr and returns it.
965 * Returns NULL on failure
966 */
sam_hdr_dup(SAM_hdr * hdr)967 SAM_hdr *sam_hdr_dup(SAM_hdr *hdr) {
968 if (-1 == sam_hdr_rebuild(hdr))
969 return NULL;
970
971 return sam_hdr_parse_(sam_hdr_str(hdr), sam_hdr_length(hdr));
972 }
973
974 /*! Increments a reference count on hdr.
975 *
976 * This permits multiple files to share the same header, all calling
977 * sam_hdr_free when done, without causing errors for other open files.
978 */
sam_hdr_incr_ref(SAM_hdr * hdr)979 void sam_hdr_incr_ref(SAM_hdr *hdr) {
980 hdr->ref_count++;
981 }
982
983 /*! Increments a reference count on hdr.
984 *
985 * This permits multiple files to share the same header, all calling
986 * sam_hdr_free when done, without causing errors for other open files.
987 *
988 * If the reference count hits zero then the header is automatically
989 * freed. This makes it a synonym for sam_hdr_free().
990 */
sam_hdr_decr_ref(SAM_hdr * hdr)991 void sam_hdr_decr_ref(SAM_hdr *hdr) {
992 sam_hdr_free(hdr);
993 }
994
995 /*! Deallocates all storage used by a SAM_hdr struct.
996 *
997 * This also decrements the header reference count. If after decrementing
998 * it is still non-zero then the header is assumed to be in use by another
999 * caller and the free is not done.
1000 *
1001 * This is a synonym for sam_hdr_dec_ref().
1002 */
sam_hdr_free(SAM_hdr * hdr)1003 void sam_hdr_free(SAM_hdr *hdr) {
1004 if (!hdr)
1005 return;
1006
1007 if (--hdr->ref_count > 0)
1008 return;
1009
1010 if (ks_str(&hdr->text))
1011 KS_FREE(&hdr->text);
1012
1013 if (hdr->h)
1014 kh_destroy(sam_hdr, hdr->h);
1015
1016 if (hdr->ref_hash)
1017 kh_destroy(m_s2i, hdr->ref_hash);
1018
1019 if (hdr->ref) {
1020 int i;
1021 for (i = 0; i < hdr->nref; i++)
1022 if (hdr->ref[i].name)
1023 free(hdr->ref[i].name);
1024 free(hdr->ref);
1025 }
1026
1027 if (hdr->rg_hash)
1028 kh_destroy(m_s2i, hdr->rg_hash);
1029
1030 if (hdr->rg) {
1031 int i;
1032 for (i = 0; i < hdr->nrg; i++)
1033 if (hdr->rg[i].name)
1034 free(hdr->rg[i].name);
1035 free(hdr->rg);
1036 }
1037
1038 if (hdr->pg_hash)
1039 kh_destroy(m_s2i, hdr->pg_hash);
1040
1041 if (hdr->pg) {
1042 int i;
1043 for (i = 0; i < hdr->npg; i++)
1044 if (hdr->pg[i].name)
1045 free(hdr->pg[i].name);
1046 free(hdr->pg);
1047 }
1048
1049 if (hdr->pg_end)
1050 free(hdr->pg_end);
1051
1052 if (hdr->type_pool)
1053 pool_destroy(hdr->type_pool);
1054
1055 if (hdr->tag_pool)
1056 pool_destroy(hdr->tag_pool);
1057
1058 if (hdr->str_pool)
1059 string_pool_destroy(hdr->str_pool);
1060
1061 free(hdr);
1062 }
1063
sam_hdr_length(SAM_hdr * hdr)1064 int sam_hdr_length(SAM_hdr *hdr) {
1065 return ks_len(&hdr->text);
1066 }
1067
sam_hdr_str(SAM_hdr * hdr)1068 char *sam_hdr_str(SAM_hdr *hdr) {
1069 return ks_str(&hdr->text);
1070 }
1071
1072 /*
1073 * Looks up a reference sequence by name and returns the numerical ID.
1074 * Returns -1 if unknown reference.
1075 */
sam_hdr_name2ref(SAM_hdr * hdr,const char * ref)1076 int sam_hdr_name2ref(SAM_hdr *hdr, const char *ref) {
1077 khint_t k = kh_get(m_s2i, hdr->ref_hash, ref);
1078 return k == kh_end(hdr->ref_hash) ? -1 : kh_val(hdr->ref_hash, k);
1079 }
1080
1081 /*
1082 * Looks up a read-group by name and returns a pointer to the start of the
1083 * associated tag list.
1084 *
1085 * Returns NULL on failure
1086 */
sam_hdr_find_rg(SAM_hdr * hdr,const char * rg)1087 SAM_RG *sam_hdr_find_rg(SAM_hdr *hdr, const char *rg) {
1088 khint_t k = kh_get(m_s2i, hdr->rg_hash, rg);
1089 return k == kh_end(hdr->rg_hash)
1090 ? NULL
1091 : &hdr->rg[kh_val(hdr->rg_hash, k)];
1092 }
1093
1094
1095 /*
1096 * Fixes any PP links in @PG headers.
1097 * If the entries are in order then this doesn't need doing, but incase
1098 * our header is out of order this goes through the sh->pg[] array
1099 * setting the prev_id field.
1100 *
1101 * Note we can have multiple complete chains. This code should identify the
1102 * tails of these chains as these are the entries we have to link to in
1103 * subsequent PP records.
1104 *
1105 * Returns 0 on sucess
1106 * -1 on failure (indicating broken PG/PP records)
1107 */
sam_hdr_link_pg(SAM_hdr * hdr)1108 int sam_hdr_link_pg(SAM_hdr *hdr) {
1109 int i, j, ret = 0;
1110
1111 hdr->npg_end_alloc = hdr->npg;
1112 hdr->pg_end = realloc(hdr->pg_end, hdr->npg * sizeof(*hdr->pg_end));
1113 if (!hdr->pg_end)
1114 return -1;
1115
1116 for (i = 0; i < hdr->npg; i++)
1117 hdr->pg_end[i] = i;
1118
1119 for (i = 0; i < hdr->npg; i++) {
1120 khint_t k;
1121 SAM_hdr_tag *tag;
1122 char tmp;
1123
1124 for (tag = hdr->pg[i].tag; tag; tag = tag->next) {
1125 if (tag->str[0] == 'P' && tag->str[1] == 'P')
1126 break;
1127 }
1128 if (!tag) {
1129 /* Chain start points */
1130 continue;
1131 }
1132
1133 tmp = tag->str[tag->len]; tag->str[tag->len] = 0;
1134 k = kh_get(m_s2i, hdr->pg_hash, tag->str+3);
1135 tag->str[tag->len] = tmp;
1136
1137 if (k == kh_end(hdr->pg_hash)) {
1138 ret = -1;
1139 continue;
1140 }
1141
1142 hdr->pg[i].prev_id = hdr->pg[kh_val(hdr->pg_hash, k)].id;
1143 hdr->pg_end[kh_val(hdr->pg_hash, k)] = -1;
1144 }
1145
1146 for (i = j = 0; i < hdr->npg; i++) {
1147 if (hdr->pg_end[i] != -1)
1148 hdr->pg_end[j++] = hdr->pg_end[i];
1149 }
1150 hdr->npg_end = j;
1151
1152 return ret;
1153 }
1154
1155 /*
1156 * Returns a unique ID from a base name.
1157 *
1158 * The value returned is valid until the next call to
1159 * this function.
1160 */
sam_hdr_PG_ID(SAM_hdr * sh,const char * name)1161 const char *sam_hdr_PG_ID(SAM_hdr *sh, const char *name) {
1162 khint_t k = kh_get(m_s2i, sh->pg_hash, name);
1163 if (k == kh_end(sh->pg_hash))
1164 return name;
1165
1166 do {
1167 sprintf(sh->ID_buf, "%.1000s.%d", name, sh->ID_cnt++);
1168 k = kh_get(m_s2i, sh->pg_hash, sh->ID_buf);
1169 } while (k != kh_end(sh->pg_hash));
1170
1171 return sh->ID_buf;
1172 }
1173
1174 /*
1175 * Add an @PG line.
1176 *
1177 * If we wish complete control over this use sam_hdr_add() directly. This
1178 * function uses that, but attempts to do a lot of tedious house work for
1179 * you too.
1180 *
1181 * - It will generate a suitable ID if the supplied one clashes.
1182 * - It will generate multiple @PG records if we have multiple PG chains.
1183 *
1184 * Call it as per sam_hdr_add() with a series of key,value pairs ending
1185 * in NULL.
1186 *
1187 * Returns 0 on success
1188 * -1 on failure
1189 */
sam_hdr_add_PG(SAM_hdr * sh,const char * name,...)1190 int sam_hdr_add_PG(SAM_hdr *sh, const char *name, ...) {
1191 va_list args;
1192
1193 if (sh->npg_end) {
1194 /* Copy ends array to avoid us looping while modifying it */
1195 int *end = malloc(sh->npg_end * sizeof(int));
1196 int i, nends = sh->npg_end;
1197
1198 if (!end)
1199 return -1;
1200
1201 memcpy(end, sh->pg_end, nends * sizeof(*end));
1202
1203 for (i = 0; i < nends; i++) {
1204 va_start(args, name);
1205 if (-1 == sam_hdr_vadd(sh, "PG", args,
1206 "ID", sam_hdr_PG_ID(sh, name),
1207 "PN", name,
1208 "PP", sh->pg[end[i]].name,
1209 NULL)) {
1210 free(end);
1211 return -1;
1212 }
1213 va_end(args);
1214 }
1215
1216 free(end);
1217 } else {
1218 va_start(args, name);
1219 if (-1 == sam_hdr_vadd(sh, "PG", args,
1220 "ID", sam_hdr_PG_ID(sh, name),
1221 "PN", name,
1222 NULL))
1223 return -1;
1224 va_end(args);
1225 }
1226
1227 //sam_hdr_dump(sh);
1228
1229 return 0;
1230 }
1231
1232 /*
1233 * A function to help with construction of CL tags in @PG records.
1234 * Takes an argc, argv pair and returns a single space-separated string.
1235 * This string should be deallocated by the calling function.
1236 *
1237 * Returns malloced char * on success
1238 * NULL on failure
1239 */
stringify_argv(int argc,char * argv[])1240 char *stringify_argv(int argc, char *argv[]) {
1241 char *str, *cp;
1242 size_t nbytes = 1;
1243 int i, j;
1244
1245 /* Allocate */
1246 for (i = 0; i < argc; i++) {
1247 if (i > 0) nbytes += 1;
1248 nbytes += strlen(argv[i]);
1249 }
1250 if (!(str = malloc(nbytes)))
1251 return NULL;
1252
1253 /* Copy */
1254 cp = str;
1255 for (i = 0; i < argc; i++) {
1256 if (i > 0) *cp++ = ' ';
1257 j = 0;
1258 while (argv[i][j]) {
1259 if (argv[i][j] == '\t')
1260 *cp++ = ' ';
1261 else
1262 *cp++ = argv[i][j];
1263 j++;
1264 }
1265 }
1266 *cp++ = 0;
1267
1268 return str;
1269 }
1270