1 /*
2  * Copyright (c) 2013 Genome Research Ltd.
3  * Author(s): James Bonfield, Rob Davies
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *
8  *    1. Redistributions of source code must retain the above copyright notice,
9  *       this list of conditions and the following disclaimer.
10  *
11  *    2. Redistributions in binary form must reproduce the above
12  *       copyright notice, this list of conditions and the following
13  *       disclaimer in the documentation and/or other materials provided
14  *       with the distribution.
15  *
16  *    3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
17  *    Institute nor the names of its contributors may be used to endorse
18  *    or promote products derived from this software without specific
19  *    prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS
22  * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
24  * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH
25  * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 /*
35  * Author: James Bonfield, Wellcome Trust Sanger Institute. 2013
36  */
37 
38 #ifdef HAVE_CONFIG_H
39 #include "io_lib_config.h"
40 #endif
41 
42 #include <string.h>
43 #include <assert.h>
44 
45 #include "io_lib/sam_header.h"
46 #include "io_lib/string_alloc.h"
47 
48 #ifdef SAMTOOLS
49 #define sam_hdr_parse sam_hdr_parse_
50 #endif
51 
sam_hdr_error(char * msg,char * line,int len,int lno)52 static void sam_hdr_error(char *msg, char *line, int len, int lno) {
53     int j;
54 
55     for (j = 0; j < len && line[j] != '\n'; j++)
56 	;
57     fprintf(stderr, "%s at line %d: \"%.*s\"\n", msg, lno, j, line);
58 }
59 
sam_hdr_dump(SAM_hdr * hdr)60 void sam_hdr_dump(SAM_hdr *hdr) {
61     HashIter *iter = HashTableIterCreate();
62     HashItem *hi;
63     int i;
64 
65     printf("===DUMP===\n");
66     while ((hi = HashTableIterNext(hdr->h, iter))) {
67 	SAM_hdr_type *t1, *t2;
68 
69 	t1 = t2 = hi->data.p;
70 	printf("Type %.2s, count %d\n", hi->key, t1->prev->order+1);
71 
72 	do {
73 	    SAM_hdr_tag *tag;
74 	    printf(">>>%d ", t1->order);
75 	    for (tag = t1->tag; tag; tag=tag->next) {
76 		printf("\"%.2s\":\"%.*s\"\t",
77 		       tag->str, tag->len-3, tag->str+3);
78 	    }
79 	    putchar('\n');
80 	    t1 = t1->next;
81 	} while (t1 != t2);
82     }
83 
84     /* Dump out PG chains */
85     printf("\n@PG chains:\n");
86     for (i = 0; i < hdr->npg_end; i++) {
87 	int j;
88 	printf("  %d:", i);
89 	for (j = hdr->pg_end[i]; j != -1; j = hdr->pg[j].prev_id) {
90 	    printf("%s%d(%.*s)",
91 		   j == hdr->pg_end[i] ? " " : "->",
92 		   j, hdr->pg[j].name_len, hdr->pg[j].name);
93 	}
94 	printf("\n");
95     }
96 
97     puts("===END DUMP===");
98 
99     HashTableIterDestroy(iter);
100 }
101 
102 /* Updates the HashTables in the SAM_hdr structure.
103  *
104  * Returns 0 on success;
105  *        -1 on failure
106  */
sam_hdr_update_hashes(SAM_hdr * sh,const char * type,SAM_hdr_type * h_type)107 static int sam_hdr_update_hashes(SAM_hdr *sh,
108 				 const char *type,
109 				 SAM_hdr_type *h_type) {
110     /* Add to reference hash? */
111     if (type[0] == 'S' && type[1] == 'Q') {
112 	SAM_hdr_tag *tag;
113 	int nref = sh->nref;
114 
115 	sh->ref = realloc(sh->ref, (sh->nref+1)*sizeof(*sh->ref));
116 	if (!sh->ref)
117 	    return -1;
118 
119 	tag = h_type->tag;
120 	sh->ref[nref].name = NULL;
121 	sh->ref[nref].len  = 0;
122 	sh->ref[nref].ty = h_type;
123 	sh->ref[nref].tag  = tag;
124 
125 	while (tag) {
126 	    if (tag->str[0] == 'S' && tag->str[1] == 'N') {
127 		if (!(sh->ref[nref].name = malloc(tag->len)))
128 		    return -1;
129 		strncpy(sh->ref[nref].name, tag->str+3, tag->len-3);
130 		sh->ref[nref].name[tag->len-3] = 0;
131 	    } else if (tag->str[0] == 'L' && tag->str[1] == 'N') {
132 		sh->ref[nref].len = atoi(tag->str+3);
133 	    }
134 	    tag = tag->next;
135 	}
136 
137 	if (sh->ref[nref].name) {
138 	    HashData hd;
139 	    hd.i = nref;
140 	    if (!HashTableAdd(sh->ref_hash, sh->ref[nref].name, 0, hd, NULL))
141 		return -1;
142 	}
143 
144 	sh->nref++;
145     }
146 
147     /* Add to read-group hash? */
148     if (type[0] == 'R' && type[1] == 'G') {
149 	SAM_hdr_tag *tag;
150 	int nrg = sh->nrg;
151 
152 	sh->rg = realloc(sh->rg, (sh->nrg+1)*sizeof(*sh->rg));
153 	if (!sh->rg)
154 	    return -1;
155 
156 	tag = h_type->tag;
157 	sh->rg[nrg].name = NULL;
158 	sh->rg[nrg].name_len = 0;
159 	sh->rg[nrg].ty   = h_type;
160 	sh->rg[nrg].tag  = tag;
161 	sh->rg[nrg].id   = nrg;
162 
163 	while (tag) {
164 	    if (tag->str[0] == 'I' && tag->str[1] == 'D') {
165 		if (!(sh->rg[nrg].name = malloc(tag->len)))
166 		    return -1;
167 		strncpy(sh->rg[nrg].name, tag->str+3, tag->len-3);
168 		sh->rg[nrg].name[tag->len-3] = 0;
169 		sh->rg[nrg].name_len = strlen(sh->rg[nrg].name);
170 	    }
171 	    tag = tag->next;
172 	}
173 
174 	if (sh->rg[nrg].name) {
175 	    HashData hd;
176 	    hd.i = nrg;
177 	    if (!HashTableAdd(sh->rg_hash, sh->rg[nrg].name, 0, hd, NULL))
178 		return -1;
179 	}
180 
181 	sh->nrg++;
182     }
183 
184     /* Add to program hash? */
185     if (type[0] == 'P' && type[1] == 'G') {
186 	SAM_hdr_tag *tag;
187 	int npg = sh->npg;
188 
189 	sh->pg = realloc(sh->pg, (sh->npg+1)*sizeof(*sh->pg));
190 	if (!sh->pg)
191 	    return -1;
192 
193 	tag = h_type->tag;
194 	sh->pg[npg].name = NULL;
195 	sh->pg[npg].name_len = 0;
196 	sh->pg[npg].ty  = h_type;
197 	sh->pg[npg].tag  = tag;
198 	sh->pg[npg].id   = npg;
199 	sh->pg[npg].prev_id = -1;
200 
201 	while (tag) {
202 	    if (tag->str[0] == 'I' && tag->str[1] == 'D') {
203 		if (!(sh->pg[npg].name = malloc(tag->len)))
204 		    return -1;
205 		strncpy(sh->pg[npg].name, tag->str+3, tag->len-3);
206 		sh->pg[npg].name[tag->len-3] = 0;
207 		sh->pg[npg].name_len = strlen(sh->pg[npg].name);
208 	    } else if (tag->str[0] == 'P' && tag->str[1] == 'P') {
209 		// Resolve later if needed
210 		HashItem *hi = HashTableSearch(sh->pg_hash,
211 					       tag->str+3,
212 					       tag->len-3);
213 		if (hi) {
214 		    sh->pg[npg].prev_id = sh->pg[hi->data.i].id;
215 
216 		    /* Unmark previous entry as a PG termination */
217 		    if (sh->npg_end > 0 &&
218 			sh->pg_end[sh->npg_end-1] == hi->data.i) {
219 			sh->npg_end--;
220 		    } else {
221 			int i;
222 			for (i = 0; i < sh->npg_end; i++) {
223 			    if (sh->pg_end[i] == hi->data.i) {
224 				memmove(&sh->pg_end[i], &sh->pg_end[i+1],
225 					(sh->npg_end-i-1)*sizeof(*sh->pg_end));
226 				sh->npg_end--;
227 			    }
228 			}
229 		    }
230 		} else {
231 		    sh->pg[npg].prev_id = -1;
232 		}
233 	    }
234 	    tag = tag->next;
235 	}
236 
237 	if (sh->pg[npg].name) {
238 	    HashData hd;
239 	    hd.i = npg;
240 	    if (!HashTableAdd(sh->pg_hash, sh->pg[npg].name, 0, hd, NULL))
241 		return -1;
242 	}
243 
244 	/* Add to npg_end[] array. Remove later if we find a PP line */
245 	if (sh->npg_end >= sh->npg_end_alloc) {
246 	    sh->npg_end_alloc = sh->npg_end_alloc
247 		? sh->npg_end_alloc*2
248 		: 4;
249 	    sh->pg_end = realloc(sh->pg_end,
250 				 sh->npg_end_alloc * sizeof(int));
251 	    if (!sh->pg_end)
252 		return -1;
253 	}
254 	sh->pg_end[sh->npg_end++] = npg;
255 
256 	sh->npg++;
257     }
258 
259     return 0;
260 }
261 
262 /*
263  * Appends a formatted line to an existing SAM header.
264  * Line is a full SAM header record, eg "@SQ\tSN:foo\tLN:100", with
265  * optional new-line. If it contains more than 1 line then multiple lines
266  * will be added in order.
267  *
268  * Input text is of maximum length len or as terminated earlier by a NUL.
269  * Len may be 0 if unknown, in which case lines must be NUL-terminated.
270  *
271  * Returns 0 on success
272  *        -1 on failure
273  */
sam_hdr_add_lines(SAM_hdr * sh,const char * lines,int len)274 int sam_hdr_add_lines(SAM_hdr *sh, const char *lines, int len) {
275     int i, lno, text_offset;
276     HashItem *hi;
277     HashData hd;
278     char *hdr;
279 
280     if (!len)
281 	len = strlen(lines);
282 
283     text_offset = DSTRING_LEN(sh->text);
284     dstring_nappend(sh->text, lines, len);
285     hdr = DSTRING_STR(sh->text) + text_offset;
286 
287     for (i = 0, lno = 1; i < len && hdr[i] != '\0'; i++, lno++) {
288 	char *type;
289 	int l_start = i, new;
290 	SAM_hdr_type *h_type;
291 	SAM_hdr_tag *h_tag, *last;
292 
293 	if (hdr[i] != '@') {
294 	    int j;
295 	    for (j = i; j < len && hdr[j] != '\0' && hdr[j] != '\n'; j++)
296 		;
297 	    sam_hdr_error("Header line does not start with '@'",
298 			  &hdr[l_start], len - l_start, lno);
299 	    return -1;
300 	}
301 
302 	type = &hdr[i+1];
303         if (len - i < 3 ||
304             type[0] < 'A' || type[0] > 'z' ||
305 	    type[1] < 'A' || type[1] > 'z') {
306 	    sam_hdr_error("Header line does not have a two character key",
307 			  &hdr[l_start], len - l_start, lno);
308 	    return -1;
309 	}
310 
311 	i += 3;
312         if (i >= len || hdr[i] == '\n')
313 	    continue;
314 
315 	// Add the header line type
316 	if (!(h_type = pool_alloc(sh->type_pool)))
317 	    return -1;
318 	hd.p = h_type;
319 	if (!(hi = HashTableAdd(sh->h, type, 2, hd, &new)))
320 	    return -1;
321 
322 	// Form the ring, either with self or other lines of this type
323 	if (!new) {
324 	    SAM_hdr_type *t = hi->data.p, *p;
325 	    p = t->prev;
326 
327 	    assert(p->next == t);
328 	    p->next = h_type;
329 	    h_type->prev = p;
330 
331 	    t->prev = h_type;
332 	    h_type->next = t;
333 	    h_type->order = p->order+1;
334 	} else {
335 	    h_type->prev = h_type->next = h_type;
336 	    h_type->order = 0;
337 	    if (!(type[0] == 'H' && type[1] == 'D')) {
338 		// Whenever new type is added to hashtable record in
339 		// type_order unless HD, which should always be first.
340 		dstring_nappend(sh->type_order, type, 2);
341 		sh->ntypes++;
342 	    }
343 	}
344 
345 	// Parse the tags on this line
346 	last = NULL;
347 	if (type[0] == 'C' && type[1] == 'O') {
348 	    int j;
349 	    if (hdr[i] != '\t') {
350 		sam_hdr_error("Missing tab",
351 			      &hdr[l_start], len - l_start, lno);
352 		return -1;
353 	    }
354 
355 	    for (j = ++i; j < len && hdr[j] != '\0' && hdr[j] != '\n'; j++)
356 		;
357 
358 	    if (!(h_type->tag = h_tag = pool_alloc(sh->tag_pool)))
359 		return -1;
360 	    h_tag->str = string_ndup(sh->str_pool, &hdr[i], j-i);
361 	    h_tag->len = j-i;
362 	    h_tag->next = NULL;
363 	    if (!h_tag->str)
364 		return -1;
365 
366 	    i = j;
367 
368 	} else {
369 	    do {
370 		int j;
371 		if (hdr[i] != '\t') {
372 		    sam_hdr_error("Missing tab",
373 				  &hdr[l_start], len - l_start, lno);
374 		    return -1;
375 		}
376 
377 		for (j = ++i;
378 		     j < len && hdr[j] != '\0' &&
379 			        hdr[j] != '\n' && hdr[j] != '\t';
380 		     j++)
381 		    ;
382 
383 		if (!(h_tag = pool_alloc(sh->tag_pool)))
384 		    return -1;
385 		h_tag->str = string_ndup(sh->str_pool, &hdr[i], j-i);
386 		h_tag->len = j-i;
387 		h_tag->next = NULL;
388 		if (!h_tag->str)
389 		    return -1;
390 
391 		if (h_tag->len < 3 || h_tag->str[2] != ':') {
392 		    sam_hdr_error("Malformed key:value pair",
393 				  &hdr[l_start], len - l_start, lno);
394 		    return -1;
395 		}
396 
397 		if (last)
398 		    last->next = h_tag;
399 		else
400 		    h_type->tag = h_tag;
401 
402 		last = h_tag;
403 		i = j;
404 	    } while (i < len && hdr[i] != '\0' && hdr[i] != '\n');
405 	}
406 
407 	/* Update RG/SQ hashes */
408 	if (-1 == sam_hdr_update_hashes(sh, type, h_type))
409 	    return -1;
410     }
411 
412     return 0;
413 }
414 
415 /*
416  * Adds a single line to a SAM header.
417  * Specify type and one or more key,value pairs, ending with the NULL key.
418  * Eg. sam_hdr_add(h, "SQ", "ID", "foo", "LN", "100", NULL).
419  *
420  * Returns index for specific entry on success (eg 2nd SQ, 4th RG)
421  *        -1 on failure
422  */
sam_hdr_add(SAM_hdr * sh,const char * type,...)423 int sam_hdr_add(SAM_hdr *sh, const char *type, ...) {
424     va_list args;
425     va_start(args, type);
426     return sam_hdr_vadd(sh, type, args, NULL);
427 }
428 
sam_hdr_vadd(SAM_hdr * sh,const char * type,va_list ap,...)429 int sam_hdr_vadd(SAM_hdr *sh, const char *type, va_list ap, ...) {
430     va_list args;
431     HashItem *hi;
432     HashData hd;
433     SAM_hdr_type *h_type;
434     SAM_hdr_tag *h_tag, *last;
435     int new;
436 
437 #if defined(HAVE_VA_COPY)
438     va_list ap_local;
439 #endif
440 
441     if (-1 == dstring_append_char(sh->text, '@'))
442 	return -1;
443     if (-1 == dstring_nappend(sh->text, type, 2))
444 	return -1;
445 
446     if (!(h_type = pool_alloc(sh->type_pool)))
447 	return -1;
448     hd.p = h_type;
449     if (!(hi = HashTableAdd(sh->h, (char *)type, 2, hd, &new)))
450 	return -1;
451 
452     // Form the ring, either with self or other lines of this type
453     if (!new) {
454 	SAM_hdr_type *t = hi->data.p, *p;
455 	p = t->prev;
456 
457 	assert(p->next = t);
458 	p->next = h_type;
459 	h_type->prev = p;
460 
461 	t->prev = h_type;
462 	h_type->next = t;
463 	h_type->order = p->order + 1;
464     } else {
465 	if (!(type[0] == 'H' && type[1] == 'D')) {
466 	    // Whenever new type is added to hashtable record in
467 	    // type_order unless HD, which should always be first.
468 	    dstring_nappend(sh->type_order, type, 2);
469 	    sh->ntypes++;
470 	}
471 	h_type->prev = h_type->next = h_type;
472 	h_type->order = 0;
473     }
474 
475     last = NULL;
476 
477     // Any ... varargs
478     va_start(args, ap);
479     for (;;) {
480 	char *k, *v;
481 	int idx;
482 
483 	if (!(k = (char *)va_arg(args, char *)))
484 	    break;
485 	v = va_arg(args, char *);
486 
487 	if (-1 == dstring_append_char(sh->text, '\t'))
488 	    return -1;
489 
490 	if (!(h_tag = pool_alloc(sh->tag_pool)))
491 	    return -1;
492 	idx = DSTRING_LEN(sh->text);
493 
494 	if (-1 == dstring_append(sh->text, k))
495 	    return -1;
496 	if (-1 == dstring_append_char(sh->text, ':'))
497 	    return -1;
498 	if (-1 == dstring_append(sh->text, v))
499 	    return -1;
500 
501 	h_tag->len = DSTRING_LEN(sh->text) - idx;
502 	h_tag->str = string_ndup(sh->str_pool,
503 				 DSTRING_STR(sh->text) + idx,
504 				 h_tag->len);
505 	h_tag->next = NULL;
506 	if (!h_tag->str)
507 	    return -1;
508 
509 	if (last)
510 	    last->next = h_tag;
511 	else
512 	    h_type->tag = h_tag;
513 
514 	last = h_tag;
515     }
516     va_end(args);
517 
518 #if defined(HAVE_VA_COPY)
519     va_copy(ap_local, ap);
520 #   define ap ap_local
521 #endif
522 
523     // Plus the specified va_list params
524     for (;;) {
525 	char *k, *v;
526 	int idx;
527 
528 	if (!(k = (char *)va_arg(ap, char *)))
529 	    break;
530 	v = va_arg(ap, char *);
531 
532 	if (-1 == dstring_append_char(sh->text, '\t'))
533 	    return -1;
534 
535 	if (!(h_tag = pool_alloc(sh->tag_pool)))
536 	    return -1;
537 	idx = DSTRING_LEN(sh->text);
538 
539 	if (-1 == dstring_append(sh->text, k))
540 	    return -1;
541 	if (-1 == dstring_append_char(sh->text, ':'))
542 	    return -1;
543 	if (-1 == dstring_append(sh->text, v))
544 	    return -1;
545 
546 	h_tag->len = DSTRING_LEN(sh->text) - idx;
547 	h_tag->str = string_ndup(sh->str_pool,
548 				 DSTRING_STR(sh->text) + idx,
549 				 h_tag->len);
550 	h_tag->next = NULL;
551 	if (!h_tag->str)
552 	    return -1;
553 
554 	if (last)
555 	    last->next = h_tag;
556 	else
557 	    h_type->tag = h_tag;
558 
559 	last = h_tag;
560     }
561     va_end(ap);
562 
563     if (-1 == dstring_append_char(sh->text, '\n'))
564 	return -1;
565 
566     if (-1 == sam_hdr_update_hashes(sh, type, h_type))
567 	return -1;
568 
569     return h_type->order;
570 }
571 
572 /*
573  * Returns the first header item matching 'type'. If ID is non-NULL it checks
574  * for the tag ID: and compares against the specified ID.
575  *
576  * Returns NULL if no type/ID is found
577  */
sam_hdr_find(SAM_hdr * hdr,char * type,char * ID_key,char * ID_value)578 SAM_hdr_type *sam_hdr_find(SAM_hdr *hdr, char *type,
579 			   char *ID_key, char *ID_value) {
580     HashItem *hi;
581     SAM_hdr_type *t1, *t2;
582 
583     /* Special case for types we have prebuilt hashes on */
584     if (ID_key) {
585 	if (type[0]   == 'S' && type[1]   == 'Q' &&
586 	    ID_key[0] == 'S' && ID_key[1] == 'N') {
587 	    hi = HashTableSearch(hdr->ref_hash, ID_value, strlen(ID_value));
588 	    return hi ? hdr->ref[hi->data.i].ty : NULL;
589 	}
590 
591 	if (type[0]   == 'R' && type[1]   == 'G' &&
592 	    ID_key[0] == 'I' && ID_key[1] == 'D') {
593 	    hi = HashTableSearch(hdr->rg_hash, ID_value, strlen(ID_value));
594 	    return hi ? hdr->rg[hi->data.i].ty : NULL;
595 	}
596 
597 	if (type[0]   == 'P' && type[1]   == 'G' &&
598 	    ID_key[0] == 'I' && ID_key[1] == 'D') {
599 	    hi = HashTableSearch(hdr->pg_hash, ID_value, strlen(ID_value));
600 	    return hi ? hdr->pg[hi->data.i].ty : NULL;
601 	}
602     }
603 
604     if (!(hi = HashTableSearch(hdr->h, type, 2)))
605 	return NULL;
606 
607     if (!ID_key)
608 	return hi->data.p;
609 
610     t1 = t2 = hi->data.p;
611     do {
612 	SAM_hdr_tag *tag;
613 	for (tag = t1->tag; tag; tag = tag->next) {
614 	    if (tag->str[0] == ID_key[0] && tag->str[1] == ID_key[1]) {
615 		char *cp1 = tag->str+3;
616 		char *cp2 = ID_value;
617 		while (*cp1 && *cp1 == *cp2)
618 		    cp1++, cp2++;
619 		if (*cp2 || *cp1)
620 		    continue;
621 		return t1;
622 	    }
623 	}
624 	t1 = t1->next;
625     } while (t1 != t2);
626 
627     return NULL;
628 }
629 
630 /*
631  * As per SAM_hdr_type, but returns a complete line of formatted text
632  * for a specific head type/ID combination. If ID is NULL then it returns
633  * the first line of the specified type.
634  *
635  * The returned string is malloced and should be freed by the calling
636  * function with free().
637  *
638  * Returns NULL if no type/ID is found.
639  */
sam_hdr_find_line(SAM_hdr * hdr,char * type,char * ID_key,char * ID_value)640 char *sam_hdr_find_line(SAM_hdr *hdr, char *type,
641 			char *ID_key, char *ID_value) {
642     SAM_hdr_type *ty = sam_hdr_find(hdr, type, ID_key, ID_value);
643     dstring_t *ds;
644     SAM_hdr_tag *tag;
645     char *str = dstring_str(hdr->text);
646     int r = 0;
647 
648     if (!ty)
649 	return NULL;
650 
651     if (NULL == (ds = dstring_create(NULL)))
652 	return NULL;
653 
654     // Paste together the line from the hashed copy
655     r |= dstring_append_char(ds, '@');
656     r |= dstring_append(ds, type);
657     for (tag = ty->tag; tag; tag = tag->next) {
658 	r |= dstring_append_char(ds, '\t');
659 	r |= dstring_nappend(ds, tag->str, tag->len);
660     }
661 
662     if (r) {
663 	dstring_destroy(ds);
664 	return NULL;
665     }
666 
667     // Steal the dstring copy and return that.
668     str = DSTRING_STR(ds);
669     DSTRING_STR(ds) = NULL;
670     dstring_destroy(ds);
671 
672     return str;
673 }
674 
675 
676 /*
677  * Looks for a specific key in a single sam header line.
678  * If prev is non-NULL it also fills this out with the previous tag, to
679  * permit use in key removal. *prev is set to NULL when the tag is the first
680  * key in the list. When a tag isn't found, prev (if non NULL) will be the last
681  * tag in the existing list.
682  *
683  * Returns the tag pointer on success
684  *         NULL on failure
685  */
sam_hdr_find_key(SAM_hdr * sh,SAM_hdr_type * type,char * key,SAM_hdr_tag ** prev)686 SAM_hdr_tag *sam_hdr_find_key(SAM_hdr *sh,
687 			      SAM_hdr_type *type,
688 			      char *key,
689 			      SAM_hdr_tag **prev) {
690     SAM_hdr_tag *tag, *p = NULL;
691 
692     for (tag = type->tag; tag; p = tag, tag = tag->next) {
693 	if (tag->str[0] == key[0] && tag->str[1] == key[1]) {
694 	    if (prev)
695 		*prev = p;
696 	    return tag;
697 	}
698     }
699 
700     if (prev)
701 	*prev = p;
702 
703     return NULL;
704 }
705 
706 
707 /*
708  * Adds or updates tag key,value pairs in a header line.
709  * Eg for adding M5 tags to @SQ lines or updating sort order for the
710  * @HD line (although use the sam_hdr_sort_order() function for
711  * HD manipulation, which is a wrapper around this funuction).
712  *
713  * Specify multiple key,value pairs ending in NULL.
714  *
715  * Returns 0 on success
716  *        -1 on failure
717  */
sam_hdr_update(SAM_hdr * hdr,SAM_hdr_type * type,...)718 int sam_hdr_update(SAM_hdr *hdr, SAM_hdr_type *type, ...) {
719     va_list ap;
720 
721     va_start(ap, type);
722 
723     for (;;) {
724 	char *k, *v;
725 	int idx;
726 	SAM_hdr_tag *tag, *prev;
727 
728 	if (!(k = (char *)va_arg(ap, char *)))
729 	    break;
730 	v = va_arg(ap, char *);
731 
732 	tag = sam_hdr_find_key(hdr, type, k, &prev);
733 	if (!tag) {
734 	    if (!(tag = pool_alloc(hdr->tag_pool)))
735 		return -1;
736 	    if (prev)
737 		prev->next = tag;
738 	    else
739 		type->tag = tag;
740 
741 	    tag->next = NULL;
742 	}
743 
744 	idx = DSTRING_LEN(hdr->text);
745 	if (0 != dstring_appendf(hdr->text, "%2.2s:%s", k, v))
746 	    return -1;
747 	tag->len = DSTRING_LEN(hdr->text) - idx;
748 	tag->str = string_ndup(hdr->str_pool,
749 			       DSTRING_STR(hdr->text) + idx,
750 			       tag->len);
751 	if (!tag->str)
752 	    return -1;
753     }
754 
755     va_end(ap);
756 
757     return 0;
758 }
759 
760 
761 /*
762  * Returns the sort order:
763  */
sam_hdr_sort_order(SAM_hdr * hdr)764 enum sam_sort_order sam_hdr_sort_order(SAM_hdr *hdr) {
765     return hdr->sort_order;
766 }
767 
sam_hdr_parse_sort_order(SAM_hdr * hdr)768 static enum sam_sort_order sam_hdr_parse_sort_order(SAM_hdr *hdr) {
769     HashItem *hi;
770     enum sam_sort_order so;
771 
772     so = ORDER_UNKNOWN;
773     if ((hi = HashTableSearch(hdr->h, "HD", 2))) {
774 	SAM_hdr_type *ty = hi->data.p;
775 	SAM_hdr_tag *tag;
776         for (tag = ty->tag; tag; tag = tag->next) {
777 	    if (tag->str[0] == 'S' && tag->str[1] == 'O') {
778 		if (strcmp(tag->str+3, "unsorted") == 0)
779 		    so = ORDER_UNSORTED;
780 		else if (strcmp(tag->str+3, "queryname") == 0)
781 		    so = ORDER_NAME;
782 		else if (strcmp(tag->str+3, "coordinate") == 0)
783 		    so = ORDER_COORD;
784 		else if (strcmp(tag->str+3, "unknown") == 0)
785 		    so = ORDER_UNKNOWN;
786 		else
787 		    fprintf(stderr, "Unknown sort order field: %s\n",
788 			    tag->str+3);
789 	    }
790 	}
791     }
792 
793     return so;
794 }
795 
796 
797 /*
798  * Reconstructs the dstring from the header hash table.
799  * Returns 0 on success
800  *        -1 on failure
801  */
sam_hdr_rebuild(SAM_hdr * hdr)802 int sam_hdr_rebuild(SAM_hdr *hdr) {
803     /* Order: HD then others */
804     HashItem *hi;
805     HashIter *iter = HashTableIterCreate();
806     dstring_t *ds = dstring_create(NULL);
807 
808     char *type_order = DSTRING_STR(hdr->type_order);
809     int i, ntypes = hdr->ntypes;
810 
811     if (!iter || !ds)
812 	return -1;
813 
814     if ((hi = HashTableSearch(hdr->h, "HD", 2))) {
815 	SAM_hdr_type *ty = hi->data.p;
816 	SAM_hdr_tag *tag;
817 	if (-1 == dstring_append(ds, "@HD"))
818 	    return -1;
819 	for (tag = ty->tag; tag; tag = tag->next) {
820 	    if (-1 == dstring_append_char(ds, '\t'))
821 		return -1;
822 	    if (-1 == dstring_nappend(ds, tag->str, tag->len))
823 		return -1;
824 	}
825 	if (-1 == dstring_append_char(ds, '\n'))
826 	    return -1;
827     }
828 
829     for (i = 0; i < ntypes; i++) {
830 	// output types according to type_order
831 	if ((hi = HashTableSearch(hdr->h, type_order+i*2, 2))) {
832 	    SAM_hdr_type *t1, *t2;
833 	    if (hi->key[0] == 'H' && hi->key[1] == 'D')
834 		continue;
835 	    t1 = t2 = hi->data.p;
836 	    do {
837 		SAM_hdr_tag *tag;
838 		if (-1 == dstring_append_char(ds, '@'))
839 		    return -1;
840 		if (-1 == dstring_nappend(ds, hi->key, 2))
841 		    return -1;
842 		for (tag = t1->tag; tag; tag=tag->next) {
843 		    if (-1 == dstring_append_char(ds, '\t'))
844 			return -1;
845 		    if (-1 == dstring_nappend(ds, tag->str, tag->len))
846 			return -1;
847 		}
848 		if (-1 == dstring_append_char(ds, '\n'))
849 		    return -1;
850 		t1 = t1->next;
851 	    } while (t1 != t2);
852 	}
853     }
854 
855     // The above loop will have found all header entries as we cannot add to
856     // hdr->h without also adding to the type_order / ntypes.
857 
858     HashTableIterDestroy(iter);
859 
860     dstring_destroy(hdr->text);
861     hdr->text = ds;
862 
863     return 0;
864 }
865 
866 
867 /*
868  * Creates an empty SAM header, ready to be populated.
869  *
870  * Returns a SAM_hdr struct on success (free with sam_hdr_free())
871  *         NULL on failure
872  */
sam_hdr_new()873 SAM_hdr *sam_hdr_new() {
874     SAM_hdr *sh = calloc(1, sizeof(*sh));
875 
876     if (!sh)
877 	return NULL;
878 
879     sh->ntypes = 0;
880     if (!(sh->type_order = dstring_create(NULL)))
881 	goto err;
882 
883     sh->h = HashTableCreate(16, HASH_FUNC_HSIEH |
884 			    HASH_DYNAMIC_SIZE);
885     if (!sh->h)
886 	goto err;
887 
888     sh->ID_cnt = 1;
889     sh->ref_count = 1;
890 
891     sh->nref = 0;
892     sh->ref  = NULL;
893     if (!(sh->ref_hash = HashTableCreate(16, HASH_FUNC_HSIEH |
894 					 HASH_DYNAMIC_SIZE |
895 					 HASH_NONVOLATILE_KEYS)))
896 	goto err;
897 
898     sh->nrg = 0;
899     sh->rg  = NULL;
900     if (!(sh->rg_hash = HashTableCreate(16, HASH_FUNC_HSIEH |
901 					HASH_DYNAMIC_SIZE |
902 					HASH_NONVOLATILE_KEYS)))
903 	goto err;
904 
905     sh->npg = 0;
906     sh->pg  = NULL;
907     sh->npg_end = sh->npg_end_alloc = 0;
908     sh->pg_end = NULL;
909     if (!(sh->pg_hash = HashTableCreate(16, HASH_FUNC_HSIEH |
910 					HASH_DYNAMIC_SIZE |
911 					HASH_NONVOLATILE_KEYS)))
912 	goto err;
913 
914     if (!(sh->text = dstring_create(NULL)))
915 	goto err;
916 
917     if (!(sh->tag_pool = pool_create(sizeof(SAM_hdr_tag))))
918 	goto err;
919 
920     if (!(sh->type_pool = pool_create(sizeof(SAM_hdr_type))))
921 	goto err;
922 
923     if (!(sh->str_pool = string_pool_create(8192)))
924 	goto err;
925 
926     return sh;
927 
928  err:
929     if (sh->type_order)
930 	dstring_destroy(sh->type_order);
931 
932     if (sh->h)
933 	HashTableDestroy(sh->h, 0);
934 
935     if (sh->tag_pool)
936 	pool_destroy(sh->tag_pool);
937 
938     if (sh->type_pool)
939 	pool_destroy(sh->type_pool);
940 
941     if (sh->str_pool)
942 	string_pool_destroy(sh->str_pool);
943 
944     free(sh);
945 
946     return NULL;
947 }
948 
949 
950 /*
951  * Tokenises a SAM header into a hash table.
952  * Also extracts a few bits on specific data types, such as @RG lines.
953  *
954  * Returns a SAM_hdr struct on success (free with sam_hdr_free())
955  *         NULL on failure
956  */
sam_hdr_parse(const char * hdr,int len)957 SAM_hdr *sam_hdr_parse(const char *hdr, int len) {
958     /* Make an empty SAM_hdr */
959     SAM_hdr *sh;
960 
961     sh = sam_hdr_new();
962     if (NULL == sh) return NULL;
963 
964     if (NULL == hdr) return sh; // empty header is permitted
965 
966     /* Parse the header, line by line */
967     if (-1 == sam_hdr_add_lines(sh, hdr, len)) {
968 	sam_hdr_free(sh);
969 	return NULL;
970     }
971 
972     /* Obtain sort order */
973     sh->sort_order = sam_hdr_parse_sort_order(sh);
974 
975     //sam_hdr_dump(sh);
976     //sam_hdr_add(sh, "RG", "ID", "foo", "SM", "bar", NULL);
977     //sam_hdr_rebuild(sh);
978     //printf(">>%s<<", DSTRING_STR(sh->text));
979 
980     //parse_references(sh);
981     //parse_read_groups(sh);
982 
983     sam_hdr_link_pg(sh);
984     //sam_hdr_dump(sh);
985 
986     return sh;
987 }
988 
989 /*
990  * Produces a duplicate copy of hdr and returns it.
991  * Returns NULL on failure
992  */
sam_hdr_dup(SAM_hdr * hdr)993 SAM_hdr *sam_hdr_dup(SAM_hdr *hdr) {
994     if (-1 == sam_hdr_rebuild(hdr))
995 	return NULL;
996 
997     return sam_hdr_parse(sam_hdr_str(hdr), sam_hdr_length(hdr));
998 }
999 
1000 /*! Increments a reference count on hdr.
1001  *
1002  * This permits multiple files to share the same header, all calling
1003  * sam_hdr_free when done, without causing errors for other open  files.
1004  */
sam_hdr_incr_ref(SAM_hdr * hdr)1005 void sam_hdr_incr_ref(SAM_hdr *hdr) {
1006     hdr->ref_count++;
1007 }
1008 
1009 /*! Increments a reference count on hdr.
1010  *
1011  * This permits multiple files to share the same header, all calling
1012  * sam_hdr_free when done, without causing errors for other open  files.
1013  *
1014  * If the reference count hits zero then the header is automatically
1015  * freed. This makes it a synonym for sam_hdr_free().
1016  */
sam_hdr_decr_ref(SAM_hdr * hdr)1017 void sam_hdr_decr_ref(SAM_hdr *hdr) {
1018     sam_hdr_free(hdr);
1019 }
1020 
1021 /*! Deallocates all storage used by a SAM_hdr struct.
1022  *
1023  * This also decrements the header reference count. If after decrementing
1024  * it is still non-zero then the header is assumed to be in use by another
1025  * caller and the free is not done.
1026  *
1027  * This is a synonym for sam_hdr_dec_ref().
1028  */
sam_hdr_free(SAM_hdr * hdr)1029 void sam_hdr_free(SAM_hdr *hdr) {
1030     if (!hdr)
1031 	return;
1032 
1033     if (--hdr->ref_count > 0)
1034 	return;
1035 
1036     if (hdr->type_order)
1037 	dstring_destroy(hdr->type_order);
1038 
1039     if (hdr->text)
1040 	dstring_destroy(hdr->text);
1041 
1042     if (hdr->h)
1043 	HashTableDestroy(hdr->h, 0);
1044 
1045     if (hdr->ref_hash)
1046 	HashTableDestroy(hdr->ref_hash, 0);
1047 
1048     if (hdr->ref) {
1049 	int i;
1050 	for (i = 0; i < hdr->nref; i++)
1051 	    if (hdr->ref[i].name)
1052 		free(hdr->ref[i].name);
1053 	free(hdr->ref);
1054     }
1055 
1056     if (hdr->rg_hash)
1057 	HashTableDestroy(hdr->rg_hash, 0);
1058 
1059     if (hdr->rg) {
1060 	int i;
1061 	for (i = 0; i < hdr->nrg; i++)
1062 	    if (hdr->rg[i].name)
1063 		free(hdr->rg[i].name);
1064 	free(hdr->rg);
1065     }
1066 
1067     if (hdr->pg_hash)
1068 	HashTableDestroy(hdr->pg_hash, 0);
1069 
1070     if (hdr->pg) {
1071 	int i;
1072 	for (i = 0; i < hdr->npg; i++)
1073 	    if (hdr->pg[i].name)
1074 		free(hdr->pg[i].name);
1075 	free(hdr->pg);
1076     }
1077 
1078     if (hdr->pg_end)
1079 	free(hdr->pg_end);
1080 
1081     if (hdr->type_pool)
1082 	pool_destroy(hdr->type_pool);
1083 
1084     if (hdr->tag_pool)
1085 	pool_destroy(hdr->tag_pool);
1086 
1087     if (hdr->str_pool)
1088 	string_pool_destroy(hdr->str_pool);
1089 
1090     free(hdr);
1091 }
1092 
sam_hdr_length(SAM_hdr * hdr)1093 int sam_hdr_length(SAM_hdr *hdr) {
1094     return dstring_length(hdr->text);
1095 }
1096 
sam_hdr_str(SAM_hdr * hdr)1097 char *sam_hdr_str(SAM_hdr *hdr) {
1098     return dstring_str(hdr->text);
1099 }
1100 
1101 /*
1102  * Looks up a reference sequence by name and returns the numerical ID.
1103  * Returns -1 if unknown reference.
1104  */
sam_hdr_name2ref(SAM_hdr * hdr,char * ref)1105 int sam_hdr_name2ref(SAM_hdr *hdr, char *ref) {
1106     HashItem *hi = HashTableSearch(hdr->ref_hash, ref, strlen(ref));
1107     return hi ? hi->data.i : -1;
1108 }
1109 
1110 /*
1111  * Looks up a read-group by name and returns a pointer to the start of the
1112  * associated tag list.
1113  *
1114  * Returns NULL on failure
1115  */
sam_hdr_find_rg(SAM_hdr * hdr,char * rg)1116 SAM_RG *sam_hdr_find_rg(SAM_hdr *hdr, char *rg) {
1117     HashItem *hi = HashTableSearch(hdr->rg_hash, rg, 0);
1118     return hi ? &hdr->rg[hi->data.i] : NULL;
1119 }
1120 
1121 
1122 /*
1123  * Fixes any PP links in @PG headers.
1124  * If the entries are in order then this doesn't need doing, but incase
1125  * our header is out of order this goes through the sh->pg[] array
1126  * setting the prev_id field.
1127  *
1128  * Note we can have multiple complete chains. This code should identify the
1129  * tails of these chains as these are the entries we have to link to in
1130  * subsequent PP records.
1131  *
1132  * Returns 0 on sucess
1133  *        -1 on failure (indicating broken PG/PP records)
1134  */
sam_hdr_link_pg(SAM_hdr * hdr)1135 int sam_hdr_link_pg(SAM_hdr *hdr) {
1136     int i, j, ret = 0;
1137 
1138     hdr->npg_end_alloc = hdr->npg;
1139     hdr->pg_end = realloc(hdr->pg_end, hdr->npg * sizeof(*hdr->pg_end));
1140     if (!hdr->pg_end)
1141 	return -1;
1142 
1143     for (i = 0; i < hdr->npg; i++)
1144 	hdr->pg_end[i] = i;
1145 
1146     for (i = 0; i < hdr->npg; i++) {
1147 	HashItem *hi;
1148 	SAM_hdr_tag *tag;
1149 
1150 	for (tag = hdr->pg[i].tag; tag; tag = tag->next) {
1151 	    if (tag->str[0] == 'P' && tag->str[1] == 'P')
1152 		break;
1153 	}
1154 	if (!tag) {
1155 	    /* Chain start points */
1156 	    continue;
1157 	}
1158 
1159 	hi = HashTableSearch(hdr->pg_hash, tag->str+3, tag->len-3);
1160 	if (!hi) {
1161 	    ret = -1;
1162 	    continue;
1163 	}
1164 
1165 	hdr->pg[i].prev_id = hdr->pg[hi->data.i].id;
1166 	hdr->pg_end[hi->data.i] = -1;
1167     }
1168 
1169     for (i = j = 0; i < hdr->npg; i++) {
1170 	if (hdr->pg_end[i] != -1)
1171 	    hdr->pg_end[j++] = hdr->pg_end[i];
1172     }
1173     hdr->npg_end = j;
1174 
1175     return ret;
1176 }
1177 
1178 /*
1179  * Returns a unique ID from a base name.
1180  *
1181  * The value returned is valid until the next call to
1182  * this function.
1183  */
sam_hdr_PG_ID(SAM_hdr * sh,const char * name)1184 const char *sam_hdr_PG_ID(SAM_hdr *sh, const char *name) {
1185     if (!(HashTableSearch(sh->pg_hash, (char *)name, 0)))
1186 	return name;
1187 
1188     do {
1189 	sprintf(sh->ID_buf, "%.1000s.%d", name, sh->ID_cnt++);
1190     } while (HashTableSearch(sh->pg_hash, sh->ID_buf, 0));
1191 
1192     return sh->ID_buf;
1193 }
1194 
1195 /*
1196  * Add an @PG line.
1197  *
1198  * If we wish complete control over this use sam_hdr_add() directly. This
1199  * function uses that, but attempts to do a lot of tedious house work for
1200  * you too.
1201  *
1202  * - It will generate a suitable ID if the supplied one clashes.
1203  * - It will generate multiple @PG records if we have multiple PG chains.
1204  *
1205  * Call it as per sam_hdr_add() with a series of key,value pairs ending
1206  * in NULL.
1207  *
1208  * Returns 0 on success
1209  *        -1 on failure
1210  */
sam_hdr_add_PG(SAM_hdr * sh,const char * name,...)1211 int sam_hdr_add_PG(SAM_hdr *sh, const char *name, ...) {
1212     va_list args;
1213     va_start(args, name);
1214 
1215     if (sh->npg_end) {
1216 	/* Copy ends array to avoid us looping while modifying it */
1217 	int *end = malloc(sh->npg_end * sizeof(int));
1218 	int i, nends = sh->npg_end;
1219 
1220 	if (!end)
1221 	    return -1;
1222 
1223 	memcpy(end, sh->pg_end, nends * sizeof(*end));
1224 
1225 	for (i = 0; i < nends; i++) {
1226 	    if (-1 == sam_hdr_vadd(sh, "PG", args,
1227 				   "ID", sam_hdr_PG_ID(sh, name),
1228 				   "PN", name,
1229 				   "PP", sh->pg[end[i]].name,
1230 				   NULL)) {
1231 		free(end);
1232 		return  -1;
1233 	    }
1234 	}
1235 
1236 	free(end);
1237     } else {
1238 	if (-1 == sam_hdr_vadd(sh, "PG", args,
1239 			       "ID", sam_hdr_PG_ID(sh, name),
1240 			       "PN", name,
1241 			       NULL))
1242 	    return -1;
1243     }
1244 
1245     //sam_hdr_dump(sh);
1246 
1247     return 0;
1248 }
1249 
1250 /*
1251  * A function to help with construction of CL tags in @PG records.
1252  * Takes an argc, argv pair and returns a single space-separated string.
1253  * This string should be deallocated by the calling function.
1254  *
1255  * Returns malloced char * on success
1256  *         NULL on failure
1257  */
stringify_argv(int argc,char * argv[])1258 char *stringify_argv(int argc, char *argv[]) {
1259     char *str, *cp;
1260     size_t nbytes = 1;
1261     int i, j;
1262 
1263     /* Allocate */
1264     for (i = 0; i < argc; i++) {
1265 	nbytes += strlen(argv[i]) + 1;
1266     }
1267     if (!(str = malloc(nbytes)))
1268 	return NULL;
1269 
1270     /* Copy */
1271     cp = str;
1272     for (i = 0; i < argc; i++) {
1273 	j = 0;
1274 	while (argv[i][j]) {
1275 	    if (argv[i][j] == '\t')
1276 		*cp++ = ' ';
1277 	    else
1278 		*cp++ = argv[i][j];
1279 	    j++;
1280 	}
1281 	*cp++ = ' ';
1282     }
1283     *cp++ = 0;
1284 
1285     return str;
1286 }
1287