1 /*
2 Copyright (c) 2013 Genome Research Ltd.
3 Author: James Bonfield <jkb@sanger.ac.uk>
4 
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
7 
8    1. Redistributions of source code must retain the above copyright notice,
9 this list of conditions and the following disclaimer.
10 
11    2. Redistributions in binary form must reproduce the above copyright notice,
12 this list of conditions and the following disclaimer in the documentation
13 and/or other materials provided with the distribution.
14 
15    3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16 Institute nor the names of its contributors may be used to endorse or promote
17 products derived from this software without specific prior written permission.
18 
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30 
31 #include <config.h>
32 
33 #include <string.h>
34 #include <assert.h>
35 
36 #include "hts_internal.h"
37 #include "cram/sam_header.h"
38 #include "cram/string_alloc.h"
39 
sam_hdr_error(char * msg,char * line,int len,int lno)40 static void sam_hdr_error(char *msg, char *line, int len, int lno) {
41     int j;
42 
43     for (j = 0; j < len && line[j] != '\n'; j++)
44 	;
45     hts_log_error("%s at line %d: \"%.*s\"", msg, lno, j, line);
46 }
47 
sam_hdr_dump(SAM_hdr * hdr)48 void sam_hdr_dump(SAM_hdr *hdr) {
49     khint_t k;
50     int i;
51 
52     printf("===DUMP===\n");
53     for (k = kh_begin(hdr->h); k != kh_end(hdr->h); k++) {
54 	SAM_hdr_type *t1, *t2;
55 	char c[2];
56 
57 	if (!kh_exist(hdr->h, k))
58 	    continue;
59 
60 	t1 = t2 = kh_val(hdr->h, k);
61 	c[0] = kh_key(hdr->h, k)>>8;
62 	c[1] = kh_key(hdr->h, k)&0xff;
63 	printf("Type %.2s, count %d\n", c, t1->prev->order+1);
64 
65 	do {
66 	    SAM_hdr_tag *tag;
67 	    printf(">>>%d ", t1->order);
68 	    for (tag = t1->tag; tag; tag=tag->next) {
69 		printf("\"%.2s\":\"%.*s\"\t",
70 		       tag->str, tag->len-3, tag->str+3);
71 	    }
72 	    putchar('\n');
73 	    t1 = t1->next;
74 	} while (t1 != t2);
75     }
76 
77     /* Dump out PG chains */
78     printf("\n@PG chains:\n");
79     for (i = 0; i < hdr->npg_end; i++) {
80 	int j;
81 	printf("  %d:", i);
82 	for (j = hdr->pg_end[i]; j != -1; j = hdr->pg[j].prev_id) {
83 	    printf("%s%d(%.*s)",
84 		   j == hdr->pg_end[i] ? " " : "->",
85 		   j, hdr->pg[j].name_len, hdr->pg[j].name);
86 	}
87 	printf("\n");
88     }
89 
90     puts("===END DUMP===");
91 }
92 
93 /* Updates the hash tables in the SAM_hdr structure.
94  *
95  * Returns 0 on success;
96  *        -1 on failure
97  */
sam_hdr_update_hashes(SAM_hdr * sh,int type,SAM_hdr_type * h_type)98 static int sam_hdr_update_hashes(SAM_hdr *sh,
99 				 int type,
100 				 SAM_hdr_type *h_type) {
101     /* Add to reference hash? */
102     if ((type>>8) == 'S' && (type&0xff) == 'Q') {
103 	SAM_hdr_tag *tag;
104 	SAM_SQ *new_ref;
105 	int nref = sh->nref;
106 
107 	new_ref = realloc(sh->ref, (sh->nref+1)*sizeof(*sh->ref));
108 	if (!new_ref)
109 	    return -1;
110 	sh->ref = new_ref;
111 
112 	tag = h_type->tag;
113 	sh->ref[nref].name = NULL;
114 	sh->ref[nref].len  = 0;
115 	sh->ref[nref].ty = h_type;
116 	sh->ref[nref].tag  = tag;
117 
118 	while (tag) {
119 	    if (tag->str[0] == 'S' && tag->str[1] == 'N') {
120 		if (!(sh->ref[nref].name = malloc(tag->len)))
121 		    return -1;
122 		strncpy(sh->ref[nref].name, tag->str+3, tag->len-3);
123 		sh->ref[nref].name[tag->len-3] = 0;
124 	    } else if (tag->str[0] == 'L' && tag->str[1] == 'N') {
125 		sh->ref[nref].len = atoi(tag->str+3);
126 	    }
127 	    tag = tag->next;
128 	}
129 
130 	if (sh->ref[nref].name) {
131 	    khint_t k;
132 	    int r;
133 	    k = kh_put(m_s2i, sh->ref_hash, sh->ref[nref].name, &r);
134 	    if (-1 == r) return -1;
135 	    kh_val(sh->ref_hash, k) = nref;
136 	} else {
137 	    return -1; // SN should be present, according to spec.
138 	}
139 
140 	sh->nref++;
141     }
142 
143     /* Add to read-group hash? */
144     if ((type>>8) == 'R' && (type&0xff) == 'G') {
145 	SAM_hdr_tag *tag;
146 	SAM_RG *new_rg;
147 	int nrg = sh->nrg;
148 
149 	new_rg = realloc(sh->rg, (sh->nrg+1)*sizeof(*sh->rg));
150 	if (!new_rg)
151 	    return -1;
152 	sh->rg = new_rg;
153 
154 	tag = h_type->tag;
155 	sh->rg[nrg].name = NULL;
156 	sh->rg[nrg].name_len = 0;
157 	sh->rg[nrg].ty   = h_type;
158 	sh->rg[nrg].tag  = tag;
159 	sh->rg[nrg].id   = nrg;
160 
161 	while (tag) {
162 	    if (tag->str[0] == 'I' && tag->str[1] == 'D') {
163 		if (!(sh->rg[nrg].name = malloc(tag->len)))
164 		    return -1;
165 		strncpy(sh->rg[nrg].name, tag->str+3, tag->len-3);
166 		sh->rg[nrg].name[tag->len-3] = 0;
167 		sh->rg[nrg].name_len = strlen(sh->rg[nrg].name);
168 	    }
169 	    tag = tag->next;
170 	}
171 
172 	if (sh->rg[nrg].name) {
173 	    khint_t k;
174 	    int r;
175 	    k = kh_put(m_s2i, sh->rg_hash, sh->rg[nrg].name, &r);
176 	    if (-1 == r) return -1;
177 	    kh_val(sh->rg_hash, k) = nrg;
178 	} else {
179 	    return -1; // ID should be present, according to spec.
180 	}
181 
182 	sh->nrg++;
183     }
184 
185     /* Add to program hash? */
186     if ((type>>8) == 'P' && (type&0xff) == 'G') {
187 	SAM_hdr_tag *tag;
188 	SAM_PG *new_pg;
189 	int npg = sh->npg;
190 
191 	new_pg = realloc(sh->pg, (sh->npg+1)*sizeof(*sh->pg));
192 	if (!new_pg)
193 	    return -1;
194 	sh->pg = new_pg;
195 
196 	tag = h_type->tag;
197 	sh->pg[npg].name = NULL;
198 	sh->pg[npg].name_len = 0;
199 	sh->pg[npg].ty  = h_type;
200 	sh->pg[npg].tag  = tag;
201 	sh->pg[npg].id   = npg;
202 	sh->pg[npg].prev_id = -1;
203 
204 	while (tag) {
205 	    if (tag->str[0] == 'I' && tag->str[1] == 'D') {
206 		if (!(sh->pg[npg].name = malloc(tag->len)))
207 		    return -1;
208 		strncpy(sh->pg[npg].name, tag->str+3, tag->len-3);
209 		sh->pg[npg].name[tag->len-3] = 0;
210 		sh->pg[npg].name_len = strlen(sh->pg[npg].name);
211 	    } else if (tag->str[0] == 'P' && tag->str[1] == 'P') {
212 		// Resolve later if needed
213 		khint_t k;
214 		char tmp = tag->str[tag->len]; tag->str[tag->len] = 0;
215 		k = kh_get(m_s2i, sh->pg_hash, tag->str+3);
216 		tag->str[tag->len] = tmp;
217 
218 		if (k != kh_end(sh->pg_hash)) {
219 		    int p_id = kh_val(sh->pg_hash, k);
220 		    sh->pg[npg].prev_id = sh->pg[p_id].id;
221 
222 		    /* Unmark previous entry as a PG termination */
223 		    if (sh->npg_end > 0 &&
224 			sh->pg_end[sh->npg_end-1] == p_id) {
225 			sh->npg_end--;
226 		    } else {
227 			int i;
228 			for (i = 0; i < sh->npg_end; i++) {
229 			    if (sh->pg_end[i] == p_id) {
230 				memmove(&sh->pg_end[i], &sh->pg_end[i+1],
231 					(sh->npg_end-i-1)*sizeof(*sh->pg_end));
232 				sh->npg_end--;
233 			    }
234 			}
235 		    }
236 		} else {
237 		    sh->pg[npg].prev_id = -1;
238 		}
239 	    }
240 	    tag = tag->next;
241 	}
242 
243 	if (sh->pg[npg].name) {
244 	    khint_t k;
245 	    int r;
246 	    k = kh_put(m_s2i, sh->pg_hash, sh->pg[npg].name, &r);
247 	    if (-1 == r) return -1;
248 	    kh_val(sh->pg_hash, k) = npg;
249 	} else {
250 	    return -1; // ID should be present, according to spec.
251 	}
252 
253 	/* Add to npg_end[] array. Remove later if we find a PP line */
254 	if (sh->npg_end >= sh->npg_end_alloc) {
255 	    int *new_pg_end;
256 	    int  new_alloc = sh->npg_end_alloc ? sh->npg_end_alloc*2 : 4;
257 
258 	    new_pg_end = realloc(sh->pg_end, new_alloc * sizeof(int));
259 	    if (!new_pg_end)
260 		return -1;
261 	    sh->npg_end_alloc = new_alloc;
262 	    sh->pg_end = new_pg_end;
263 	}
264 	sh->pg_end[sh->npg_end++] = npg;
265 
266 	sh->npg++;
267     }
268 
269     return 0;
270 }
271 
272 /*
273  * Appends a formatted line to an existing SAM header.
274  * Line is a full SAM header record, eg "@SQ\tSN:foo\tLN:100", with
275  * optional new-line. If it contains more than 1 line then multiple lines
276  * will be added in order.
277  *
278  * Input text is of maximum length len or as terminated earlier by a NUL.
279  * Len may be 0 if unknown, in which case lines must be NUL-terminated.
280  *
281  * Returns 0 on success
282  *        -1 on failure
283  */
sam_hdr_add_lines(SAM_hdr * sh,const char * lines,int len)284 int sam_hdr_add_lines(SAM_hdr *sh, const char *lines, int len) {
285     int i, lno, text_offset;
286     char *hdr;
287 
288     if (!len)
289 	len = strlen(lines);
290 
291     text_offset = ks_len(&sh->text);
292     if (EOF == kputsn(lines, len, &sh->text))
293 	return -1;
294     hdr = ks_str(&sh->text) + text_offset;
295 
296     for (i = 0, lno = 1; i < len && hdr[i] != '\0'; i++, lno++) {
297 	khint32_t type;
298 	khint_t k;
299 
300 	int l_start = i, new;
301 	SAM_hdr_type *h_type;
302 	SAM_hdr_tag *h_tag, *last;
303 
304 	if (hdr[i] != '@') {
305 	    int j;
306 	    for (j = i; j < len && hdr[j] != '\0' && hdr[j] != '\n'; j++)
307 		;
308 	    sam_hdr_error("Header line does not start with '@'",
309 			  &hdr[l_start], len - l_start, lno);
310 	    return -1;
311 	}
312 
313 	type = (hdr[i+1]<<8) | hdr[i+2];
314 	if (hdr[i+1] < 'A' || hdr[i+1] > 'z' ||
315 	    hdr[i+2] < 'A' || hdr[i+2] > 'z') {
316 	    sam_hdr_error("Header line does not have a two character key",
317 			  &hdr[l_start], len - l_start, lno);
318 	    return -1;
319 	}
320 
321 	i += 3;
322 	if (hdr[i] == '\n')
323 	    continue;
324 
325 	// Add the header line type
326 	if (!(h_type = pool_alloc(sh->type_pool)))
327 	    return -1;
328 	if (-1 == (k = kh_put(sam_hdr, sh->h, type, &new)))
329 	    return -1;
330 
331 	// Form the ring, either with self or other lines of this type
332 	if (!new) {
333 	    SAM_hdr_type *t = kh_val(sh->h, k), *p;
334 	    p = t->prev;
335 
336 	    assert(p->next == t);
337 	    p->next = h_type;
338 	    h_type->prev = p;
339 
340 	    t->prev = h_type;
341 	    h_type->next = t;
342 	    h_type->order = p->order+1;
343 	} else {
344 	    kh_val(sh->h, k) = h_type;
345 	    h_type->prev = h_type->next = h_type;
346 	    h_type->order = 0;
347 	}
348 
349 	// Parse the tags on this line
350 	last = NULL;
351 	if ((type>>8) == 'C' && (type&0xff) == 'O') {
352 	    int j;
353 	    if (hdr[i] != '\t') {
354 		sam_hdr_error("Missing tab",
355 			      &hdr[l_start], len - l_start, lno);
356 		return -1;
357 	    }
358 
359 	    for (j = ++i; j < len && hdr[j] != '\0' && hdr[j] != '\n'; j++)
360 		;
361 
362 	    if (!(h_type->tag = h_tag = pool_alloc(sh->tag_pool)))
363 		return -1;
364 	    h_tag->str = string_ndup(sh->str_pool, &hdr[i], j-i);
365 	    h_tag->len = j-i;
366 	    h_tag->next = NULL;
367 	    if (!h_tag->str)
368 		return -1;
369 
370 	    i = j;
371 
372 	} else {
373 	    do {
374 		int j;
375 		if (hdr[i] != '\t') {
376 		    sam_hdr_error("Missing tab",
377 				  &hdr[l_start], len - l_start, lno);
378 		    return -1;
379 		}
380 
381 		for (j = ++i; j < len && hdr[j] != '\0' && hdr[j] != '\n' && hdr[j] != '\t'; j++)
382 		    ;
383 
384 		if (!(h_tag = pool_alloc(sh->tag_pool)))
385 		    return -1;
386 		h_tag->str = string_ndup(sh->str_pool, &hdr[i], j-i);
387 		h_tag->len = j-i;
388 		h_tag->next = NULL;
389 		if (!h_tag->str)
390 		    return -1;
391 
392 		if (h_tag->len < 3 || h_tag->str[2] != ':') {
393 		    sam_hdr_error("Malformed key:value pair",
394 				  &hdr[l_start], len - l_start, lno);
395 		    return -1;
396 		}
397 
398 		if (last)
399 		    last->next = h_tag;
400 		else
401 		    h_type->tag = h_tag;
402 
403 		last = h_tag;
404 		i = j;
405 	    } while (i < len && hdr[i] != '\0' && hdr[i] != '\n');
406 	}
407 
408 	/* Update RG/SQ hashes */
409 	if (-1 == sam_hdr_update_hashes(sh, type, h_type))
410 	    return -1;
411     }
412 
413     return 0;
414 }
415 
416 /*
417  * Adds a single line to a SAM header.
418  * Specify type and one or more key,value pairs, ending with the NULL key.
419  * Eg. sam_hdr_add(h, "SQ", "ID", "foo", "LN", "100", NULL).
420  *
421  * Returns index for specific entry on success (eg 2nd SQ, 4th RG)
422  *        -1 on failure
423  */
sam_hdr_add(SAM_hdr * sh,const char * type,...)424 int sam_hdr_add(SAM_hdr *sh, const char *type, ...) {
425     va_list args;
426     va_start(args, type);
427     return sam_hdr_vadd(sh, type, args, NULL);
428 }
429 
430 /*
431  * sam_hdr_add with a va_list interface.
432  *
433  * Note: this function invokes va_arg at least once, making the value
434  * of ap indeterminate after the return.  The caller should call
435  * va_start/va_end before/after calling this function or use va_copy.
436  */
sam_hdr_vadd(SAM_hdr * sh,const char * type,va_list ap,...)437 int sam_hdr_vadd(SAM_hdr *sh, const char *type, va_list ap, ...) {
438     va_list args;
439     SAM_hdr_type *h_type;
440     SAM_hdr_tag *h_tag, *last;
441     int new;
442     khint32_t type_i = (type[0]<<8) | type[1], k;
443 
444     if (EOF == kputc_('@', &sh->text))
445 	return -1;
446     if (EOF == kputsn(type, 2, &sh->text))
447 	return -1;
448 
449     if (!(h_type = pool_alloc(sh->type_pool)))
450 	return -1;
451     if (-1 == (k = kh_put(sam_hdr, sh->h, type_i, &new)))
452 	return -1;
453 
454     // Form the ring, either with self or other lines of this type
455     if (!new) {
456 	SAM_hdr_type *t = kh_val(sh->h, k), *p;
457 	p = t->prev;
458 
459 	assert(p->next == t);
460 	p->next = h_type;
461 	h_type->prev = p;
462 
463 	t->prev = h_type;
464 	h_type->next = t;
465 	h_type->order = p->order + 1;
466     } else {
467 	kh_val(sh->h, k) = h_type;
468 	h_type->prev = h_type->next = h_type;
469 	h_type->order = 0;
470     }
471 
472     last = NULL;
473 
474     // Any ... varargs
475     va_start(args, ap);
476     for (;;) {
477 	char *k, *v;
478 	int idx;
479 
480 	if (!(k = (char *)va_arg(args, char *)))
481 	    break;
482 	v = va_arg(args, char *);
483 
484 	if (EOF == kputc_('\t', &sh->text))
485 	    return -1;
486 
487 	if (!(h_tag = pool_alloc(sh->tag_pool)))
488 	    return -1;
489 	idx = ks_len(&sh->text);
490 
491 	if (EOF == kputs(k, &sh->text))
492 	    return -1;
493 	if (EOF == kputc_(':', &sh->text))
494 	    return -1;
495 	if (EOF == kputs(v, &sh->text))
496 	    return -1;
497 
498 	h_tag->len = ks_len(&sh->text) - idx;
499 	h_tag->str = string_ndup(sh->str_pool,
500 				 ks_str(&sh->text) + idx,
501 				 h_tag->len);
502 	h_tag->next = NULL;
503 	if (!h_tag->str)
504 	    return -1;
505 
506 	if (last)
507 	    last->next = h_tag;
508 	else
509 	    h_type->tag = h_tag;
510 
511 	last = h_tag;
512     }
513     va_end(args);
514 
515     // Plus the specified va_list params
516     for (;;) {
517 	char *k, *v;
518 	int idx;
519 
520 	if (!(k = (char *)va_arg(ap, char *)))
521 	    break;
522 	v = va_arg(ap, char *);
523 
524 	if (EOF == kputc_('\t', &sh->text))
525 	    return -1;
526 
527 	if (!(h_tag = pool_alloc(sh->tag_pool)))
528 	    return -1;
529 	idx = ks_len(&sh->text);
530 
531 	if (EOF == kputs(k, &sh->text))
532 	    return -1;
533 	if (EOF == kputc_(':', &sh->text))
534 	    return -1;
535 	if (EOF == kputs(v, &sh->text))
536 	    return -1;
537 
538 	h_tag->len = ks_len(&sh->text) - idx;
539 	h_tag->str = string_ndup(sh->str_pool,
540 				 ks_str(&sh->text) + idx,
541 				 h_tag->len);
542 	h_tag->next = NULL;
543 	if (!h_tag->str)
544 	    return -1;
545 
546 	if (last)
547 	    last->next = h_tag;
548 	else
549 	    h_type->tag = h_tag;
550 
551 	last = h_tag;
552     }
553     va_end(ap);
554 
555     if (EOF == kputc('\n', &sh->text))
556 	return -1;
557 
558     int itype = (type[0]<<8) | type[1];
559     if (-1 == sam_hdr_update_hashes(sh, itype, h_type))
560 	return -1;
561 
562     return h_type->order;
563 }
564 
565 /*
566  * Returns the first header item matching 'type'. If ID is non-NULL it checks
567  * for the tag ID: and compares against the specified ID.
568  *
569  * Returns NULL if no type/ID is found
570  */
sam_hdr_find(SAM_hdr * hdr,char * type,char * ID_key,char * ID_value)571 SAM_hdr_type *sam_hdr_find(SAM_hdr *hdr, char *type,
572 			   char *ID_key, char *ID_value) {
573     SAM_hdr_type *t1, *t2;
574     int itype = (type[0]<<8)|(type[1]);
575     khint_t k;
576 
577     /* Special case for types we have prebuilt hashes on */
578     if (ID_key) {
579 	if (type[0]   == 'S' && type[1]   == 'Q' &&
580 	    ID_key[0] == 'S' && ID_key[1] == 'N') {
581 	    k = kh_get(m_s2i, hdr->ref_hash, ID_value);
582 	    return k != kh_end(hdr->ref_hash)
583 		? hdr->ref[kh_val(hdr->ref_hash, k)].ty
584 		: NULL;
585 	}
586 
587 	if (type[0]   == 'R' && type[1]   == 'G' &&
588 	    ID_key[0] == 'I' && ID_key[1] == 'D') {
589 	    k = kh_get(m_s2i, hdr->rg_hash, ID_value);
590 	    return k != kh_end(hdr->rg_hash)
591 		? hdr->rg[kh_val(hdr->rg_hash, k)].ty
592 		: NULL;
593 	}
594 
595 	if (type[0]   == 'P' && type[1]   == 'G' &&
596 	    ID_key[0] == 'I' && ID_key[1] == 'D') {
597 	    k = kh_get(m_s2i, hdr->pg_hash, ID_value);
598 	    return k != kh_end(hdr->pg_hash)
599 		? hdr->pg[kh_val(hdr->pg_hash, k)].ty
600 		: NULL;
601 	}
602     }
603 
604     k = kh_get(sam_hdr, hdr->h, itype);
605     if (k == kh_end(hdr->h))
606 	return NULL;
607 
608     if (!ID_key)
609 	return kh_val(hdr->h, k);
610 
611     t1 = t2 = kh_val(hdr->h, k);
612     do {
613 	SAM_hdr_tag *tag;
614 	for (tag = t1->tag; tag; tag = tag->next) {
615 	    if (tag->str[0] == ID_key[0] && tag->str[1] == ID_key[1]) {
616 		char *cp1 = tag->str+3;
617 		char *cp2 = ID_value;
618 		while (*cp1 && *cp1 == *cp2)
619 		    cp1++, cp2++;
620 		if (*cp2 || *cp1)
621 		    continue;
622 		return t1;
623 	    }
624 	}
625 	t1 = t1->next;
626     } while (t1 != t2);
627 
628     return NULL;
629 }
630 
631 /*
632  * As per SAM_hdr_type, but returns a complete line of formatted text
633  * for a specific head type/ID combination. If ID is NULL then it returns
634  * the first line of the specified type.
635  *
636  * The returned string is malloced and should be freed by the calling
637  * function with free().
638  *
639  * Returns NULL if no type/ID is found.
640  */
sam_hdr_find_line(SAM_hdr * hdr,char * type,char * ID_key,char * ID_value)641 char *sam_hdr_find_line(SAM_hdr *hdr, char *type,
642 			char *ID_key, char *ID_value) {
643     SAM_hdr_type *ty = sam_hdr_find(hdr, type, ID_key, ID_value);
644     kstring_t ks = KS_INITIALIZER;
645     SAM_hdr_tag *tag;
646     int r = 0;
647 
648     if (!ty)
649 	return NULL;
650 
651     // Paste together the line from the hashed copy
652     r |= (kputc_('@', &ks) == EOF);
653     r |= (kputs(type, &ks) == EOF);
654     for (tag = ty->tag; tag; tag = tag->next) {
655 	r |= (kputc_('\t', &ks) == EOF);
656 	r |= (kputsn(tag->str, tag->len, &ks) == EOF);
657     }
658 
659     if (r) {
660 	KS_FREE(&ks);
661 	return NULL;
662     }
663 
664     return ks_str(&ks);
665 }
666 
667 
668 /*
669  * Looks for a specific key in a single sam header line.
670  * If prev is non-NULL it also fills this out with the previous tag, to
671  * permit use in key removal. *prev is set to NULL when the tag is the first
672  * key in the list. When a tag isn't found, prev (if non NULL) will be the last
673  * tag in the existing list.
674  *
675  * Returns the tag pointer on success
676  *         NULL on failure
677  */
sam_hdr_find_key(SAM_hdr * sh,SAM_hdr_type * type,char * key,SAM_hdr_tag ** prev)678 SAM_hdr_tag *sam_hdr_find_key(SAM_hdr *sh,
679 			      SAM_hdr_type *type,
680 			      char *key,
681 			      SAM_hdr_tag **prev) {
682     SAM_hdr_tag *tag, *p = NULL;
683 
684     for (tag = type->tag; tag; p = tag, tag = tag->next) {
685 	if (tag->str[0] == key[0] && tag->str[1] == key[1]) {
686 	    if (prev)
687 		*prev = p;
688 	    return tag;
689 	}
690     }
691 
692     if (prev)
693 	*prev = p;
694 
695     return NULL;
696 }
697 
698 
699 /*
700  * Adds or updates tag key,value pairs in a header line.
701  * Eg for adding M5 tags to @SQ lines or updating sort order for the
702  * @HD line (although use the sam_hdr_sort_order() function for
703  * HD manipulation, which is a wrapper around this funuction).
704  *
705  * Specify multiple key,value pairs ending in NULL.
706  *
707  * Returns 0 on success
708  *        -1 on failure
709  */
sam_hdr_update(SAM_hdr * hdr,SAM_hdr_type * type,...)710 int sam_hdr_update(SAM_hdr *hdr, SAM_hdr_type *type, ...) {
711     va_list ap;
712 
713     va_start(ap, type);
714 
715     for (;;) {
716 	char *k, *v;
717 	int idx;
718 	SAM_hdr_tag *tag, *prev;
719 
720 	if (!(k = (char *)va_arg(ap, char *)))
721 	    break;
722 	v = va_arg(ap, char *);
723 
724 	tag = sam_hdr_find_key(hdr, type, k, &prev);
725 	if (!tag) {
726 	    if (!(tag = pool_alloc(hdr->tag_pool)))
727 		return -1;
728 	    if (prev)
729 		prev->next = tag;
730 	    else
731 		type->tag = tag;
732 
733 	    tag->next = NULL;
734 	}
735 
736 	idx = ks_len(&hdr->text);
737 	if (ksprintf(&hdr->text, "%2.2s:%s", k, v) < 0)
738 	    return -1;
739 	tag->len = ks_len(&hdr->text) - idx;
740 	tag->str = string_ndup(hdr->str_pool,
741 			       ks_str(&hdr->text) + idx,
742 			       tag->len);
743 	if (!tag->str)
744 	    return -1;
745     }
746 
747     va_end(ap);
748 
749     return 0;
750 }
751 
752 #define K(a) (((a)[0]<<8)|((a)[1]))
753 
754 /*
755  * Returns the sort order:
756  */
sam_hdr_sort_order(SAM_hdr * hdr)757 enum sam_sort_order sam_hdr_sort_order(SAM_hdr *hdr) {
758     return hdr->sort_order;
759 }
760 
sam_hdr_parse_sort_order(SAM_hdr * hdr)761 static enum sam_sort_order sam_hdr_parse_sort_order(SAM_hdr *hdr) {
762     khint_t k;
763     enum sam_sort_order so;
764 
765     so = ORDER_UNKNOWN;
766     k = kh_get(sam_hdr, hdr->h, K("HD"));
767     if (k != kh_end(hdr->h)) {
768 	SAM_hdr_type *ty = kh_val(hdr->h, k);
769 	SAM_hdr_tag *tag;
770         for (tag = ty->tag; tag; tag = tag->next) {
771 	    if (tag->str[0] == 'S' && tag->str[1] == 'O') {
772 		if (strcmp(tag->str+3, "unsorted") == 0)
773 		    so = ORDER_UNSORTED;
774 		else if (strcmp(tag->str+3, "queryname") == 0)
775 		    so = ORDER_NAME;
776 		else if (strcmp(tag->str+3, "coordinate") == 0)
777 		    so = ORDER_COORD;
778 		else if (strcmp(tag->str+3, "unknown") != 0)
779 		    hts_log_error("Unknown sort order field: %s", tag->str+3);
780 	    }
781 	}
782     }
783 
784     return so;
785 }
786 
787 
788 /*
789  * Reconstructs the kstring from the header hash table.
790  * Returns 0 on success
791  *        -1 on failure
792  */
sam_hdr_rebuild(SAM_hdr * hdr)793 int sam_hdr_rebuild(SAM_hdr *hdr) {
794     /* Order: HD then others */
795     kstring_t ks = KS_INITIALIZER;
796     khint_t k;
797 
798 
799     k = kh_get(sam_hdr, hdr->h, K("HD"));
800     if (k != kh_end(hdr->h)) {
801 	SAM_hdr_type *ty = kh_val(hdr->h, k);
802 	SAM_hdr_tag *tag;
803 	if (EOF == kputs("@HD", &ks))
804 	    return -1;
805 	for (tag = ty->tag; tag; tag = tag->next) {
806 	    if (EOF == kputc_('\t', &ks))
807 		return -1;
808 	    if (EOF == kputsn_(tag->str, tag->len, &ks))
809 		return -1;
810 	}
811 	if (EOF == kputc('\n', &ks))
812 	    return -1;
813     }
814 
815     for (k = kh_begin(hdr->h); k != kh_end(hdr->h); k++) {
816 	SAM_hdr_type *t1, *t2;
817 
818 	if (!kh_exist(hdr->h, k))
819 	    continue;
820 
821 	if (kh_key(hdr->h, k) == K("HD"))
822 	    continue;
823 
824 	t1 = t2 = kh_val(hdr->h, k);
825 	do {
826 	    SAM_hdr_tag *tag;
827 	    char c[2];
828 
829 	    if (EOF == kputc_('@', &ks))
830 		return -1;
831 	    c[0] = kh_key(hdr->h, k)>>8;
832 	    c[1] = kh_key(hdr->h, k)&0xff;
833 	    if (EOF == kputsn_(c, 2, &ks))
834 		return -1;
835 	    for (tag = t1->tag; tag; tag=tag->next) {
836 		if (EOF == kputc_('\t', &ks))
837 		    return -1;
838 		if (EOF == kputsn_(tag->str, tag->len, &ks))
839 		    return -1;
840 	    }
841 	    if (EOF == kputc('\n', &ks))
842 		return -1;
843 	    t1 = t1->next;
844 	} while (t1 != t2);
845     }
846 
847     if (ks_str(&hdr->text))
848 	KS_FREE(&hdr->text);
849 
850     hdr->text = ks;
851 
852     return 0;
853 }
854 
855 
856 /*
857  * Creates an empty SAM header, ready to be populated.
858  *
859  * Returns a SAM_hdr struct on success (free with sam_hdr_free())
860  *         NULL on failure
861  */
sam_hdr_new()862 SAM_hdr *sam_hdr_new() {
863     SAM_hdr *sh = calloc(1, sizeof(*sh));
864 
865     if (!sh)
866 	return NULL;
867 
868     sh->h = kh_init(sam_hdr);
869     if (!sh->h)
870 	goto err;
871 
872     sh->ID_cnt = 1;
873     sh->ref_count = 1;
874 
875     sh->nref = 0;
876     sh->ref  = NULL;
877     if (!(sh->ref_hash = kh_init(m_s2i)))
878 	goto err;
879 
880     sh->nrg = 0;
881     sh->rg  = NULL;
882     if (!(sh->rg_hash = kh_init(m_s2i)))
883 	goto err;
884 
885     sh->npg = 0;
886     sh->pg  = NULL;
887     sh->npg_end = sh->npg_end_alloc = 0;
888     sh->pg_end = NULL;
889     if (!(sh->pg_hash = kh_init(m_s2i)))
890 	goto err;
891 
892     KS_INIT(&sh->text);
893 
894     if (!(sh->tag_pool = pool_create(sizeof(SAM_hdr_tag))))
895 	goto err;
896 
897     if (!(sh->type_pool = pool_create(sizeof(SAM_hdr_type))))
898 	goto err;
899 
900     if (!(sh->str_pool = string_pool_create(8192)))
901 	goto err;
902 
903     return sh;
904 
905  err:
906     if (sh->h)
907 	kh_destroy(sam_hdr, sh->h);
908 
909     if (sh->tag_pool)
910 	pool_destroy(sh->tag_pool);
911 
912     if (sh->type_pool)
913 	pool_destroy(sh->type_pool);
914 
915     if (sh->str_pool)
916 	string_pool_destroy(sh->str_pool);
917 
918     free(sh);
919 
920     return NULL;
921 }
922 
923 
924 /*
925  * Tokenises a SAM header into a hash table.
926  * Also extracts a few bits on specific data types, such as @RG lines.
927  *
928  * Returns a SAM_hdr struct on success (free with sam_hdr_free())
929  *         NULL on failure
930  */
sam_hdr_parse_(const char * hdr,int len)931 SAM_hdr *sam_hdr_parse_(const char *hdr, int len) {
932     /* Make an empty SAM_hdr */
933     SAM_hdr *sh;
934 
935     sh = sam_hdr_new();
936     if (NULL == sh) return NULL;
937 
938     if (NULL == hdr) return sh; // empty header is permitted
939 
940     /* Parse the header, line by line */
941     if (-1 == sam_hdr_add_lines(sh, hdr, len)) {
942 	sam_hdr_free(sh);
943 	return NULL;
944     }
945 
946     /* Obtain sort order */
947     sh->sort_order = sam_hdr_parse_sort_order(sh);
948 
949     //sam_hdr_dump(sh);
950     //sam_hdr_add(sh, "RG", "ID", "foo", "SM", "bar", NULL);
951     //sam_hdr_rebuild(sh);
952     //printf(">>%s<<", ks_str(sh->text));
953 
954     //parse_references(sh);
955     //parse_read_groups(sh);
956 
957     sam_hdr_link_pg(sh);
958     //sam_hdr_dump(sh);
959 
960     return sh;
961 }
962 
963 /*
964  * Produces a duplicate copy of hdr and returns it.
965  * Returns NULL on failure
966  */
sam_hdr_dup(SAM_hdr * hdr)967 SAM_hdr *sam_hdr_dup(SAM_hdr *hdr) {
968     if (-1 == sam_hdr_rebuild(hdr))
969 	return NULL;
970 
971     return sam_hdr_parse_(sam_hdr_str(hdr), sam_hdr_length(hdr));
972 }
973 
974 /*! Increments a reference count on hdr.
975  *
976  * This permits multiple files to share the same header, all calling
977  * sam_hdr_free when done, without causing errors for other open  files.
978  */
sam_hdr_incr_ref(SAM_hdr * hdr)979 void sam_hdr_incr_ref(SAM_hdr *hdr) {
980     hdr->ref_count++;
981 }
982 
983 /*! Increments a reference count on hdr.
984  *
985  * This permits multiple files to share the same header, all calling
986  * sam_hdr_free when done, without causing errors for other open  files.
987  *
988  * If the reference count hits zero then the header is automatically
989  * freed. This makes it a synonym for sam_hdr_free().
990  */
sam_hdr_decr_ref(SAM_hdr * hdr)991 void sam_hdr_decr_ref(SAM_hdr *hdr) {
992     sam_hdr_free(hdr);
993 }
994 
995 /*! Deallocates all storage used by a SAM_hdr struct.
996  *
997  * This also decrements the header reference count. If after decrementing
998  * it is still non-zero then the header is assumed to be in use by another
999  * caller and the free is not done.
1000  *
1001  * This is a synonym for sam_hdr_dec_ref().
1002  */
sam_hdr_free(SAM_hdr * hdr)1003 void sam_hdr_free(SAM_hdr *hdr) {
1004     if (!hdr)
1005 	return;
1006 
1007     if (--hdr->ref_count > 0)
1008 	return;
1009 
1010     if (ks_str(&hdr->text))
1011 	KS_FREE(&hdr->text);
1012 
1013     if (hdr->h)
1014 	kh_destroy(sam_hdr, hdr->h);
1015 
1016     if (hdr->ref_hash)
1017 	kh_destroy(m_s2i, hdr->ref_hash);
1018 
1019     if (hdr->ref) {
1020 	int i;
1021 	for (i = 0; i < hdr->nref; i++)
1022 	    if (hdr->ref[i].name)
1023 		free(hdr->ref[i].name);
1024 	free(hdr->ref);
1025     }
1026 
1027     if (hdr->rg_hash)
1028 	kh_destroy(m_s2i, hdr->rg_hash);
1029 
1030     if (hdr->rg) {
1031 	int i;
1032 	for (i = 0; i < hdr->nrg; i++)
1033 	    if (hdr->rg[i].name)
1034 		free(hdr->rg[i].name);
1035 	free(hdr->rg);
1036     }
1037 
1038     if (hdr->pg_hash)
1039 	kh_destroy(m_s2i, hdr->pg_hash);
1040 
1041     if (hdr->pg) {
1042 	int i;
1043 	for (i = 0; i < hdr->npg; i++)
1044 	    if (hdr->pg[i].name)
1045 		free(hdr->pg[i].name);
1046 	free(hdr->pg);
1047     }
1048 
1049     if (hdr->pg_end)
1050 	free(hdr->pg_end);
1051 
1052     if (hdr->type_pool)
1053 	pool_destroy(hdr->type_pool);
1054 
1055     if (hdr->tag_pool)
1056 	pool_destroy(hdr->tag_pool);
1057 
1058     if (hdr->str_pool)
1059 	string_pool_destroy(hdr->str_pool);
1060 
1061     free(hdr);
1062 }
1063 
sam_hdr_length(SAM_hdr * hdr)1064 int sam_hdr_length(SAM_hdr *hdr) {
1065     return ks_len(&hdr->text);
1066 }
1067 
sam_hdr_str(SAM_hdr * hdr)1068 char *sam_hdr_str(SAM_hdr *hdr) {
1069     return ks_str(&hdr->text);
1070 }
1071 
1072 /*
1073  * Looks up a reference sequence by name and returns the numerical ID.
1074  * Returns -1 if unknown reference.
1075  */
sam_hdr_name2ref(SAM_hdr * hdr,const char * ref)1076 int sam_hdr_name2ref(SAM_hdr *hdr, const char *ref) {
1077     khint_t k = kh_get(m_s2i, hdr->ref_hash, ref);
1078     return k == kh_end(hdr->ref_hash) ? -1 : kh_val(hdr->ref_hash, k);
1079 }
1080 
1081 /*
1082  * Looks up a read-group by name and returns a pointer to the start of the
1083  * associated tag list.
1084  *
1085  * Returns NULL on failure
1086  */
sam_hdr_find_rg(SAM_hdr * hdr,const char * rg)1087 SAM_RG *sam_hdr_find_rg(SAM_hdr *hdr, const char *rg) {
1088     khint_t k = kh_get(m_s2i, hdr->rg_hash, rg);
1089     return k == kh_end(hdr->rg_hash)
1090 	? NULL
1091 	: &hdr->rg[kh_val(hdr->rg_hash, k)];
1092 }
1093 
1094 
1095 /*
1096  * Fixes any PP links in @PG headers.
1097  * If the entries are in order then this doesn't need doing, but incase
1098  * our header is out of order this goes through the sh->pg[] array
1099  * setting the prev_id field.
1100  *
1101  * Note we can have multiple complete chains. This code should identify the
1102  * tails of these chains as these are the entries we have to link to in
1103  * subsequent PP records.
1104  *
1105  * Returns 0 on sucess
1106  *        -1 on failure (indicating broken PG/PP records)
1107  */
sam_hdr_link_pg(SAM_hdr * hdr)1108 int sam_hdr_link_pg(SAM_hdr *hdr) {
1109     int i, j, ret = 0;
1110 
1111     hdr->npg_end_alloc = hdr->npg;
1112     hdr->pg_end = realloc(hdr->pg_end, hdr->npg * sizeof(*hdr->pg_end));
1113     if (!hdr->pg_end)
1114 	return -1;
1115 
1116     for (i = 0; i < hdr->npg; i++)
1117 	hdr->pg_end[i] = i;
1118 
1119     for (i = 0; i < hdr->npg; i++) {
1120 	khint_t k;
1121 	SAM_hdr_tag *tag;
1122 	char tmp;
1123 
1124 	for (tag = hdr->pg[i].tag; tag; tag = tag->next) {
1125 	    if (tag->str[0] == 'P' && tag->str[1] == 'P')
1126 		break;
1127 	}
1128 	if (!tag) {
1129 	    /* Chain start points */
1130 	    continue;
1131 	}
1132 
1133 	tmp = tag->str[tag->len]; tag->str[tag->len] = 0;
1134 	k = kh_get(m_s2i, hdr->pg_hash, tag->str+3);
1135 	tag->str[tag->len] = tmp;
1136 
1137 	if (k == kh_end(hdr->pg_hash)) {
1138 	    ret = -1;
1139 	    continue;
1140 	}
1141 
1142 	hdr->pg[i].prev_id = hdr->pg[kh_val(hdr->pg_hash, k)].id;
1143 	hdr->pg_end[kh_val(hdr->pg_hash, k)] = -1;
1144     }
1145 
1146     for (i = j = 0; i < hdr->npg; i++) {
1147 	if (hdr->pg_end[i] != -1)
1148 	    hdr->pg_end[j++] = hdr->pg_end[i];
1149     }
1150     hdr->npg_end = j;
1151 
1152     return ret;
1153 }
1154 
1155 /*
1156  * Returns a unique ID from a base name.
1157  *
1158  * The value returned is valid until the next call to
1159  * this function.
1160  */
sam_hdr_PG_ID(SAM_hdr * sh,const char * name)1161 const char *sam_hdr_PG_ID(SAM_hdr *sh, const char *name) {
1162     khint_t k = kh_get(m_s2i, sh->pg_hash, name);
1163     if (k == kh_end(sh->pg_hash))
1164 	return name;
1165 
1166     do {
1167 	sprintf(sh->ID_buf, "%.1000s.%d", name, sh->ID_cnt++);
1168 	k = kh_get(m_s2i, sh->pg_hash, sh->ID_buf);
1169     } while (k != kh_end(sh->pg_hash));
1170 
1171     return sh->ID_buf;
1172 }
1173 
1174 /*
1175  * Add an @PG line.
1176  *
1177  * If we wish complete control over this use sam_hdr_add() directly. This
1178  * function uses that, but attempts to do a lot of tedious house work for
1179  * you too.
1180  *
1181  * - It will generate a suitable ID if the supplied one clashes.
1182  * - It will generate multiple @PG records if we have multiple PG chains.
1183  *
1184  * Call it as per sam_hdr_add() with a series of key,value pairs ending
1185  * in NULL.
1186  *
1187  * Returns 0 on success
1188  *        -1 on failure
1189  */
sam_hdr_add_PG(SAM_hdr * sh,const char * name,...)1190 int sam_hdr_add_PG(SAM_hdr *sh, const char *name, ...) {
1191     va_list args;
1192 
1193     if (sh->npg_end) {
1194 	/* Copy ends array to avoid us looping while modifying it */
1195 	int *end = malloc(sh->npg_end * sizeof(int));
1196 	int i, nends = sh->npg_end;
1197 
1198 	if (!end)
1199 	    return -1;
1200 
1201 	memcpy(end, sh->pg_end, nends * sizeof(*end));
1202 
1203 	for (i = 0; i < nends; i++) {
1204 	    va_start(args, name);
1205 	    if (-1 == sam_hdr_vadd(sh, "PG", args,
1206 				   "ID", sam_hdr_PG_ID(sh, name),
1207 				   "PN", name,
1208 				   "PP", sh->pg[end[i]].name,
1209 				   NULL)) {
1210 		free(end);
1211 		return  -1;
1212 	    }
1213 	    va_end(args);
1214 	}
1215 
1216 	free(end);
1217     } else {
1218 	va_start(args, name);
1219 	if (-1 == sam_hdr_vadd(sh, "PG", args,
1220 			       "ID", sam_hdr_PG_ID(sh, name),
1221 			       "PN", name,
1222 			       NULL))
1223 	    return -1;
1224 	va_end(args);
1225     }
1226 
1227     //sam_hdr_dump(sh);
1228 
1229     return 0;
1230 }
1231 
1232 /*
1233  * A function to help with construction of CL tags in @PG records.
1234  * Takes an argc, argv pair and returns a single space-separated string.
1235  * This string should be deallocated by the calling function.
1236  *
1237  * Returns malloced char * on success
1238  *         NULL on failure
1239  */
stringify_argv(int argc,char * argv[])1240 char *stringify_argv(int argc, char *argv[]) {
1241     char *str, *cp;
1242     size_t nbytes = 1;
1243     int i, j;
1244 
1245     /* Allocate */
1246     for (i = 0; i < argc; i++) {
1247 	if (i > 0) nbytes += 1;
1248 	nbytes += strlen(argv[i]);
1249     }
1250     if (!(str = malloc(nbytes)))
1251 	return NULL;
1252 
1253     /* Copy */
1254     cp = str;
1255     for (i = 0; i < argc; i++) {
1256 	if (i > 0) *cp++ = ' ';
1257 	j = 0;
1258 	while (argv[i][j]) {
1259 	    if (argv[i][j] == '\t')
1260 		*cp++ = ' ';
1261 	    else
1262 		*cp++ = argv[i][j];
1263 	    j++;
1264 	}
1265     }
1266     *cp++ = 0;
1267 
1268     return str;
1269 }
1270