1 /*
2 Copyright (c) 2013 Genome Research Ltd.
3 Author: James Bonfield <jkb@sanger.ac.uk>
4 
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
7 
8    1. Redistributions of source code must retain the above copyright notice,
9 this list of conditions and the following disclaimer.
10 
11    2. Redistributions in binary form must reproduce the above copyright notice,
12 this list of conditions and the following disclaimer in the documentation
13 and/or other materials provided with the distribution.
14 
15    3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16 Institute nor the names of its contributors may be used to endorse or promote
17 products derived from this software without specific prior written permission.
18 
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30 
31 #include <config.h>
32 
33 #include <string.h>
34 #include <assert.h>
35 
36 #include "hts_internal.h"
37 #include "cram/sam_header.h"
38 #include "cram/string_alloc.h"
39 
sam_hdr_error(char * msg,char * line,int len,int lno)40 static void sam_hdr_error(char *msg, char *line, int len, int lno) {
41     int j;
42 
43     for (j = 0; j < len && line[j] != '\n'; j++)
44         ;
45     hts_log_error("%s at line %d: \"%.*s\"", msg, lno, j, line);
46 }
47 
sam_hdr_dump(SAM_hdr * hdr)48 void sam_hdr_dump(SAM_hdr *hdr) {
49     khint_t k;
50     int i;
51 
52     printf("===DUMP===\n");
53     for (k = kh_begin(hdr->h); k != kh_end(hdr->h); k++) {
54         SAM_hdr_type *t1, *t2;
55         char c[2];
56 
57         if (!kh_exist(hdr->h, k))
58             continue;
59 
60         t1 = t2 = kh_val(hdr->h, k);
61         c[0] = kh_key(hdr->h, k)>>8;
62         c[1] = kh_key(hdr->h, k)&0xff;
63         printf("Type %.2s, count %d\n", c, t1->prev->order+1);
64 
65         do {
66             SAM_hdr_tag *tag;
67             printf(">>>%d ", t1->order);
68             for (tag = t1->tag; tag; tag=tag->next) {
69                 printf("\"%.2s\":\"%.*s\"\t",
70                        tag->str, tag->len-3, tag->str+3);
71             }
72             putchar('\n');
73             t1 = t1->next;
74         } while (t1 != t2);
75     }
76 
77     /* Dump out PG chains */
78     printf("\n@PG chains:\n");
79     for (i = 0; i < hdr->npg_end; i++) {
80         int j;
81         printf("  %d:", i);
82         for (j = hdr->pg_end[i]; j != -1; j = hdr->pg[j].prev_id) {
83             printf("%s%d(%.*s)",
84                    j == hdr->pg_end[i] ? " " : "->",
85                    j, hdr->pg[j].name_len, hdr->pg[j].name);
86         }
87         printf("\n");
88     }
89 
90     puts("===END DUMP===");
91 }
92 
93 /* Updates the hash tables in the SAM_hdr structure.
94  *
95  * Returns 0 on success;
96  *        -1 on failure
97  */
sam_hdr_update_hashes(SAM_hdr * sh,int type,SAM_hdr_type * h_type)98 static int sam_hdr_update_hashes(SAM_hdr *sh,
99                                  int type,
100                                  SAM_hdr_type *h_type) {
101     /* Add to reference hash? */
102     if ((type>>8) == 'S' && (type&0xff) == 'Q') {
103         SAM_hdr_tag *tag;
104         SAM_SQ *new_ref;
105         int nref = sh->nref;
106 
107         new_ref = realloc(sh->ref, (sh->nref+1)*sizeof(*sh->ref));
108         if (!new_ref)
109             return -1;
110         sh->ref = new_ref;
111 
112         tag = h_type->tag;
113         sh->ref[nref].name = NULL;
114         sh->ref[nref].len  = 0;
115         sh->ref[nref].ty = h_type;
116         sh->ref[nref].tag  = tag;
117 
118         while (tag) {
119             if (tag->str[0] == 'S' && tag->str[1] == 'N') {
120                 if (!(sh->ref[nref].name = malloc(tag->len)))
121                     return -1;
122                 strncpy(sh->ref[nref].name, tag->str+3, tag->len-3);
123                 sh->ref[nref].name[tag->len-3] = 0;
124             } else if (tag->str[0] == 'L' && tag->str[1] == 'N') {
125                 sh->ref[nref].len = atoi(tag->str+3);
126             }
127             tag = tag->next;
128         }
129 
130         if (sh->ref[nref].name) {
131             khint_t k;
132             int r;
133             k = kh_put(m_s2i, sh->ref_hash, sh->ref[nref].name, &r);
134             if (-1 == r) return -1;
135             kh_val(sh->ref_hash, k) = nref;
136         } else {
137             return -1; // SN should be present, according to spec.
138         }
139 
140         sh->nref++;
141     }
142 
143     /* Add to read-group hash? */
144     if ((type>>8) == 'R' && (type&0xff) == 'G') {
145         SAM_hdr_tag *tag;
146         SAM_RG *new_rg;
147         int nrg = sh->nrg;
148 
149         new_rg = realloc(sh->rg, (sh->nrg+1)*sizeof(*sh->rg));
150         if (!new_rg)
151             return -1;
152         sh->rg = new_rg;
153 
154         tag = h_type->tag;
155         sh->rg[nrg].name = NULL;
156         sh->rg[nrg].name_len = 0;
157         sh->rg[nrg].ty   = h_type;
158         sh->rg[nrg].tag  = tag;
159         sh->rg[nrg].id   = nrg;
160 
161         while (tag) {
162             if (tag->str[0] == 'I' && tag->str[1] == 'D') {
163                 if (!(sh->rg[nrg].name = malloc(tag->len)))
164                     return -1;
165                 strncpy(sh->rg[nrg].name, tag->str+3, tag->len-3);
166                 sh->rg[nrg].name[tag->len-3] = 0;
167                 sh->rg[nrg].name_len = strlen(sh->rg[nrg].name);
168             }
169             tag = tag->next;
170         }
171 
172         if (sh->rg[nrg].name) {
173             khint_t k;
174             int r;
175             k = kh_put(m_s2i, sh->rg_hash, sh->rg[nrg].name, &r);
176             if (-1 == r) return -1;
177             kh_val(sh->rg_hash, k) = nrg;
178         } else {
179             return -1; // ID should be present, according to spec.
180         }
181 
182         sh->nrg++;
183     }
184 
185     /* Add to program hash? */
186     if ((type>>8) == 'P' && (type&0xff) == 'G') {
187         SAM_hdr_tag *tag;
188         SAM_PG *new_pg;
189         int npg = sh->npg;
190 
191         new_pg = realloc(sh->pg, (sh->npg+1)*sizeof(*sh->pg));
192         if (!new_pg)
193             return -1;
194         sh->pg = new_pg;
195 
196         tag = h_type->tag;
197         sh->pg[npg].name = NULL;
198         sh->pg[npg].name_len = 0;
199         sh->pg[npg].ty  = h_type;
200         sh->pg[npg].tag  = tag;
201         sh->pg[npg].id   = npg;
202         sh->pg[npg].prev_id = -1;
203 
204         while (tag) {
205             if (tag->str[0] == 'I' && tag->str[1] == 'D') {
206                 if (!(sh->pg[npg].name = malloc(tag->len)))
207                     return -1;
208                 strncpy(sh->pg[npg].name, tag->str+3, tag->len-3);
209                 sh->pg[npg].name[tag->len-3] = 0;
210                 sh->pg[npg].name_len = strlen(sh->pg[npg].name);
211             } else if (tag->str[0] == 'P' && tag->str[1] == 'P') {
212                 // Resolve later if needed
213                 khint_t k;
214                 char tmp = tag->str[tag->len]; tag->str[tag->len] = 0;
215                 k = kh_get(m_s2i, sh->pg_hash, tag->str+3);
216                 tag->str[tag->len] = tmp;
217 
218                 if (k != kh_end(sh->pg_hash)) {
219                     int p_id = kh_val(sh->pg_hash, k);
220                     sh->pg[npg].prev_id = sh->pg[p_id].id;
221 
222                     /* Unmark previous entry as a PG termination */
223                     if (sh->npg_end > 0 &&
224                         sh->pg_end[sh->npg_end-1] == p_id) {
225                         sh->npg_end--;
226                     } else {
227                         int i;
228                         for (i = 0; i < sh->npg_end; i++) {
229                             if (sh->pg_end[i] == p_id) {
230                                 memmove(&sh->pg_end[i], &sh->pg_end[i+1],
231                                         (sh->npg_end-i-1)*sizeof(*sh->pg_end));
232                                 sh->npg_end--;
233                             }
234                         }
235                     }
236                 } else {
237                     sh->pg[npg].prev_id = -1;
238                 }
239             }
240             tag = tag->next;
241         }
242 
243         if (sh->pg[npg].name) {
244             khint_t k;
245             int r;
246             k = kh_put(m_s2i, sh->pg_hash, sh->pg[npg].name, &r);
247             if (-1 == r) return -1;
248             kh_val(sh->pg_hash, k) = npg;
249         } else {
250             return -1; // ID should be present, according to spec.
251         }
252 
253         /* Add to npg_end[] array. Remove later if we find a PP line */
254         if (sh->npg_end >= sh->npg_end_alloc) {
255             int *new_pg_end;
256             int  new_alloc = sh->npg_end_alloc ? sh->npg_end_alloc*2 : 4;
257 
258             new_pg_end = realloc(sh->pg_end, new_alloc * sizeof(int));
259             if (!new_pg_end)
260                 return -1;
261             sh->npg_end_alloc = new_alloc;
262             sh->pg_end = new_pg_end;
263         }
264         sh->pg_end[sh->npg_end++] = npg;
265 
266         sh->npg++;
267     }
268 
269     return 0;
270 }
271 
272 /*
273  * Appends a formatted line to an existing SAM header.
274  * Line is a full SAM header record, eg "@SQ\tSN:foo\tLN:100", with
275  * optional new-line. If it contains more than 1 line then multiple lines
276  * will be added in order.
277  *
278  * Input text is of maximum length len or as terminated earlier by a NUL.
279  * Len may be 0 if unknown, in which case lines must be NUL-terminated.
280  *
281  * Returns 0 on success
282  *        -1 on failure
283  */
sam_hdr_add_lines(SAM_hdr * sh,const char * lines,int len)284 int sam_hdr_add_lines(SAM_hdr *sh, const char *lines, int len) {
285     int i, lno, text_offset;
286     char *hdr;
287 
288     if (!len)
289         len = strlen(lines);
290 
291     text_offset = ks_len(&sh->text);
292     if (EOF == kputsn(lines, len, &sh->text))
293         return -1;
294     hdr = ks_str(&sh->text) + text_offset;
295 
296     for (i = 0, lno = 1; i < len && hdr[i] != '\0'; i++, lno++) {
297         khint32_t type;
298         khint_t k;
299 
300         int l_start = i, new;
301         SAM_hdr_type *h_type;
302         SAM_hdr_tag *h_tag, *last;
303 
304         if (hdr[i] != '@') {
305             int j;
306             for (j = i; j < len && hdr[j] != '\0' && hdr[j] != '\n'; j++)
307                 ;
308             sam_hdr_error("Header line does not start with '@'",
309                           &hdr[l_start], len - l_start, lno);
310             return -1;
311         }
312 
313         type = (hdr[i+1]<<8) | hdr[i+2];
314         if (hdr[i+1] < 'A' || hdr[i+1] > 'z' ||
315             hdr[i+2] < 'A' || hdr[i+2] > 'z') {
316             sam_hdr_error("Header line does not have a two character key",
317                           &hdr[l_start], len - l_start, lno);
318             return -1;
319         }
320 
321         i += 3;
322         if (hdr[i] == '\n')
323             continue;
324 
325         // Add the header line type
326         if (!(h_type = pool_alloc(sh->type_pool)))
327             return -1;
328         if (-1 == (k = kh_put(sam_hdr, sh->h, type, &new)))
329             return -1;
330 
331         // Form the ring, either with self or other lines of this type
332         if (!new) {
333             SAM_hdr_type *t = kh_val(sh->h, k), *p;
334             p = t->prev;
335 
336             assert(p->next == t);
337             p->next = h_type;
338             h_type->prev = p;
339 
340             t->prev = h_type;
341             h_type->next = t;
342             h_type->order = p->order+1;
343         } else {
344             kh_val(sh->h, k) = h_type;
345             h_type->prev = h_type->next = h_type;
346             h_type->order = 0;
347         }
348 
349         // Parse the tags on this line
350         last = NULL;
351         if ((type>>8) == 'C' && (type&0xff) == 'O') {
352             int j;
353             if (hdr[i] != '\t') {
354                 sam_hdr_error("Missing tab",
355                               &hdr[l_start], len - l_start, lno);
356                 return -1;
357             }
358 
359             for (j = ++i; j < len && hdr[j] != '\0' && hdr[j] != '\n'; j++)
360                 ;
361 
362             if (!(h_type->tag = h_tag = pool_alloc(sh->tag_pool)))
363                 return -1;
364             h_tag->str = string_ndup(sh->str_pool, &hdr[i], j-i);
365             h_tag->len = j-i;
366             h_tag->next = NULL;
367             if (!h_tag->str)
368                 return -1;
369 
370             i = j;
371 
372         } else {
373             do {
374                 int j;
375                 if (hdr[i] != '\t') {
376                     sam_hdr_error("Missing tab",
377                                   &hdr[l_start], len - l_start, lno);
378                     return -1;
379                 }
380 
381                 for (j = ++i; j < len && hdr[j] != '\0' && hdr[j] != '\n' && hdr[j] != '\t'; j++)
382                     ;
383 
384                 if (!(h_tag = pool_alloc(sh->tag_pool)))
385                     return -1;
386                 h_tag->str = string_ndup(sh->str_pool, &hdr[i], j-i);
387                 h_tag->len = j-i;
388                 h_tag->next = NULL;
389                 if (!h_tag->str)
390                     return -1;
391 
392                 if (h_tag->len < 3 || h_tag->str[2] != ':') {
393                     sam_hdr_error("Malformed key:value pair",
394                                   &hdr[l_start], len - l_start, lno);
395                     return -1;
396                 }
397 
398                 if (last)
399                     last->next = h_tag;
400                 else
401                     h_type->tag = h_tag;
402 
403                 last = h_tag;
404                 i = j;
405             } while (i < len && hdr[i] != '\0' && hdr[i] != '\n');
406         }
407 
408         /* Update RG/SQ hashes */
409         if (-1 == sam_hdr_update_hashes(sh, type, h_type))
410             return -1;
411     }
412 
413     return 0;
414 }
415 
416 /*
417  * Adds a single line to a SAM header.
418  * Specify type and one or more key,value pairs, ending with the NULL key.
419  * Eg. sam_hdr_add(h, "SQ", "ID", "foo", "LN", "100", NULL).
420  *
421  * Returns index for specific entry on success (eg 2nd SQ, 4th RG)
422  *        -1 on failure
423  */
sam_hdr_add(SAM_hdr * sh,const char * type,...)424 int sam_hdr_add(SAM_hdr *sh, const char *type, ...) {
425     va_list args;
426     va_start(args, type);
427     return sam_hdr_vadd(sh, type, args, NULL);
428 }
429 
430 /*
431  * sam_hdr_add with a va_list interface.
432  *
433  * Note: this function invokes va_arg at least once, making the value
434  * of ap indeterminate after the return.  The caller should call
435  * va_start/va_end before/after calling this function or use va_copy.
436  */
sam_hdr_vadd(SAM_hdr * sh,const char * type,va_list ap,...)437 int sam_hdr_vadd(SAM_hdr *sh, const char *type, va_list ap, ...) {
438     va_list args;
439     SAM_hdr_type *h_type;
440     SAM_hdr_tag *h_tag, *last;
441     int new;
442     khint32_t type_i = (type[0]<<8) | type[1], k;
443 
444     if (EOF == kputc_('@', &sh->text))
445         return -1;
446     if (EOF == kputsn(type, 2, &sh->text))
447         return -1;
448 
449     if (!(h_type = pool_alloc(sh->type_pool)))
450         return -1;
451     if (-1 == (k = kh_put(sam_hdr, sh->h, type_i, &new)))
452         return -1;
453 
454     // Form the ring, either with self or other lines of this type
455     if (!new) {
456         SAM_hdr_type *t = kh_val(sh->h, k), *p;
457         p = t->prev;
458 
459         assert(p->next == t);
460         p->next = h_type;
461         h_type->prev = p;
462 
463         t->prev = h_type;
464         h_type->next = t;
465         h_type->order = p->order + 1;
466     } else {
467         kh_val(sh->h, k) = h_type;
468         h_type->prev = h_type->next = h_type;
469         h_type->order = 0;
470     }
471 
472     last = NULL;
473 
474     // Any ... varargs
475     va_start(args, ap);
476     for (;;) {
477         char *k, *v;
478         int idx;
479 
480         if (!(k = (char *)va_arg(args, char *)))
481             break;
482         v = va_arg(args, char *);
483 
484         if (EOF == kputc_('\t', &sh->text))
485             return -1;
486 
487         if (!(h_tag = pool_alloc(sh->tag_pool)))
488             return -1;
489         idx = ks_len(&sh->text);
490 
491         if (EOF == kputs(k, &sh->text))
492             return -1;
493         if (EOF == kputc_(':', &sh->text))
494             return -1;
495         if (EOF == kputs(v, &sh->text))
496             return -1;
497 
498         h_tag->len = ks_len(&sh->text) - idx;
499         h_tag->str = string_ndup(sh->str_pool,
500                                  ks_str(&sh->text) + idx,
501                                  h_tag->len);
502         h_tag->next = NULL;
503         if (!h_tag->str)
504             return -1;
505 
506         if (last)
507             last->next = h_tag;
508         else
509             h_type->tag = h_tag;
510 
511         last = h_tag;
512     }
513     va_end(args);
514 
515     // Plus the specified va_list params
516     for (;;) {
517         char *k, *v;
518         int idx;
519 
520         if (!(k = (char *)va_arg(ap, char *)))
521             break;
522         v = va_arg(ap, char *);
523 
524         if (EOF == kputc_('\t', &sh->text))
525             return -1;
526 
527         if (!(h_tag = pool_alloc(sh->tag_pool)))
528             return -1;
529         idx = ks_len(&sh->text);
530 
531         if (EOF == kputs(k, &sh->text))
532             return -1;
533         if (EOF == kputc_(':', &sh->text))
534             return -1;
535         if (EOF == kputs(v, &sh->text))
536             return -1;
537 
538         h_tag->len = ks_len(&sh->text) - idx;
539         h_tag->str = string_ndup(sh->str_pool,
540                                  ks_str(&sh->text) + idx,
541                                  h_tag->len);
542         h_tag->next = NULL;
543         if (!h_tag->str)
544             return -1;
545 
546         if (last)
547             last->next = h_tag;
548         else
549             h_type->tag = h_tag;
550 
551         last = h_tag;
552     }
553     va_end(ap);
554 
555     if (EOF == kputc('\n', &sh->text))
556         return -1;
557 
558     int itype = (type[0]<<8) | type[1];
559     if (-1 == sam_hdr_update_hashes(sh, itype, h_type))
560         return -1;
561 
562     return h_type->order;
563 }
564 
565 /*
566  * Returns the first header item matching 'type'. If ID is non-NULL it checks
567  * for the tag ID: and compares against the specified ID.
568  *
569  * Returns NULL if no type/ID is found
570  */
sam_hdr_find(SAM_hdr * hdr,char * type,char * ID_key,char * ID_value)571 SAM_hdr_type *sam_hdr_find(SAM_hdr *hdr, char *type,
572                            char *ID_key, char *ID_value) {
573     SAM_hdr_type *t1, *t2;
574     int itype = (type[0]<<8)|(type[1]);
575     khint_t k;
576 
577     /* Special case for types we have prebuilt hashes on */
578     if (ID_key) {
579         if (type[0]   == 'S' && type[1]   == 'Q' &&
580             ID_key[0] == 'S' && ID_key[1] == 'N') {
581             k = kh_get(m_s2i, hdr->ref_hash, ID_value);
582             return k != kh_end(hdr->ref_hash)
583                 ? hdr->ref[kh_val(hdr->ref_hash, k)].ty
584                 : NULL;
585         }
586 
587         if (type[0]   == 'R' && type[1]   == 'G' &&
588             ID_key[0] == 'I' && ID_key[1] == 'D') {
589             k = kh_get(m_s2i, hdr->rg_hash, ID_value);
590             return k != kh_end(hdr->rg_hash)
591                 ? hdr->rg[kh_val(hdr->rg_hash, k)].ty
592                 : NULL;
593         }
594 
595         if (type[0]   == 'P' && type[1]   == 'G' &&
596             ID_key[0] == 'I' && ID_key[1] == 'D') {
597             k = kh_get(m_s2i, hdr->pg_hash, ID_value);
598             return k != kh_end(hdr->pg_hash)
599                 ? hdr->pg[kh_val(hdr->pg_hash, k)].ty
600                 : NULL;
601         }
602     }
603 
604     k = kh_get(sam_hdr, hdr->h, itype);
605     if (k == kh_end(hdr->h))
606         return NULL;
607 
608     if (!ID_key)
609         return kh_val(hdr->h, k);
610 
611     t1 = t2 = kh_val(hdr->h, k);
612     do {
613         SAM_hdr_tag *tag;
614         for (tag = t1->tag; tag; tag = tag->next) {
615             if (tag->str[0] == ID_key[0] && tag->str[1] == ID_key[1]) {
616                 char *cp1 = tag->str+3;
617                 char *cp2 = ID_value;
618                 while (*cp1 && *cp1 == *cp2)
619                     cp1++, cp2++;
620                 if (*cp2 || *cp1)
621                     continue;
622                 return t1;
623             }
624         }
625         t1 = t1->next;
626     } while (t1 != t2);
627 
628     return NULL;
629 }
630 
631 /*
632  * As per SAM_hdr_type, but returns a complete line of formatted text
633  * for a specific head type/ID combination. If ID is NULL then it returns
634  * the first line of the specified type.
635  *
636  * The returned string is malloced and should be freed by the calling
637  * function with free().
638  *
639  * Returns NULL if no type/ID is found.
640  */
sam_hdr_find_line(SAM_hdr * hdr,char * type,char * ID_key,char * ID_value)641 char *sam_hdr_find_line(SAM_hdr *hdr, char *type,
642                         char *ID_key, char *ID_value) {
643     SAM_hdr_type *ty = sam_hdr_find(hdr, type, ID_key, ID_value);
644     kstring_t ks = KS_INITIALIZER;
645     SAM_hdr_tag *tag;
646     int r = 0;
647 
648     if (!ty)
649         return NULL;
650 
651     // Paste together the line from the hashed copy
652     r |= (kputc_('@', &ks) == EOF);
653     r |= (kputs(type, &ks) == EOF);
654     for (tag = ty->tag; tag; tag = tag->next) {
655         r |= (kputc_('\t', &ks) == EOF);
656         r |= (kputsn(tag->str, tag->len, &ks) == EOF);
657     }
658 
659     if (r) {
660         KS_FREE(&ks);
661         return NULL;
662     }
663 
664     return ks_str(&ks);
665 }
666 
667 
668 /*
669  * Looks for a specific key in a single sam header line.
670  * If prev is non-NULL it also fills this out with the previous tag, to
671  * permit use in key removal. *prev is set to NULL when the tag is the first
672  * key in the list. When a tag isn't found, prev (if non NULL) will be the last
673  * tag in the existing list.
674  *
675  * Returns the tag pointer on success
676  *         NULL on failure
677  */
sam_hdr_find_key(SAM_hdr * sh,SAM_hdr_type * type,char * key,SAM_hdr_tag ** prev)678 SAM_hdr_tag *sam_hdr_find_key(SAM_hdr *sh,
679                               SAM_hdr_type *type,
680                               char *key,
681                               SAM_hdr_tag **prev) {
682     SAM_hdr_tag *tag, *p = NULL;
683 
684     for (tag = type->tag; tag; p = tag, tag = tag->next) {
685         if (tag->str[0] == key[0] && tag->str[1] == key[1]) {
686             if (prev)
687                 *prev = p;
688             return tag;
689         }
690     }
691 
692     if (prev)
693         *prev = p;
694 
695     return NULL;
696 }
697 
698 
699 /*
700  * Adds or updates tag key,value pairs in a header line.
701  * Eg for adding M5 tags to @SQ lines or updating sort order for the
702  * @HD line (although use the sam_hdr_sort_order() function for
703  * HD manipulation, which is a wrapper around this funuction).
704  *
705  * Specify multiple key,value pairs ending in NULL.
706  *
707  * Returns 0 on success
708  *        -1 on failure
709  */
sam_hdr_update(SAM_hdr * hdr,SAM_hdr_type * type,...)710 int sam_hdr_update(SAM_hdr *hdr, SAM_hdr_type *type, ...) {
711     va_list ap;
712 
713     va_start(ap, type);
714 
715     for (;;) {
716         char *k, *v;
717         int idx;
718         SAM_hdr_tag *tag, *prev;
719 
720         if (!(k = (char *)va_arg(ap, char *)))
721             break;
722         v = va_arg(ap, char *);
723 
724         tag = sam_hdr_find_key(hdr, type, k, &prev);
725         if (!tag) {
726             if (!(tag = pool_alloc(hdr->tag_pool)))
727                 return -1;
728             if (prev)
729                 prev->next = tag;
730             else
731                 type->tag = tag;
732 
733             tag->next = NULL;
734         }
735 
736         idx = ks_len(&hdr->text);
737         if (ksprintf(&hdr->text, "%2.2s:%s", k, v) < 0)
738             return -1;
739         tag->len = ks_len(&hdr->text) - idx;
740         tag->str = string_ndup(hdr->str_pool,
741                                ks_str(&hdr->text) + idx,
742                                tag->len);
743         if (!tag->str)
744             return -1;
745     }
746 
747     va_end(ap);
748 
749     return 0;
750 }
751 
752 #define K(a) (((a)[0]<<8)|((a)[1]))
753 
754 /*
755  * Returns the sort order:
756  */
sam_hdr_sort_order(SAM_hdr * hdr)757 enum sam_sort_order sam_hdr_sort_order(SAM_hdr *hdr) {
758     return hdr->sort_order;
759 }
760 
sam_hdr_parse_sort_order(SAM_hdr * hdr)761 static enum sam_sort_order sam_hdr_parse_sort_order(SAM_hdr *hdr) {
762     khint_t k;
763     enum sam_sort_order so;
764 
765     so = ORDER_UNKNOWN;
766     k = kh_get(sam_hdr, hdr->h, K("HD"));
767     if (k != kh_end(hdr->h)) {
768         SAM_hdr_type *ty = kh_val(hdr->h, k);
769         SAM_hdr_tag *tag;
770         for (tag = ty->tag; tag; tag = tag->next) {
771             if (tag->str[0] == 'S' && tag->str[1] == 'O') {
772                 if (strcmp(tag->str+3, "unsorted") == 0)
773                     so = ORDER_UNSORTED;
774                 else if (strcmp(tag->str+3, "queryname") == 0)
775                     so = ORDER_NAME;
776                 else if (strcmp(tag->str+3, "coordinate") == 0)
777                     so = ORDER_COORD;
778                 else if (strcmp(tag->str+3, "unknown") != 0)
779                     hts_log_error("Unknown sort order field: %s", tag->str+3);
780             }
781         }
782     }
783 
784     return so;
785 }
786 
787 
788 /*
789  * Reconstructs the kstring from the header hash table.
790  * Returns 0 on success
791  *        -1 on failure
792  */
sam_hdr_rebuild(SAM_hdr * hdr)793 int sam_hdr_rebuild(SAM_hdr *hdr) {
794     /* Order: HD then others */
795     kstring_t ks = KS_INITIALIZER;
796     khint_t k;
797 
798 
799     k = kh_get(sam_hdr, hdr->h, K("HD"));
800     if (k != kh_end(hdr->h)) {
801         SAM_hdr_type *ty = kh_val(hdr->h, k);
802         SAM_hdr_tag *tag;
803         if (EOF == kputs("@HD", &ks))
804             return -1;
805         for (tag = ty->tag; tag; tag = tag->next) {
806             if (EOF == kputc_('\t', &ks))
807                 return -1;
808             if (EOF == kputsn_(tag->str, tag->len, &ks))
809                 return -1;
810         }
811         if (EOF == kputc('\n', &ks))
812             return -1;
813     }
814 
815     for (k = kh_begin(hdr->h); k != kh_end(hdr->h); k++) {
816         SAM_hdr_type *t1, *t2;
817 
818         if (!kh_exist(hdr->h, k))
819             continue;
820 
821         if (kh_key(hdr->h, k) == K("HD"))
822             continue;
823 
824         t1 = t2 = kh_val(hdr->h, k);
825         do {
826             SAM_hdr_tag *tag;
827             char c[2];
828 
829             if (EOF == kputc_('@', &ks))
830                 return -1;
831             c[0] = kh_key(hdr->h, k)>>8;
832             c[1] = kh_key(hdr->h, k)&0xff;
833             if (EOF == kputsn_(c, 2, &ks))
834                 return -1;
835             for (tag = t1->tag; tag; tag=tag->next) {
836                 if (EOF == kputc_('\t', &ks))
837                     return -1;
838                 if (EOF == kputsn_(tag->str, tag->len, &ks))
839                     return -1;
840             }
841             if (EOF == kputc('\n', &ks))
842                 return -1;
843             t1 = t1->next;
844         } while (t1 != t2);
845     }
846 
847     if (ks_str(&hdr->text))
848         KS_FREE(&hdr->text);
849 
850     hdr->text = ks;
851 
852     return 0;
853 }
854 
855 
856 /*
857  * Creates an empty SAM header, ready to be populated.
858  *
859  * Returns a SAM_hdr struct on success (free with sam_hdr_free())
860  *         NULL on failure
861  */
sam_hdr_new()862 SAM_hdr *sam_hdr_new() {
863     SAM_hdr *sh = calloc(1, sizeof(*sh));
864 
865     if (!sh)
866         return NULL;
867 
868     sh->h = kh_init(sam_hdr);
869     if (!sh->h)
870         goto err;
871 
872     sh->ID_cnt = 1;
873     sh->ref_count = 1;
874 
875     sh->nref = 0;
876     sh->ref  = NULL;
877     if (!(sh->ref_hash = kh_init(m_s2i)))
878         goto err;
879 
880     sh->nrg = 0;
881     sh->rg  = NULL;
882     if (!(sh->rg_hash = kh_init(m_s2i)))
883         goto err;
884 
885     sh->npg = 0;
886     sh->pg  = NULL;
887     sh->npg_end = sh->npg_end_alloc = 0;
888     sh->pg_end = NULL;
889     if (!(sh->pg_hash = kh_init(m_s2i)))
890         goto err;
891 
892     KS_INIT(&sh->text);
893 
894     if (!(sh->tag_pool = pool_create(sizeof(SAM_hdr_tag))))
895         goto err;
896 
897     if (!(sh->type_pool = pool_create(sizeof(SAM_hdr_type))))
898         goto err;
899 
900     if (!(sh->str_pool = string_pool_create(8192)))
901         goto err;
902 
903     return sh;
904 
905  err:
906     if (sh->h)
907         kh_destroy(sam_hdr, sh->h);
908 
909     if (sh->tag_pool)
910         pool_destroy(sh->tag_pool);
911 
912     if (sh->type_pool)
913         pool_destroy(sh->type_pool);
914 
915     if (sh->str_pool)
916         string_pool_destroy(sh->str_pool);
917 
918     free(sh);
919 
920     return NULL;
921 }
922 
923 
924 /*
925  * Tokenises a SAM header into a hash table.
926  * Also extracts a few bits on specific data types, such as @RG lines.
927  *
928  * Returns a SAM_hdr struct on success (free with sam_hdr_free())
929  *         NULL on failure
930  */
sam_hdr_parse_(const char * hdr,int len)931 SAM_hdr *sam_hdr_parse_(const char *hdr, int len) {
932     /* Make an empty SAM_hdr */
933     SAM_hdr *sh;
934 
935     sh = sam_hdr_new();
936     if (NULL == sh) return NULL;
937 
938     if (NULL == hdr) return sh; // empty header is permitted
939 
940     /* Parse the header, line by line */
941     if (-1 == sam_hdr_add_lines(sh, hdr, len)) {
942         sam_hdr_free(sh);
943         return NULL;
944     }
945 
946     /* Obtain sort order */
947     sh->sort_order = sam_hdr_parse_sort_order(sh);
948 
949     //sam_hdr_dump(sh);
950     //sam_hdr_add(sh, "RG", "ID", "foo", "SM", "bar", NULL);
951     //sam_hdr_rebuild(sh);
952     //printf(">>%s<<", ks_str(sh->text));
953 
954     //parse_references(sh);
955     //parse_read_groups(sh);
956 
957     sam_hdr_link_pg(sh);
958     //sam_hdr_dump(sh);
959 
960     return sh;
961 }
962 
963 /*
964  * Produces a duplicate copy of hdr and returns it.
965  * Returns NULL on failure
966  */
sam_hdr_dup(SAM_hdr * hdr)967 SAM_hdr *sam_hdr_dup(SAM_hdr *hdr) {
968     if (-1 == sam_hdr_rebuild(hdr))
969         return NULL;
970 
971     return sam_hdr_parse_(sam_hdr_str(hdr), sam_hdr_length(hdr));
972 }
973 
974 /*! Increments a reference count on hdr.
975  *
976  * This permits multiple files to share the same header, all calling
977  * sam_hdr_free when done, without causing errors for other open  files.
978  */
sam_hdr_incr_ref(SAM_hdr * hdr)979 void sam_hdr_incr_ref(SAM_hdr *hdr) {
980     hdr->ref_count++;
981 }
982 
983 /*! Increments a reference count on hdr.
984  *
985  * This permits multiple files to share the same header, all calling
986  * sam_hdr_free when done, without causing errors for other open  files.
987  *
988  * If the reference count hits zero then the header is automatically
989  * freed. This makes it a synonym for sam_hdr_free().
990  */
sam_hdr_decr_ref(SAM_hdr * hdr)991 void sam_hdr_decr_ref(SAM_hdr *hdr) {
992     sam_hdr_free(hdr);
993 }
994 
995 /*! Deallocates all storage used by a SAM_hdr struct.
996  *
997  * This also decrements the header reference count. If after decrementing
998  * it is still non-zero then the header is assumed to be in use by another
999  * caller and the free is not done.
1000  *
1001  * This is a synonym for sam_hdr_dec_ref().
1002  */
sam_hdr_free(SAM_hdr * hdr)1003 void sam_hdr_free(SAM_hdr *hdr) {
1004     if (!hdr)
1005         return;
1006 
1007     if (--hdr->ref_count > 0)
1008         return;
1009 
1010     if (ks_str(&hdr->text))
1011         KS_FREE(&hdr->text);
1012 
1013     if (hdr->h)
1014         kh_destroy(sam_hdr, hdr->h);
1015 
1016     if (hdr->ref_hash)
1017         kh_destroy(m_s2i, hdr->ref_hash);
1018 
1019     if (hdr->ref) {
1020         int i;
1021         for (i = 0; i < hdr->nref; i++)
1022             if (hdr->ref[i].name)
1023                 free(hdr->ref[i].name);
1024         free(hdr->ref);
1025     }
1026 
1027     if (hdr->rg_hash)
1028         kh_destroy(m_s2i, hdr->rg_hash);
1029 
1030     if (hdr->rg) {
1031         int i;
1032         for (i = 0; i < hdr->nrg; i++)
1033             if (hdr->rg[i].name)
1034                 free(hdr->rg[i].name);
1035         free(hdr->rg);
1036     }
1037 
1038     if (hdr->pg_hash)
1039         kh_destroy(m_s2i, hdr->pg_hash);
1040 
1041     if (hdr->pg) {
1042         int i;
1043         for (i = 0; i < hdr->npg; i++)
1044             if (hdr->pg[i].name)
1045                 free(hdr->pg[i].name);
1046         free(hdr->pg);
1047     }
1048 
1049     if (hdr->pg_end)
1050         free(hdr->pg_end);
1051 
1052     if (hdr->type_pool)
1053         pool_destroy(hdr->type_pool);
1054 
1055     if (hdr->tag_pool)
1056         pool_destroy(hdr->tag_pool);
1057 
1058     if (hdr->str_pool)
1059         string_pool_destroy(hdr->str_pool);
1060 
1061     free(hdr);
1062 }
1063 
sam_hdr_length(SAM_hdr * hdr)1064 int sam_hdr_length(SAM_hdr *hdr) {
1065     return ks_len(&hdr->text);
1066 }
1067 
sam_hdr_str(SAM_hdr * hdr)1068 char *sam_hdr_str(SAM_hdr *hdr) {
1069     return ks_str(&hdr->text);
1070 }
1071 
1072 /*
1073  * Looks up a reference sequence by name and returns the numerical ID.
1074  * Returns -1 if unknown reference.
1075  */
sam_hdr_name2ref(SAM_hdr * hdr,const char * ref)1076 int sam_hdr_name2ref(SAM_hdr *hdr, const char *ref) {
1077     khint_t k = kh_get(m_s2i, hdr->ref_hash, ref);
1078     return k == kh_end(hdr->ref_hash) ? -1 : kh_val(hdr->ref_hash, k);
1079 }
1080 
1081 /*
1082  * Looks up a read-group by name and returns a pointer to the start of the
1083  * associated tag list.
1084  *
1085  * Returns NULL on failure
1086  */
sam_hdr_find_rg(SAM_hdr * hdr,const char * rg)1087 SAM_RG *sam_hdr_find_rg(SAM_hdr *hdr, const char *rg) {
1088     khint_t k = kh_get(m_s2i, hdr->rg_hash, rg);
1089     return k == kh_end(hdr->rg_hash)
1090         ? NULL
1091         : &hdr->rg[kh_val(hdr->rg_hash, k)];
1092 }
1093 
1094 
1095 /*
1096  * Fixes any PP links in @PG headers.
1097  * If the entries are in order then this doesn't need doing, but incase
1098  * our header is out of order this goes through the sh->pg[] array
1099  * setting the prev_id field.
1100  *
1101  * Note we can have multiple complete chains. This code should identify the
1102  * tails of these chains as these are the entries we have to link to in
1103  * subsequent PP records.
1104  *
1105  * Returns 0 on sucess
1106  *        -1 on failure (indicating broken PG/PP records)
1107  */
sam_hdr_link_pg(SAM_hdr * hdr)1108 int sam_hdr_link_pg(SAM_hdr *hdr) {
1109     int i, j, ret = 0;
1110 
1111     hdr->npg_end_alloc = hdr->npg;
1112     hdr->pg_end = realloc(hdr->pg_end, hdr->npg * sizeof(*hdr->pg_end));
1113     if (!hdr->pg_end)
1114         return -1;
1115 
1116     for (i = 0; i < hdr->npg; i++)
1117         hdr->pg_end[i] = i;
1118 
1119     for (i = 0; i < hdr->npg; i++) {
1120         khint_t k;
1121         SAM_hdr_tag *tag;
1122         char tmp;
1123 
1124         for (tag = hdr->pg[i].tag; tag; tag = tag->next) {
1125             if (tag->str[0] == 'P' && tag->str[1] == 'P')
1126                 break;
1127         }
1128         if (!tag) {
1129             /* Chain start points */
1130             continue;
1131         }
1132 
1133         tmp = tag->str[tag->len]; tag->str[tag->len] = 0;
1134         k = kh_get(m_s2i, hdr->pg_hash, tag->str+3);
1135         tag->str[tag->len] = tmp;
1136 
1137         if (k == kh_end(hdr->pg_hash)) {
1138             ret = -1;
1139             continue;
1140         }
1141 
1142         hdr->pg[i].prev_id = hdr->pg[kh_val(hdr->pg_hash, k)].id;
1143         hdr->pg_end[kh_val(hdr->pg_hash, k)] = -1;
1144     }
1145 
1146     for (i = j = 0; i < hdr->npg; i++) {
1147         if (hdr->pg_end[i] != -1)
1148             hdr->pg_end[j++] = hdr->pg_end[i];
1149     }
1150     hdr->npg_end = j;
1151 
1152     return ret;
1153 }
1154 
1155 /*
1156  * Returns a unique ID from a base name.
1157  *
1158  * The value returned is valid until the next call to
1159  * this function.
1160  */
sam_hdr_PG_ID(SAM_hdr * sh,const char * name)1161 const char *sam_hdr_PG_ID(SAM_hdr *sh, const char *name) {
1162     khint_t k = kh_get(m_s2i, sh->pg_hash, name);
1163     if (k == kh_end(sh->pg_hash))
1164         return name;
1165 
1166     do {
1167         sprintf(sh->ID_buf, "%.1000s.%d", name, sh->ID_cnt++);
1168         k = kh_get(m_s2i, sh->pg_hash, sh->ID_buf);
1169     } while (k != kh_end(sh->pg_hash));
1170 
1171     return sh->ID_buf;
1172 }
1173 
1174 /*
1175  * Add an @PG line.
1176  *
1177  * If we wish complete control over this use sam_hdr_add() directly. This
1178  * function uses that, but attempts to do a lot of tedious house work for
1179  * you too.
1180  *
1181  * - It will generate a suitable ID if the supplied one clashes.
1182  * - It will generate multiple @PG records if we have multiple PG chains.
1183  *
1184  * Call it as per sam_hdr_add() with a series of key,value pairs ending
1185  * in NULL.
1186  *
1187  * Returns 0 on success
1188  *        -1 on failure
1189  */
sam_hdr_add_PG(SAM_hdr * sh,const char * name,...)1190 int sam_hdr_add_PG(SAM_hdr *sh, const char *name, ...) {
1191     va_list args;
1192 
1193     if (sh->npg_end) {
1194         /* Copy ends array to avoid us looping while modifying it */
1195         int *end = malloc(sh->npg_end * sizeof(int));
1196         int i, nends = sh->npg_end;
1197 
1198         if (!end)
1199             return -1;
1200 
1201         memcpy(end, sh->pg_end, nends * sizeof(*end));
1202 
1203         for (i = 0; i < nends; i++) {
1204             va_start(args, name);
1205             if (-1 == sam_hdr_vadd(sh, "PG", args,
1206                                    "ID", sam_hdr_PG_ID(sh, name),
1207                                    "PN", name,
1208                                    "PP", sh->pg[end[i]].name,
1209                                    NULL)) {
1210                 free(end);
1211                 return  -1;
1212             }
1213             va_end(args);
1214         }
1215 
1216         free(end);
1217     } else {
1218         va_start(args, name);
1219         if (-1 == sam_hdr_vadd(sh, "PG", args,
1220                                "ID", sam_hdr_PG_ID(sh, name),
1221                                "PN", name,
1222                                NULL))
1223             return -1;
1224         va_end(args);
1225     }
1226 
1227     //sam_hdr_dump(sh);
1228 
1229     return 0;
1230 }
1231 
1232 /*
1233  * A function to help with construction of CL tags in @PG records.
1234  * Takes an argc, argv pair and returns a single space-separated string.
1235  * This string should be deallocated by the calling function.
1236  *
1237  * Returns malloced char * on success
1238  *         NULL on failure
1239  */
stringify_argv(int argc,char * argv[])1240 char *stringify_argv(int argc, char *argv[]) {
1241     char *str, *cp;
1242     size_t nbytes = 1;
1243     int i, j;
1244 
1245     /* Allocate */
1246     for (i = 0; i < argc; i++) {
1247         if (i > 0) nbytes += 1;
1248         nbytes += strlen(argv[i]);
1249     }
1250     if (!(str = malloc(nbytes)))
1251         return NULL;
1252 
1253     /* Copy */
1254     cp = str;
1255     for (i = 0; i < argc; i++) {
1256         if (i > 0) *cp++ = ' ';
1257         j = 0;
1258         while (argv[i][j]) {
1259             if (argv[i][j] == '\t')
1260                 *cp++ = ' ';
1261             else
1262                 *cp++ = argv[i][j];
1263             j++;
1264         }
1265     }
1266     *cp++ = 0;
1267 
1268     return str;
1269 }
1270