1 /*
2 Copyright (c) 2018-2020 Genome Research Ltd.
3 Authors: James Bonfield <jkb@sanger.ac.uk>, Valeriu Ohan <vo2@sanger.ac.uk>
4 
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
7 
8    1. Redistributions of source code must retain the above copyright notice,
9 this list of conditions and the following disclaimer.
10 
11    2. Redistributions in binary form must reproduce the above copyright notice,
12 this list of conditions and the following disclaimer in the documentation
13 and/or other materials provided with the distribution.
14 
15    3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16 Institute nor the names of its contributors may be used to endorse or promote
17 products derived from this software without specific prior written permission.
18 
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30 
31 #define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
32 #include <config.h>
33 
34 #include <string.h>
35 #include <assert.h>
36 #include <errno.h>
37 #include "textutils_internal.h"
38 #include "header.h"
39 
40 // Hash table for removing multiple lines from the header
41 KHASH_SET_INIT_STR(rm)
42 // Used for long refs in SAM files
43 KHASH_DECLARE(s2i, kh_cstr_t, int64_t)
44 
45 typedef khash_t(rm) rmhash_t;
46 
47 static int sam_hdr_link_pg(sam_hdr_t *bh);
48 
49 static int sam_hrecs_vupdate(sam_hrecs_t *hrecs, sam_hrec_type_t *type, va_list ap);
50 static int sam_hrecs_update(sam_hrecs_t *hrecs, sam_hrec_type_t *type, ...);
51 
52 
53 #define MAX_ERROR_QUOTE 320 // Prevent over-long error messages
sam_hrecs_error(const char * msg,const char * line,size_t len,size_t lno)54 static void sam_hrecs_error(const char *msg, const char *line, size_t len, size_t lno) {
55     int j;
56 
57     if (len > MAX_ERROR_QUOTE)
58         len = MAX_ERROR_QUOTE;
59     for (j = 0; j < len && line[j] != '\n'; j++)
60         ;
61     hts_log_error("%s at line %zd: \"%.*s\"", msg, lno, j, line);
62 }
63 
64 /* ==== Static methods ==== */
65 
sam_hrecs_init_type_order(sam_hrecs_t * hrecs,char * type_list)66 static int sam_hrecs_init_type_order(sam_hrecs_t *hrecs, char *type_list) {
67     if (!hrecs)
68         return -1;
69 
70     if (!type_list) {
71         hrecs->type_count = 5;
72         hrecs->type_order = calloc(hrecs->type_count, 3);
73         if (!hrecs->type_order)
74             return -1;
75         memcpy(hrecs->type_order[0], "HD", 2);
76         memcpy(hrecs->type_order[1], "SQ", 2);
77         memcpy(hrecs->type_order[2], "RG", 2);
78         memcpy(hrecs->type_order[3], "PG", 2);
79         memcpy(hrecs->type_order[4], "CO", 2);
80     }
81 
82     return 0;
83 }
84 
sam_hrecs_add_ref_altnames(sam_hrecs_t * hrecs,int nref,const char * list)85 static int sam_hrecs_add_ref_altnames(sam_hrecs_t *hrecs, int nref, const char *list) {
86     const char *token;
87     ks_tokaux_t aux;
88 
89     if (!list)
90         return 0;
91 
92     for (token = kstrtok(list, ",", &aux); token; token = kstrtok(NULL, NULL, &aux)) {
93         if (aux.p == token)
94             continue;
95 
96         char *name = string_ndup(hrecs->str_pool, token, aux.p - token);
97         if (!name)
98             return -1;
99         int r;
100         khint_t k = kh_put(m_s2i, hrecs->ref_hash, name, &r);
101         if (r < 0) return -1;
102 
103         if (r > 0)
104             kh_val(hrecs->ref_hash, k) = nref;
105         else if (kh_val(hrecs->ref_hash, k) != nref)
106             hts_log_warning("Duplicate entry AN:\"%s\" in sam header", name);
107     }
108 
109     return 0;
110 }
111 
sam_hrecs_remove_ref_altnames(sam_hrecs_t * hrecs,int expected,const char * list)112 static void sam_hrecs_remove_ref_altnames(sam_hrecs_t *hrecs, int expected, const char *list) {
113     const char *token, *sn;
114     ks_tokaux_t aux;
115     kstring_t str = KS_INITIALIZE;
116 
117     if (expected < 0 || expected >= hrecs->nref)
118         return;
119     sn = hrecs->ref[expected].name;
120 
121     for (token = kstrtok(list, ",", &aux); token; token = kstrtok(NULL, NULL, &aux)) {
122         kputsn(token, aux.p - token, ks_clear(&str));
123         khint_t k = kh_get(m_s2i, hrecs->ref_hash, str.s);
124         if (k != kh_end(hrecs->ref_hash)
125             && kh_val(hrecs->ref_hash, k) == expected
126             && strcmp(sn, str.s) != 0)
127             kh_del(m_s2i, hrecs->ref_hash, k);
128     }
129 
130     free(str.s);
131 }
132 
133 /* Updates the hash tables in the sam_hrecs_t structure.
134  *
135  * Returns 0 on success;
136  *        -1 on failure
137  */
sam_hrecs_update_hashes(sam_hrecs_t * hrecs,khint32_t type,sam_hrec_type_t * h_type)138 static int sam_hrecs_update_hashes(sam_hrecs_t *hrecs,
139                                    khint32_t type,
140                                    sam_hrec_type_t *h_type) {
141     /* Add to reference hash? */
142     if (type == TYPEKEY("SQ")) {
143         sam_hrec_tag_t *tag = h_type->tag;
144         int nref = hrecs->nref;
145         const char *name = NULL;
146         const char *altnames = NULL;
147         hts_pos_t len = -1;
148         int r;
149         khint_t k;
150 
151         while (tag) {
152             if (tag->str[0] == 'S' && tag->str[1] == 'N') {
153                 assert(tag->len >= 3);
154                 name = tag->str+3;
155             } else if (tag->str[0] == 'L' && tag->str[1] == 'N') {
156                 assert(tag->len >= 3);
157                 len = strtoll(tag->str+3, NULL, 10);
158             } else if (tag->str[0] == 'A' && tag->str[1] == 'N') {
159                 assert(tag->len >= 3);
160                 altnames = tag->str+3;
161             }
162             tag = tag->next;
163         }
164 
165         if (!name) {
166             hts_log_error("Header includes @SQ line with no SN: tag");
167             return -1; // SN should be present, according to spec.
168         }
169 
170         if (len == -1) {
171             hts_log_error("Header includes @SQ line \"%s\" with no LN: tag",
172                           name);
173             return -1; // LN should be present, according to spec.
174         }
175 
176         // Seen already?
177         k = kh_get(m_s2i, hrecs->ref_hash, name);
178         if (k < kh_end(hrecs->ref_hash)) {
179             nref = kh_val(hrecs->ref_hash, k);
180             int ref_changed_flag = 0;
181 
182             // Check for hash entry added by sam_hrecs_refs_from_targets_array()
183             if (hrecs->ref[nref].ty == NULL) {
184                 // Attach header line to existing stub entry.
185                 hrecs->ref[nref].ty = h_type;
186                 // Check lengths match; correct if not.
187                 if (len != hrecs->ref[nref].len) {
188                     char tmp[32];
189                     snprintf(tmp, sizeof(tmp), "%" PRIhts_pos,
190                              hrecs->ref[nref].len);
191                     if (sam_hrecs_update(hrecs, h_type, "LN", tmp, NULL) < 0)
192                         return -1;
193                     ref_changed_flag = 1;
194                 }
195                 if (sam_hrecs_add_ref_altnames(hrecs, nref, altnames) < 0)
196                     return -1;
197 
198                 if (ref_changed_flag && (hrecs->refs_changed < 0 || hrecs->refs_changed > nref))
199                     hrecs->refs_changed = nref;
200                 return 0;
201             }
202 
203             // Check to see if an existing entry is being updated
204             if (hrecs->ref[nref].ty == h_type) {
205                 if (hrecs->ref[nref].len != len) {
206                     hrecs->ref[nref].len = len;
207                     ref_changed_flag = 1;
208                 }
209                 if (!hrecs->ref[nref].name || strcmp(hrecs->ref[nref].name, name)) {
210                     hrecs->ref[nref].name = name;
211                     ref_changed_flag = 1;
212                 }
213                 if (sam_hrecs_add_ref_altnames(hrecs, nref, altnames) < 0)
214                     return -1;
215 
216                 if (ref_changed_flag && (hrecs->refs_changed < 0 || hrecs->refs_changed > nref))
217                     hrecs->refs_changed = nref;
218                 return 0;
219             }
220 
221             // If here, the name is a duplicate.
222             // Check to see if it matches the SN: tag from the earlier record.
223             if (strcmp(hrecs->ref[nref].name, name) == 0) {
224                 hts_log_error("Duplicate entry \"%s\" in sam header",
225                                 name);
226                 return -1;
227             }
228 
229             // Clash with an already-seen altname
230             // As SN: should be preferred to AN: add this as a new
231             // record and update the hash entry to point to it.
232             hts_log_warning("Ref name SN:\"%s\" is a duplicate of an existing AN key", name);
233             nref = hrecs->nref;
234         }
235 
236         if (nref == hrecs->ref_sz) {
237             size_t new_sz = hrecs->ref_sz >= 4 ? hrecs->ref_sz + (hrecs->ref_sz / 4) : 32;
238             sam_hrec_sq_t *new_ref = realloc(hrecs->ref, sizeof(*hrecs->ref) * new_sz);
239             if (!new_ref)
240                 return -1;
241             hrecs->ref = new_ref;
242             hrecs->ref_sz = new_sz;
243         }
244 
245         hrecs->ref[nref].name = name;
246         hrecs->ref[nref].len  = len;
247         hrecs->ref[nref].ty = h_type;
248 
249         k = kh_put(m_s2i, hrecs->ref_hash, hrecs->ref[nref].name, &r);
250         if (-1 == r) return -1;
251         kh_val(hrecs->ref_hash, k) = nref;
252 
253         if (sam_hrecs_add_ref_altnames(hrecs, nref, altnames) < 0)
254             return -1;
255 
256         if (hrecs->refs_changed < 0 || hrecs->refs_changed > hrecs->nref)
257             hrecs->refs_changed = hrecs->nref;
258         hrecs->nref++;
259     }
260 
261     /* Add to read-group hash? */
262     if (type == TYPEKEY("RG")) {
263         sam_hrec_tag_t *tag = sam_hrecs_find_key(h_type, "ID", NULL);
264         int nrg = hrecs->nrg, r;
265         khint_t k;
266 
267         if (!tag) {
268             hts_log_error("Header includes @RG line with no ID: tag");
269             return -1;  // ID should be present, according to spec.
270         }
271         assert(tag->str && tag->len >= 3);
272 
273         // Seen already?
274         k = kh_get(m_s2i, hrecs->rg_hash, tag->str + 3);
275         if (k < kh_end(hrecs->rg_hash)) {
276             nrg = kh_val(hrecs->rg_hash, k);
277             assert(hrecs->rg[nrg].ty != NULL);
278             if (hrecs->rg[nrg].ty != h_type) {
279                 hts_log_warning("Duplicate entry \"%s\" in sam header",
280                                 tag->str + 3);
281             } else {
282                 hrecs->rg[nrg].name = tag->str + 3;
283                 hrecs->rg[nrg].name_len = tag->len - 3;
284             }
285             return 0;
286         }
287 
288         if (nrg == hrecs->rg_sz) {
289             size_t new_sz = hrecs->rg_sz >= 4 ? hrecs->rg_sz + hrecs->rg_sz / 4 : 4;
290             sam_hrec_rg_t *new_rg = realloc(hrecs->rg, sizeof(*hrecs->rg) * new_sz);
291             if (!new_rg)
292                 return -1;
293             hrecs->rg = new_rg;
294             hrecs->rg_sz = new_sz;
295         }
296 
297         hrecs->rg[nrg].name = tag->str + 3;
298         hrecs->rg[nrg].name_len = tag->len - 3;
299         hrecs->rg[nrg].ty   = h_type;
300         hrecs->rg[nrg].id   = nrg;
301 
302         k = kh_put(m_s2i, hrecs->rg_hash, hrecs->rg[nrg].name, &r);
303         if (-1 == r) return -1;
304         kh_val(hrecs->rg_hash, k) = nrg;
305 
306         hrecs->nrg++;
307     }
308 
309     /* Add to program hash? */
310     if (type == TYPEKEY("PG")) {
311         sam_hrec_tag_t *tag;
312         sam_hrec_pg_t *new_pg;
313         int npg = hrecs->npg;
314 
315         if (npg == hrecs->pg_sz) {
316             size_t new_sz = hrecs->pg_sz >= 4 ? hrecs->pg_sz + hrecs->pg_sz / 4 : 4;
317             new_pg = realloc(hrecs->pg, sizeof(*hrecs->pg) * new_sz);
318             if (!new_pg)
319                 return -1;
320             hrecs->pg = new_pg;
321             hrecs->pg_sz = new_sz;
322         }
323 
324         tag = h_type->tag;
325         hrecs->pg[npg].name = NULL;
326         hrecs->pg[npg].name_len = 0;
327         hrecs->pg[npg].ty  = h_type;
328         hrecs->pg[npg].id   = npg;
329         hrecs->pg[npg].prev_id = -1;
330 
331         while (tag) {
332             if (tag->str[0] == 'I' && tag->str[1] == 'D') {
333                 /* Avoid duplicate ID tags coming from other applications */
334                 if (!hrecs->pg[npg].name) {
335                     assert(tag->len >= 3);
336                     hrecs->pg[npg].name = tag->str + 3;
337                     hrecs->pg[npg].name_len = tag->len - 3;
338                 } else {
339                     hts_log_warning("PG line with multiple ID tags. The first encountered was preferred - ID:%s", hrecs->pg[npg].name);
340                 }
341             } else if (tag->str[0] == 'P' && tag->str[1] == 'P') {
342                 // Resolve later if needed
343                 khint_t k;
344                 k = kh_get(m_s2i, hrecs->pg_hash, tag->str+3);
345 
346                 if (k != kh_end(hrecs->pg_hash)) {
347                     int p_id = kh_val(hrecs->pg_hash, k);
348                     hrecs->pg[npg].prev_id = hrecs->pg[p_id].id;
349 
350                     /* Unmark previous entry as a PG termination */
351                     if (hrecs->npg_end > 0 &&
352                         hrecs->pg_end[hrecs->npg_end-1] == p_id) {
353                         hrecs->npg_end--;
354                     } else {
355                         int i;
356                         for (i = 0; i < hrecs->npg_end; i++) {
357                             if (hrecs->pg_end[i] == p_id) {
358                                 memmove(&hrecs->pg_end[i], &hrecs->pg_end[i+1],
359                                         (hrecs->npg_end-i-1)*sizeof(*hrecs->pg_end));
360                                 hrecs->npg_end--;
361                             }
362                         }
363                     }
364                 } else {
365                     hrecs->pg[npg].prev_id = -1;
366                 }
367             }
368             tag = tag->next;
369         }
370 
371         if (hrecs->pg[npg].name) {
372             khint_t k;
373             int r;
374             k = kh_put(m_s2i, hrecs->pg_hash, hrecs->pg[npg].name, &r);
375             if (-1 == r) return -1;
376             kh_val(hrecs->pg_hash, k) = npg;
377         } else {
378             return -1; // ID should be present, according to spec.
379         }
380 
381         /* Add to npg_end[] array. Remove later if we find a PP line */
382         if (hrecs->npg_end >= hrecs->npg_end_alloc) {
383             int *new_pg_end;
384             int  new_alloc = hrecs->npg_end_alloc ? hrecs->npg_end_alloc*2 : 4;
385 
386             new_pg_end = realloc(hrecs->pg_end, new_alloc * sizeof(int));
387             if (!new_pg_end)
388                 return -1;
389             hrecs->npg_end_alloc = new_alloc;
390             hrecs->pg_end = new_pg_end;
391         }
392         hrecs->pg_end[hrecs->npg_end++] = npg;
393 
394         hrecs->npg++;
395     }
396 
397     return 0;
398 }
399 
sam_hrecs_remove_hash_entry(sam_hrecs_t * hrecs,khint32_t type,sam_hrec_type_t * h_type)400 static int sam_hrecs_remove_hash_entry(sam_hrecs_t *hrecs, khint32_t type, sam_hrec_type_t *h_type) {
401     if (!hrecs || !h_type)
402         return -1;
403 
404     sam_hrec_tag_t *tag;
405     const char *key = NULL;
406     khint_t k;
407 
408     /* Remove name and any alternative names from reference hash */
409     if (type == TYPEKEY("SQ")) {
410         const char *altnames = NULL;
411 
412         tag = h_type->tag;
413 
414         while (tag) {
415             if (tag->str[0] == 'S' && tag->str[1] == 'N') {
416                 assert(tag->len >= 3);
417                 key = tag->str + 3;
418             } else if (tag->str[0] == 'A' && tag->str[1] == 'N') {
419                 assert(tag->len >= 3);
420                 altnames = tag->str + 3;
421             }
422             tag = tag->next;
423         }
424 
425         if (key) {
426             k = kh_get(m_s2i, hrecs->ref_hash, key);
427             if (k != kh_end(hrecs->ref_hash)) {
428                 int idx = kh_val(hrecs->ref_hash, k);
429                 if (idx + 1 < hrecs->nref)
430                     memmove(&hrecs->ref[idx], &hrecs->ref[idx+1],
431                             sizeof(sam_hrec_sq_t)*(hrecs->nref - idx - 1));
432                 if (altnames)
433                     sam_hrecs_remove_ref_altnames(hrecs, idx, altnames);
434                 kh_del(m_s2i, hrecs->ref_hash, k);
435                 hrecs->nref--;
436                 if (hrecs->refs_changed < 0 || hrecs->refs_changed > idx)
437                     hrecs->refs_changed = idx;
438                 for (k = 0; k < kh_end(hrecs->ref_hash); k++) {
439                     if (kh_exist(hrecs->ref_hash, k)
440                         && kh_value(hrecs->ref_hash, k) > idx) {
441                         kh_value(hrecs->ref_hash, k)--;
442                     }
443                 }
444             }
445         }
446     }
447 
448     /* Remove from read-group hash */
449     if (type == TYPEKEY("RG")) {
450         tag = h_type->tag;
451 
452         while (tag) {
453             if (tag->str[0] == 'I' && tag->str[1] == 'D') {
454                 assert(tag->len >= 3);
455                 key = tag->str + 3;
456                 k = kh_get(m_s2i, hrecs->rg_hash, key);
457                 if (k != kh_end(hrecs->rg_hash)) {
458                     int idx = kh_val(hrecs->rg_hash, k);
459                     if (idx + 1 < hrecs->nrg)
460                         memmove(&hrecs->rg[idx], &hrecs->rg[idx+1], sizeof(sam_hrec_rg_t)*(hrecs->nrg - idx - 1));
461                     kh_del(m_s2i, hrecs->rg_hash, k);
462                     hrecs->nrg--;
463                     for (k = 0; k < kh_end(hrecs->rg_hash); k++) {
464                         if (kh_exist(hrecs->rg_hash, k)
465                             && kh_value(hrecs->rg_hash, k) > idx) {
466                             kh_value(hrecs->rg_hash, k)--;
467                         }
468                     }
469                 }
470                 break;
471             }
472             tag = tag->next;
473         }
474     }
475 
476     return 0;
477 }
478 
479 /** Add a header record to the global line ordering
480  *
481  * If @p after is not NULL, the new record will be inserted after this one,
482  * otherwise it will go at the end.
483  *
484  * An exception is an HD record, which will always be put first unless
485  * one is already present.
486  */
sam_hrecs_global_list_add(sam_hrecs_t * hrecs,sam_hrec_type_t * h_type,sam_hrec_type_t * after)487 static void sam_hrecs_global_list_add(sam_hrecs_t *hrecs,
488                                       sam_hrec_type_t *h_type,
489                                       sam_hrec_type_t *after) {
490     const khint32_t hd_type = TYPEKEY("HD");
491     int update_first_line = 0;
492 
493     // First line seen
494     if (!hrecs->first_line) {
495         hrecs->first_line = h_type->global_next = h_type->global_prev = h_type;
496         return;
497     }
498 
499     // @HD goes at the top (unless there's one already)
500     if (h_type->type == hd_type && hrecs->first_line->type != hd_type) {
501         after = hrecs->first_line->global_prev;
502         update_first_line = 1;
503     }
504 
505     // If no instructions given, put it at the end
506     if (!after)
507         after = hrecs->first_line->global_prev;
508 
509     h_type->global_prev = after;
510     h_type->global_next = after->global_next;
511     h_type->global_prev->global_next = h_type;
512     h_type->global_next->global_prev = h_type;
513 
514     if (update_first_line)
515         hrecs->first_line = h_type;
516 }
517 
518 /*! Add header record with a va_list interface.
519  *
520  * Adds a single record to a SAM header.
521  *
522  * This takes a header record type, a va_list argument and one or more
523  * key,value pairs, ending with the NULL key.
524  *
525  * Eg. sam_hrecs_vadd(h, "SQ", args, "ID", "foo", "LN", "100", NULL).
526  *
527  * The purpose of the additional va_list parameter is to permit other
528  * varargs functions to call this while including their own additional
529  * parameters; an example is in sam_hdr_add_pg().
530  *
531  * Note: this function invokes va_arg at least once, making the value
532  * of ap indeterminate after the return. The caller should call
533  * va_start/va_end before/after calling this function or use va_copy.
534  *
535  * @return
536  * Returns >= 0 on success;
537  *        -1 on failure
538  */
sam_hrecs_vadd(sam_hrecs_t * hrecs,const char * type,va_list ap,...)539 static int sam_hrecs_vadd(sam_hrecs_t *hrecs, const char *type, va_list ap, ...) {
540     va_list args;
541     sam_hrec_type_t *h_type;
542     sam_hrec_tag_t *h_tag, *last=NULL;
543     int new;
544     khint32_t type_i = TYPEKEY(type), k;
545 
546     if (!strncmp(type, "HD", 2) && (h_type = sam_hrecs_find_type_id(hrecs, "HD", NULL, NULL)))
547         return sam_hrecs_vupdate(hrecs, h_type, ap);
548 
549     if (!(h_type = pool_alloc(hrecs->type_pool)))
550         return -1;
551     k = kh_put(sam_hrecs_t, hrecs->h, type_i, &new);
552     if (new < 0)
553         return -1;
554 
555     h_type->type = type_i;
556 
557     // Form the ring, either with self or other lines of this type
558     if (!new) {
559         sam_hrec_type_t *t = kh_val(hrecs->h, k), *p;
560         p = t->prev;
561 
562         assert(p->next == t);
563         p->next = h_type;
564         h_type->prev = p;
565 
566         t->prev = h_type;
567         h_type->next = t;
568     } else {
569         kh_val(hrecs->h, k) = h_type;
570         h_type->prev = h_type->next = h_type;
571     }
572     h_type->tag = NULL;
573 
574     // Add to global line ordering after any existing line of the same type,
575     // or at the end if no line of this type exists yet.
576     sam_hrecs_global_list_add(hrecs, h_type, !new ? h_type->prev : NULL);
577 
578     // Check linked-list invariants
579     assert(h_type->prev->next == h_type);
580     assert(h_type->next->prev == h_type);
581     assert(h_type->global_prev->global_next == h_type);
582     assert(h_type->global_next->global_prev == h_type);
583 
584     // Any ... varargs
585     va_start(args, ap);
586     for (;;) {
587         char *key, *val = NULL, *str;
588 
589         if (!(key = (char *)va_arg(args, char *)))
590             break;
591         if (strncmp(type, "CO", 2) && !(val = (char *)va_arg(args, char *)))
592             break;
593         if (*val == '\0')
594             continue;
595 
596         if (!(h_tag = pool_alloc(hrecs->tag_pool)))
597             return -1;
598 
599         if (strncmp(type, "CO", 2)) {
600             h_tag->len = 3 + strlen(val);
601             str = string_alloc(hrecs->str_pool, h_tag->len+1);
602             if (!str || snprintf(str, h_tag->len+1, "%2.2s:%s", key, val) < 0)
603                 return -1;
604             h_tag->str = str;
605         } else {
606             h_tag->len = strlen(key);
607             h_tag->str = string_ndup(hrecs->str_pool, key, h_tag->len);
608             if (!h_tag->str)
609                 return -1;
610         }
611 
612         h_tag->next = NULL;
613         if (last)
614             last->next = h_tag;
615         else
616             h_type->tag = h_tag;
617 
618         last = h_tag;
619     }
620     va_end(args);
621 
622     // Plus the specified va_list params
623     for (;;) {
624         char *key, *val = NULL, *str;
625 
626         if (!(key = (char *)va_arg(ap, char *)))
627             break;
628         if (strncmp(type, "CO", 2) && !(val = (char *)va_arg(ap, char *)))
629             break;
630 
631         if (!(h_tag = pool_alloc(hrecs->tag_pool)))
632             return -1;
633 
634         if (strncmp(type, "CO", 2)) {
635             h_tag->len = 3 + strlen(val);
636             str = string_alloc(hrecs->str_pool, h_tag->len+1);
637             if (!str || snprintf(str, h_tag->len+1, "%2.2s:%s", key, val) < 0)
638                 return -1;
639             h_tag->str = str;
640         } else {
641             h_tag->len = strlen(key);
642             h_tag->str = string_ndup(hrecs->str_pool, key, h_tag->len);
643             if (!h_tag->str)
644                 return -1;
645         }
646 
647         h_tag->next = NULL;
648         if (last)
649             last->next = h_tag;
650         else
651             h_type->tag = h_tag;
652 
653         last = h_tag;
654     }
655 
656     if (-1 == sam_hrecs_update_hashes(hrecs, TYPEKEY(type), h_type))
657         return -1;
658 
659     if (!strncmp(type, "PG", 2))
660         hrecs->pgs_changed = 1;
661 
662     hrecs->dirty = 1;
663 
664     return 0;
665 }
666 
667 // As sam_hrecs_vadd(), but without the extra va_list parameter
sam_hrecs_add(sam_hrecs_t * hrecs,const char * type,...)668 static int sam_hrecs_add(sam_hrecs_t *hrecs, const char *type, ...) {
669     va_list args;
670     int res;
671     va_start(args, type);
672     res = sam_hrecs_vadd(hrecs, type, args, NULL);
673     va_end(args);
674     return res;
675 }
676 
677 /*
678  * Function for deallocating a list of tags
679  */
680 
sam_hrecs_free_tags(sam_hrecs_t * hrecs,sam_hrec_tag_t * tag)681 static void sam_hrecs_free_tags(sam_hrecs_t *hrecs, sam_hrec_tag_t *tag) {
682     if (!hrecs || !tag)
683         return;
684     if (tag->next)
685         sam_hrecs_free_tags(hrecs, tag->next);
686 
687     pool_free(hrecs->tag_pool, tag);
688 }
689 
sam_hrecs_remove_line(sam_hrecs_t * hrecs,const char * type_name,sam_hrec_type_t * type_found)690 static int sam_hrecs_remove_line(sam_hrecs_t *hrecs, const char *type_name, sam_hrec_type_t *type_found) {
691     if (!hrecs || !type_name || !type_found)
692         return -1;
693 
694     khint32_t itype = TYPEKEY(type_name);
695     khint_t k = kh_get(sam_hrecs_t, hrecs->h, itype);
696     if (k == kh_end(hrecs->h))
697         return -1;
698 
699     // Remove from global list (remembering it could be the only line)
700     if (hrecs->first_line == type_found) {
701         hrecs->first_line = (type_found->global_next != type_found
702                              ? type_found->global_next : NULL);
703     }
704     type_found->global_next->global_prev = type_found->global_prev;
705     type_found->global_prev->global_next = type_found->global_next;
706 
707     /* single element in the list */
708     if (type_found->prev == type_found || type_found->next == type_found) {
709         kh_del(sam_hrecs_t, hrecs->h, k);
710     } else {
711         type_found->prev->next = type_found->next;
712         type_found->next->prev = type_found->prev;
713         if (kh_val(hrecs->h, k) == type_found) { //first element
714             kh_val(hrecs->h, k) = type_found->next;
715         }
716     }
717 
718     if (!strncmp(type_name, "SQ", 2) || !strncmp(type_name, "RG", 2))
719         sam_hrecs_remove_hash_entry(hrecs, itype, type_found);
720 
721     sam_hrecs_free_tags(hrecs, type_found->tag);
722     pool_free(hrecs->type_pool, type_found);
723 
724     hrecs->dirty = 1;
725 
726     return 0;
727 }
728 
729 // Paste together a line from the parsed data structures
build_header_line(const sam_hrec_type_t * ty,kstring_t * ks)730 static int build_header_line(const sam_hrec_type_t *ty, kstring_t *ks) {
731     sam_hrec_tag_t *tag;
732     int r = 0;
733     char c[2]= { ty->type >> 8, ty->type & 0xff };
734 
735     r |= (kputc_('@', ks) == EOF);
736     r |= (kputsn(c, 2, ks) == EOF);
737     for (tag = ty->tag; tag; tag = tag->next) {
738         r |= (kputc_('\t', ks) == EOF);
739         r |= (kputsn(tag->str, tag->len, ks) == EOF);
740     }
741 
742     return r;
743 }
744 
sam_hrecs_rebuild_lines(const sam_hrecs_t * hrecs,kstring_t * ks)745 static int sam_hrecs_rebuild_lines(const sam_hrecs_t *hrecs, kstring_t *ks) {
746     const sam_hrec_type_t *t1, *t2;
747 
748     if (!hrecs->first_line)
749         return kputsn("", 0, ks) >= 0 ? 0 : -1;
750 
751     t1 = t2 = hrecs->first_line;
752     do {
753         if (build_header_line(t1, ks) != 0)
754             return -1;
755         if (kputc('\n', ks) < 0)
756             return -1;
757 
758         t1 = t1->global_next;
759     } while (t1 != t2);
760 
761     return 0;
762 }
763 
sam_hrecs_parse_lines(sam_hrecs_t * hrecs,const char * hdr,size_t len)764 static int sam_hrecs_parse_lines(sam_hrecs_t *hrecs, const char *hdr, size_t len) {
765     size_t i, lno;
766 
767     if (!hrecs || len > SSIZE_MAX)
768         return -1;
769 
770     if (!len)
771         len = strlen(hdr);
772 
773     if (len < 3) {
774         if (len == 0 || *hdr == '\0') return 0;
775         sam_hrecs_error("Header line too short", hdr, len, 1);
776         return -1;
777     }
778 
779     for (i = 0, lno = 1; i < len - 3 && hdr[i] != '\0'; i++, lno++) {
780         khint32_t type;
781         khint_t k;
782 
783         int l_start = i, new;
784         sam_hrec_type_t *h_type;
785         sam_hrec_tag_t *h_tag, *last;
786 
787         if (hdr[i] != '@') {
788             sam_hrecs_error("Header line does not start with '@'",
789                           &hdr[l_start], len - l_start, lno);
790             return -1;
791         }
792 
793         if (!isalpha_c(hdr[i+1]) || !isalpha_c(hdr[i+2])) {
794             sam_hrecs_error("Header line does not have a two character key",
795                           &hdr[l_start], len - l_start, lno);
796             return -1;
797         }
798         type = TYPEKEY(&hdr[i+1]);
799 
800         i += 3;
801         if (i == len || hdr[i] == '\n')
802             continue;
803 
804         // Add the header line type
805         if (!(h_type = pool_alloc(hrecs->type_pool)))
806             return -1;
807         k = kh_put(sam_hrecs_t, hrecs->h, type, &new);
808         if (new < 0)
809             return -1;
810 
811         h_type->type = type;
812 
813         // Add to end of global list
814         sam_hrecs_global_list_add(hrecs, h_type, NULL);
815 
816         // Form the ring, either with self or other lines of this type
817         if (!new) {
818             sam_hrec_type_t *t = kh_val(hrecs->h, k), *p;
819             p = t->prev;
820 
821             assert(p->next == t);
822             p->next = h_type;
823             h_type->prev = p;
824 
825             t->prev = h_type;
826             h_type->next = t;
827         } else {
828             kh_val(hrecs->h, k) = h_type;
829             h_type->prev = h_type->next = h_type;
830         }
831 
832         // Parse the tags on this line
833         last = NULL;
834         if (type == TYPEKEY("CO")) {
835             size_t j;
836 
837             if (i == len || hdr[i] != '\t') {
838                 sam_hrecs_error("Missing tab",
839                               &hdr[l_start], len - l_start, lno);
840                 return -1;
841             }
842 
843             for (j = ++i; j < len && hdr[j] != '\0' && hdr[j] != '\n'; j++)
844                 ;
845 
846             if (!(h_type->tag = h_tag = pool_alloc(hrecs->tag_pool)))
847                 return -1;
848             h_tag->str = string_ndup(hrecs->str_pool, &hdr[i], j-i);
849             h_tag->len = j-i;
850             h_tag->next = NULL;
851             if (!h_tag->str)
852                 return -1;
853 
854             i = j;
855 
856         } else {
857             do {
858                 size_t j;
859 
860                 if (i == len || hdr[i] != '\t') {
861                     sam_hrecs_error("Missing tab",
862                                   &hdr[l_start], len - l_start, lno);
863                     return -1;
864                 }
865 
866                 for (j = ++i; j < len && hdr[j] != '\0' && hdr[j] != '\n' && hdr[j] != '\t'; j++)
867                     ;
868 
869                 if (j - i < 3 || hdr[i + 2] != ':') {
870                     sam_hrecs_error("Malformed key:value pair",
871                                    &hdr[l_start], len - l_start, lno);
872                     return -1;
873                 }
874 
875                 if (!(h_tag = pool_alloc(hrecs->tag_pool)))
876                     return -1;
877                 h_tag->str = string_ndup(hrecs->str_pool, &hdr[i], j-i);
878                 h_tag->len = j-i;
879                 h_tag->next = NULL;
880                 if (!h_tag->str)
881                     return -1;
882 
883                 if (last)
884                     last->next = h_tag;
885                 else
886                     h_type->tag = h_tag;
887 
888                 last = h_tag;
889                 i = j;
890             } while (i < len && hdr[i] != '\0' && hdr[i] != '\n');
891         }
892 
893         /* Update RG/SQ hashes */
894         if (-1 == sam_hrecs_update_hashes(hrecs, type, h_type))
895             return -1;
896     }
897 
898     return 0;
899 }
900 
901 /*! Update sam_hdr_t target_name and target_len arrays
902  *
903  *  @return 0 on success; -1 on failure
904  */
sam_hdr_update_target_arrays(sam_hdr_t * bh,const sam_hrecs_t * hrecs,int refs_changed)905 int sam_hdr_update_target_arrays(sam_hdr_t *bh, const sam_hrecs_t *hrecs,
906                                  int refs_changed) {
907     if (!bh || !hrecs)
908         return -1;
909 
910     if (refs_changed < 0)
911         return 0;
912 
913     // Grow arrays if necessary
914     if (bh->n_targets < hrecs->nref) {
915         char **new_names = realloc(bh->target_name,
916                                    hrecs->nref * sizeof(*new_names));
917         if (!new_names)
918             return -1;
919         bh->target_name = new_names;
920         uint32_t *new_lens = realloc(bh->target_len,
921                                      hrecs->nref * sizeof(*new_lens));
922         if (!new_lens)
923             return -1;
924         bh->target_len = new_lens;
925     }
926 
927     // Update names and lengths where changed
928     // hrecs->refs_changed is the first ref that has been updated, so ones
929     // before that can be skipped.
930     int i;
931     khint_t k;
932     khash_t(s2i) *long_refs = (khash_t(s2i) *) bh->sdict;
933     for (i = refs_changed; i < hrecs->nref; i++) {
934         if (i >= bh->n_targets
935             || strcmp(bh->target_name[i], hrecs->ref[i].name) != 0) {
936             if (i < bh->n_targets)
937                 free(bh->target_name[i]);
938             bh->target_name[i] = strdup(hrecs->ref[i].name);
939             if (!bh->target_name[i])
940                 return -1;
941         }
942         if (hrecs->ref[i].len < UINT32_MAX) {
943             bh->target_len[i] = hrecs->ref[i].len;
944 
945             if (!long_refs)
946                 continue;
947 
948             // Check if we have an old length, if so remove it.
949             k = kh_get(s2i, long_refs, bh->target_name[i]);
950             if (k < kh_end(long_refs))
951                 kh_del(s2i, long_refs, k);
952         } else {
953             bh->target_len[i] = UINT32_MAX;
954             if (bh->hrecs != hrecs) {
955                 // Called from sam_hdr_dup; need to add sdict entries
956                 if (!long_refs) {
957                     if (!(bh->sdict = long_refs = kh_init(s2i)))
958                         return -1;
959                 }
960 
961                 // Add / update length
962                 int absent;
963                 k = kh_put(s2i, long_refs, bh->target_name[i], &absent);
964                 if (absent < 0)
965                     return -1;
966                 kh_val(long_refs, k) = hrecs->ref[i].len;
967             }
968         }
969     }
970 
971     // Free up any names that have been removed
972     for (; i < bh->n_targets; i++) {
973         if (long_refs) {
974             k = kh_get(s2i, long_refs, bh->target_name[i]);
975             if (k < kh_end(long_refs))
976                 kh_del(s2i, long_refs, k);
977         }
978         free(bh->target_name[i]);
979     }
980 
981     bh->n_targets = hrecs->nref;
982     return 0;
983 }
984 
rebuild_target_arrays(sam_hdr_t * bh)985 static int rebuild_target_arrays(sam_hdr_t *bh) {
986     if (!bh || !bh->hrecs)
987         return -1;
988 
989     sam_hrecs_t *hrecs = bh->hrecs;
990     if (hrecs->refs_changed < 0)
991         return 0;
992 
993     if (sam_hdr_update_target_arrays(bh, hrecs, hrecs->refs_changed) != 0)
994         return -1;
995 
996     hrecs->refs_changed = -1;
997     return 0;
998 }
999 
1000 /// Populate hrecs refs array from header target_name, target_len arrays
1001 /**
1002  * @return 0 on success; -1 on failure
1003  *
1004  * Pre-fills the refs hash from the target arrays.  For BAM files this
1005  * will ensure that they are in the correct order as the target arrays
1006  * are the canonical source for converting target ids to names and lengths.
1007  *
1008  * The added entries do not link to a header line. sam_hrecs_update_hashes()
1009  * will add the links later for lines found in the text header.
1010  *
1011  * This should be called before the text header is parsed.
1012  */
sam_hrecs_refs_from_targets_array(sam_hrecs_t * hrecs,const sam_hdr_t * bh)1013 static int sam_hrecs_refs_from_targets_array(sam_hrecs_t *hrecs,
1014                                              const sam_hdr_t *bh) {
1015     int32_t tid = 0;
1016 
1017     if (!hrecs || !bh)
1018         return -1;
1019 
1020     // This should always be called before parsing the text header
1021     // so the ref array should start off empty, and we don't have to try
1022     // to reconcile any existing data.
1023     if (hrecs->nref > 0) {
1024         hts_log_error("Called with non-empty ref array");
1025         return -1;
1026     }
1027 
1028     if (hrecs->ref_sz < bh->n_targets) {
1029         sam_hrec_sq_t *new_ref = realloc(hrecs->ref,
1030                                          bh->n_targets * sizeof(*new_ref));
1031         if (!new_ref)
1032             return -1;
1033 
1034         hrecs->ref = new_ref;
1035         hrecs->ref_sz = bh->n_targets;
1036     }
1037 
1038     for (tid = 0; tid < bh->n_targets; tid++) {
1039         khint_t k;
1040         int r;
1041         hrecs->ref[tid].name = string_dup(hrecs->str_pool, bh->target_name[tid]);
1042         if (!hrecs->ref[tid].name) goto fail;
1043         if (bh->target_len[tid] < UINT32_MAX || !bh->sdict) {
1044             hrecs->ref[tid].len  = bh->target_len[tid];
1045         } else {
1046             khash_t(s2i) *long_refs = (khash_t(s2i) *) bh->sdict;
1047             k = kh_get(s2i, long_refs, hrecs->ref[tid].name);
1048             if (k < kh_end(long_refs)) {
1049                 hrecs->ref[tid].len = kh_val(long_refs, k);
1050             } else {
1051                 hrecs->ref[tid].len = UINT32_MAX;
1052             }
1053         }
1054         hrecs->ref[tid].ty   = NULL;
1055         k = kh_put(m_s2i, hrecs->ref_hash, hrecs->ref[tid].name, &r);
1056         if (r < 0) goto fail;
1057         if (r == 0) {
1058             hts_log_error("Duplicate entry \"%s\" in target list",
1059                             hrecs->ref[tid].name);
1060             return -1;
1061         } else {
1062             kh_val(hrecs->ref_hash, k) = tid;
1063         }
1064     }
1065     hrecs->nref = bh->n_targets;
1066     return 0;
1067 
1068  fail: {
1069         int32_t i;
1070         hts_log_error("%s", strerror(errno));
1071         for (i = 0; i < tid; i++) {
1072             khint_t k;
1073             if (!hrecs->ref[i].name) continue;
1074             k = kh_get(m_s2i, hrecs->ref_hash, hrecs->ref[tid].name);
1075             if (k < kh_end(hrecs->ref_hash)) kh_del(m_s2i, hrecs->ref_hash, k);
1076         }
1077         hrecs->nref = 0;
1078         return -1;
1079     }
1080 }
1081 
1082 /*
1083  * Add SQ header records for any references in the hrecs->ref array that
1084  * were added by sam_hrecs_refs_from_targets_array() but have not
1085  * been linked to an @SQ line by sam_hrecs_update_hashes() yet.
1086  *
1087  * This may be needed either because:
1088  *
1089  *   - A bam file was read that had entries in its refs list with no
1090  *     corresponding @SQ line.
1091  *
1092  *   - A program constructed a sam_hdr_t which has target_name and target_len
1093  *     array entries with no corresponding @SQ line in text.
1094  */
add_stub_ref_sq_lines(sam_hrecs_t * hrecs)1095 static int add_stub_ref_sq_lines(sam_hrecs_t *hrecs) {
1096     int tid;
1097     char len[32];
1098 
1099     for (tid = 0; tid < hrecs->nref; tid++) {
1100         if (hrecs->ref[tid].ty == NULL) {
1101             snprintf(len, sizeof(len), "%"PRIhts_pos, hrecs->ref[tid].len);
1102             if (sam_hrecs_add(hrecs, "SQ",
1103                               "SN", hrecs->ref[tid].name,
1104                               "LN", len, NULL) != 0)
1105                 return -1;
1106 
1107             // Check that the stub has actually been filled
1108             if(hrecs->ref[tid].ty == NULL) {
1109                 hts_log_error("Reference stub with tid=%d, name=\"%s\", len=%"PRIhts_pos" could not be filled",
1110                         tid, hrecs->ref[tid].name, hrecs->ref[tid].len);
1111                 return -1;
1112             }
1113         }
1114     }
1115     return 0;
1116 }
1117 
sam_hdr_fill_hrecs(sam_hdr_t * bh)1118 int sam_hdr_fill_hrecs(sam_hdr_t *bh) {
1119     sam_hrecs_t *hrecs = sam_hrecs_new();
1120 
1121     if (!hrecs)
1122         return -1;
1123 
1124     if (bh->target_name && bh->target_len && bh->n_targets > 0) {
1125         if (sam_hrecs_refs_from_targets_array(hrecs, bh) != 0) {
1126             sam_hrecs_free(hrecs);
1127             return -1;
1128         }
1129     }
1130 
1131     // Parse existing header text
1132     if (bh->text && bh->l_text > 0) {
1133         if (sam_hrecs_parse_lines(hrecs, bh->text, bh->l_text) != 0) {
1134             sam_hrecs_free(hrecs);
1135             return -1;
1136         }
1137     }
1138 
1139     if (add_stub_ref_sq_lines(hrecs) < 0) {
1140         sam_hrecs_free(hrecs);
1141         return -1;
1142     }
1143 
1144     bh->hrecs = hrecs;
1145 
1146     if (hrecs->refs_changed >= 0 && rebuild_target_arrays(bh) != 0)
1147         return -1;
1148 
1149     return 0;
1150 }
1151 
1152 /** Remove outdated header text
1153 
1154     @param bh     BAM header
1155 
1156     This is called when API functions have changed the header so that the
1157     text version is no longer valid.
1158  */
redact_header_text(sam_hdr_t * bh)1159 static void redact_header_text(sam_hdr_t *bh) {
1160     assert(bh->hrecs && bh->hrecs->dirty);
1161     bh->l_text = 0;
1162     free(bh->text);
1163     bh->text = NULL;
1164 }
1165 
1166 /** Find nth header record of a given type
1167 
1168     @param type   Header type (SQ, RG etc.)
1169     @param idx    0-based index
1170 
1171     @return sam_hrec_type_t pointer to the record on success
1172             NULL if no record exists with the given type and index
1173  */
1174 
sam_hrecs_find_type_pos(sam_hrecs_t * hrecs,const char * type,int idx)1175 static sam_hrec_type_t *sam_hrecs_find_type_pos(sam_hrecs_t *hrecs,
1176                                                 const char *type, int idx) {
1177     sam_hrec_type_t *first, *itr;
1178 
1179     if (idx < 0)
1180         return NULL;
1181 
1182     if (type[0] == 'S' && type[1] == 'Q')
1183         return idx < hrecs->nref ? hrecs->ref[idx].ty : NULL;
1184 
1185     if (type[0] == 'R' && type[1] == 'G')
1186         return idx < hrecs->nrg ? hrecs->rg[idx].ty : NULL;
1187 
1188     if (type[0] == 'P' && type[1] == 'G')
1189         return idx < hrecs->npg ? hrecs->pg[idx].ty : NULL;
1190 
1191     first = itr = sam_hrecs_find_type_id(hrecs, type, NULL, NULL);
1192     if (!first)
1193         return NULL;
1194 
1195     while (idx > 0) {
1196         itr = itr->next;
1197         if (itr == first)
1198             break;
1199         --idx;
1200     }
1201 
1202     return idx == 0 ? itr : NULL;
1203 }
1204 
1205 /* ==== Public methods ==== */
1206 
sam_hdr_length(sam_hdr_t * bh)1207 size_t sam_hdr_length(sam_hdr_t *bh) {
1208     if (!bh || -1 == sam_hdr_rebuild(bh))
1209         return SIZE_MAX;
1210 
1211     return bh->l_text;
1212 }
1213 
sam_hdr_str(sam_hdr_t * bh)1214 const char *sam_hdr_str(sam_hdr_t *bh) {
1215     if (!bh || -1 == sam_hdr_rebuild(bh))
1216         return NULL;
1217 
1218     return bh->text;
1219 }
1220 
sam_hdr_nref(const sam_hdr_t * bh)1221 int sam_hdr_nref(const sam_hdr_t *bh) {
1222     if (!bh)
1223         return -1;
1224 
1225     return bh->hrecs ? bh->hrecs->nref : bh->n_targets;
1226 }
1227 
1228 /*
1229  * Reconstructs the text representation from the header hash table.
1230  * Returns 0 on success
1231  *        -1 on failure
1232  */
sam_hdr_rebuild(sam_hdr_t * bh)1233 int sam_hdr_rebuild(sam_hdr_t *bh) {
1234     sam_hrecs_t *hrecs;
1235     if (!bh)
1236         return -1;
1237 
1238     if (!(hrecs = bh->hrecs))
1239         return bh->text ? 0 : -1;
1240 
1241     if (hrecs->refs_changed >= 0) {
1242         if (rebuild_target_arrays(bh) < 0) {
1243             hts_log_error("Header target array rebuild has failed");
1244             return -1;
1245         }
1246     }
1247 
1248     /* If header text wasn't changed or header is empty, don't rebuild it. */
1249     if (!hrecs->dirty)
1250         return 0;
1251 
1252     if (hrecs->pgs_changed && sam_hdr_link_pg(bh) < 0) {
1253         hts_log_error("Linking @PG lines has failed");
1254         return -1;
1255     }
1256 
1257     kstring_t ks = KS_INITIALIZE;
1258     if (sam_hrecs_rebuild_text(hrecs, &ks) != 0) {
1259         ks_free(&ks);
1260         hts_log_error("Header text rebuild has failed");
1261         return -1;
1262     }
1263 
1264     hrecs->dirty = 0;
1265 
1266     /* Sync */
1267     free(bh->text);
1268     bh->l_text = ks_len(&ks);
1269     bh->text = ks_release(&ks);
1270 
1271     return 0;
1272 }
1273 
1274 /*
1275  * Appends a formatted line to an existing SAM header.
1276  * Line is a full SAM header record, eg "@SQ\tSN:foo\tLN:100", with
1277  * optional new-line. If it contains more than 1 line then multiple lines
1278  * will be added in order.
1279  *
1280  * Input text is of maximum length len or as terminated earlier by a NUL.
1281  * len may be 0 if unknown, in which case lines must be NUL-terminated.
1282  *
1283  * Returns 0 on success
1284  *        -1 on failure
1285  */
sam_hdr_add_lines(sam_hdr_t * bh,const char * lines,size_t len)1286 int sam_hdr_add_lines(sam_hdr_t *bh, const char *lines, size_t len) {
1287     sam_hrecs_t *hrecs;
1288 
1289     if (!bh || !lines)
1290         return -1;
1291 
1292     if (len == 0 && *lines == '\0')
1293         return 0;
1294 
1295     if (!(hrecs = bh->hrecs)) {
1296         if (sam_hdr_fill_hrecs(bh) != 0)
1297             return -1;
1298         hrecs = bh->hrecs;
1299     }
1300 
1301     if (sam_hrecs_parse_lines(hrecs, lines, len) != 0)
1302         return -1;
1303 
1304     if (hrecs->refs_changed >= 0 && rebuild_target_arrays(bh) != 0)
1305         return -1;
1306 
1307     hrecs->dirty = 1;
1308     redact_header_text(bh);
1309 
1310     return 0;
1311 }
1312 
1313 /*
1314  * Adds a single line to a SAM header.
1315  * Specify type and one or more key,value pairs, ending with the NULL key.
1316  * Eg. sam_hdr_add_line(h, "SQ", "ID", "foo", "LN", "100", NULL).
1317  *
1318  * Returns 0 on success
1319  *        -1 on failure
1320  */
sam_hdr_add_line(sam_hdr_t * bh,const char * type,...)1321 int sam_hdr_add_line(sam_hdr_t *bh, const char *type, ...) {
1322     va_list args;
1323     sam_hrecs_t *hrecs;
1324 
1325     if (!bh || !type)
1326         return -1;
1327 
1328     if (!(hrecs = bh->hrecs)) {
1329         if (sam_hdr_fill_hrecs(bh) != 0)
1330             return -1;
1331         hrecs = bh->hrecs;
1332     }
1333 
1334     va_start(args, type);
1335     int ret = sam_hrecs_vadd(hrecs, type, args, NULL);
1336     va_end(args);
1337 
1338     if (ret == 0) {
1339         if (hrecs->refs_changed >= 0 && rebuild_target_arrays(bh) != 0)
1340             return -1;
1341 
1342         if (hrecs->dirty)
1343             redact_header_text(bh);
1344     }
1345 
1346     return ret;
1347 }
1348 
1349 /*
1350  * Returns a complete line of formatted text for a specific head type/ID
1351  * combination. If ID_key is NULL then it returns the first line of the specified
1352  * type.
1353  */
sam_hdr_find_line_id(sam_hdr_t * bh,const char * type,const char * ID_key,const char * ID_val,kstring_t * ks)1354 int sam_hdr_find_line_id(sam_hdr_t *bh, const char *type,
1355                       const char *ID_key, const char *ID_val, kstring_t *ks) {
1356     sam_hrecs_t *hrecs;
1357     if (!bh || !type)
1358         return -2;
1359 
1360     if (!(hrecs = bh->hrecs)) {
1361         if (sam_hdr_fill_hrecs(bh) != 0)
1362             return -2;
1363         hrecs = bh->hrecs;
1364     }
1365 
1366     sam_hrec_type_t *ty = sam_hrecs_find_type_id(hrecs, type, ID_key, ID_val);
1367     if (!ty)
1368         return -1;
1369 
1370     ks->l = 0;
1371     if (build_header_line(ty, ks) < 0) {
1372         return -2;
1373     }
1374 
1375     return 0;
1376 }
1377 
sam_hdr_find_line_pos(sam_hdr_t * bh,const char * type,int pos,kstring_t * ks)1378 int sam_hdr_find_line_pos(sam_hdr_t *bh, const char *type,
1379                           int pos, kstring_t *ks) {
1380     sam_hrecs_t *hrecs;
1381     if (!bh || !type)
1382         return -2;
1383 
1384     if (!(hrecs = bh->hrecs)) {
1385         if (sam_hdr_fill_hrecs(bh) != 0)
1386             return -2;
1387         hrecs = bh->hrecs;
1388     }
1389 
1390     sam_hrec_type_t *ty = sam_hrecs_find_type_pos(hrecs, type, pos);
1391     if (!ty)
1392         return -1;
1393 
1394     ks->l = 0;
1395     if (build_header_line(ty, ks) < 0) {
1396         return -2;
1397     }
1398 
1399     return 0;
1400 }
1401 
1402 /*
1403  * Remove a line from the header by specifying a tag:value that uniquely
1404  * identifies a line, i.e. the @SQ line containing "SN:ref1".
1405  * @SQ line is uniquely identified by SN tag.
1406  * @RG line is uniquely identified by ID tag.
1407  * @PG line is uniquely identified by ID tag.
1408  *
1409  * Returns 0 on success and -1 on error
1410  */
1411 
sam_hdr_remove_line_id(sam_hdr_t * bh,const char * type,const char * ID_key,const char * ID_value)1412 int sam_hdr_remove_line_id(sam_hdr_t *bh, const char *type, const char *ID_key, const char *ID_value) {
1413     sam_hrecs_t *hrecs;
1414     if (!bh || !type)
1415         return -1;
1416 
1417     if (!(hrecs = bh->hrecs)) {
1418         if (sam_hdr_fill_hrecs(bh) != 0)
1419             return -1;
1420         hrecs = bh->hrecs;
1421     }
1422 
1423     if (!strncmp(type, "PG", 2)) {
1424         hts_log_warning("Removing PG lines is not supported!");
1425         return -1;
1426     }
1427 
1428     sam_hrec_type_t *type_found = sam_hrecs_find_type_id(hrecs, type, ID_key, ID_value);
1429     if (!type_found)
1430         return 0;
1431 
1432     int ret = sam_hrecs_remove_line(hrecs, type, type_found);
1433     if (ret == 0) {
1434         if (hrecs->refs_changed >= 0 && rebuild_target_arrays(bh) != 0)
1435             return -1;
1436 
1437         if (hrecs->dirty)
1438             redact_header_text(bh);
1439     }
1440 
1441     return ret;
1442 }
1443 
1444 /*
1445  * Remove a line from the header by specifying the position in the type
1446  * group, i.e. 3rd @SQ line.
1447  *
1448  * Returns 0 on success and -1 on error
1449  */
1450 
sam_hdr_remove_line_pos(sam_hdr_t * bh,const char * type,int position)1451 int sam_hdr_remove_line_pos(sam_hdr_t *bh, const char *type, int position) {
1452     sam_hrecs_t *hrecs;
1453     if (!bh || !type || position <= 0)
1454         return -1;
1455 
1456     if (!(hrecs = bh->hrecs)) {
1457         if (sam_hdr_fill_hrecs(bh) != 0)
1458             return -1;
1459         hrecs = bh->hrecs;
1460     }
1461 
1462     if (!strncmp(type, "PG", 2)) {
1463         hts_log_warning("Removing PG lines is not supported!");
1464         return -1;
1465     }
1466 
1467     sam_hrec_type_t *type_found = sam_hrecs_find_type_pos(hrecs, type,
1468                                                           position);
1469     if (!type_found)
1470         return -1;
1471 
1472     int ret = sam_hrecs_remove_line(hrecs, type, type_found);
1473     if (ret == 0) {
1474         if (hrecs->refs_changed >= 0 && rebuild_target_arrays(bh) != 0)
1475             return -1;
1476 
1477         if (hrecs->dirty)
1478             redact_header_text(bh);
1479     }
1480 
1481     return ret;
1482 }
1483 
1484 /*
1485  * Check if sam_hdr_update_line() is being used to change the name of
1486  * a record, and if the new name is going to clash with an existing one.
1487  *
1488  * If ap includes repeated keys, we go with the last one as sam_hrecs_vupdate()
1489  * will go through them all and leave the final one in place.
1490  *
1491  * Returns 0 if the name does not change
1492  *         1 if the name changes but does not clash
1493  *        -1 if the name changes and the new one is already in use
1494  */
check_for_name_update(sam_hrecs_t * hrecs,sam_hrec_type_t * rec,va_list ap,const char ** old_name,const char ** new_name,char id_tag_out[3],khash_t (m_s2i)** hash_out)1495 static int check_for_name_update(sam_hrecs_t *hrecs, sam_hrec_type_t *rec,
1496                                  va_list ap, const char **old_name,
1497                                  const char **new_name,
1498                                  char id_tag_out[3],
1499                                  khash_t(m_s2i) **hash_out) {
1500     char *key, *val;
1501     const char *id_tag;
1502     sam_hrec_tag_t *tag, *prev;
1503     khash_t(m_s2i) *hash;
1504     khint_t k;
1505     int ret = 0;
1506 
1507     if        (rec->type == TYPEKEY("SQ")) {
1508         id_tag = "SN"; hash = hrecs->ref_hash;
1509     } else if (rec->type == TYPEKEY("RG")) {
1510         id_tag = "ID"; hash = hrecs->rg_hash;
1511     } else if (rec->type == TYPEKEY("PG")) {
1512         id_tag = "ID"; hash = hrecs->pg_hash;
1513     } else {
1514         return 0;
1515     }
1516 
1517     memcpy(id_tag_out, id_tag, 3);
1518     *hash_out = hash;
1519 
1520     tag = sam_hrecs_find_key(rec, id_tag, &prev);
1521     if (!tag)
1522         return 0;
1523     assert(tag->len >= 3);
1524     *old_name = tag->str + 3;
1525 
1526     while ((key = va_arg(ap, char *)) != NULL) {
1527         val = va_arg(ap, char *);
1528         if (!val) val = "";
1529         if (strcmp(key, id_tag) != 0) continue;
1530         if (strcmp(val, tag->str + 3) == 0) { ret = 0; continue; }
1531         k = kh_get(m_s2i, hash, val);
1532         ret = k < kh_end(hash) ? -1 : 1;
1533         *new_name = val;
1534     }
1535     return ret;
1536 }
1537 
sam_hdr_update_line(sam_hdr_t * bh,const char * type,const char * ID_key,const char * ID_value,...)1538 int sam_hdr_update_line(sam_hdr_t *bh, const char *type,
1539         const char *ID_key, const char *ID_value, ...) {
1540     sam_hrecs_t *hrecs;
1541     if (!bh)
1542         return -1;
1543 
1544     if (!(hrecs = bh->hrecs)) {
1545         if (sam_hdr_fill_hrecs(bh) != 0)
1546             return -1;
1547         hrecs = bh->hrecs;
1548     }
1549 
1550     int ret, rename;
1551     sam_hrec_type_t *ty = sam_hrecs_find_type_id(hrecs, type, ID_key, ID_value);
1552     if (!ty)
1553         return -1;
1554 
1555     va_list args;
1556     const char *old_name = "?", *new_name = "?";
1557     char id_tag[3];
1558     khash_t(m_s2i) *hash = NULL;
1559     va_start(args, ID_value);
1560     rename = check_for_name_update(hrecs, ty, args,
1561                                    &old_name, &new_name, id_tag, &hash);
1562     va_end(args);
1563     if (rename < 0) {
1564         hts_log_error("Cannot rename @%s \"%s\" to \"%s\" : already exists",
1565                       type, old_name, new_name);
1566         return -1;
1567     }
1568     if (rename > 0 && TYPEKEY(type) == TYPEKEY("PG")) {
1569         // This is just too complicated
1570         hts_log_error("Renaming @PG records is not supported");
1571         return -1;
1572     }
1573     va_start(args, ID_value);
1574     ret = sam_hrecs_vupdate(hrecs, ty, args);
1575     va_end(args);
1576 
1577     if (ret)
1578         return ret;
1579 
1580     // TODO Account for @SQ-AN altnames
1581 
1582     if (rename) {
1583         // Adjust the hash table to point to the new name
1584         // sam_hrecs_update_hashes() should sort out everything else
1585         khint_t k = kh_get(m_s2i, hash, old_name);
1586         sam_hrec_tag_t *new_tag = sam_hrecs_find_key(ty, id_tag, NULL);
1587         int r, pos;
1588         assert(k < kh_end(hash));        // Or we wouldn't have found it earlier
1589         assert(new_tag && new_tag->str); // id_tag should exist
1590         assert(new_tag->len > 3);
1591         pos = kh_val(hash, k);
1592         kh_del(m_s2i, hash, k);
1593         k = kh_put(m_s2i, hash, new_tag->str + 3, &r);
1594         if (r < 1) {
1595             hts_log_error("Failed to rename item in hash table");
1596             return -1;
1597         }
1598         kh_val(hash, k) = pos;
1599     }
1600 
1601     ret = sam_hrecs_update_hashes(hrecs, TYPEKEY(type), ty);
1602 
1603     if (!ret && hrecs->refs_changed >= 0)
1604         ret = rebuild_target_arrays(bh);
1605 
1606     if (!ret && hrecs->dirty)
1607         redact_header_text(bh);
1608 
1609     return ret;
1610 }
1611 
sam_hdr_remove_except(sam_hdr_t * bh,const char * type,const char * ID_key,const char * ID_value)1612 int sam_hdr_remove_except(sam_hdr_t *bh, const char *type, const char *ID_key, const char *ID_value) {
1613     sam_hrecs_t *hrecs;
1614     if (!bh || !type)
1615         return -1;
1616 
1617     if (!(hrecs = bh->hrecs)) {
1618         if (sam_hdr_fill_hrecs(bh) != 0)
1619             return -1;
1620         hrecs = bh->hrecs;
1621     }
1622 
1623     sam_hrec_type_t *step;
1624     int ret = 1, remove_all = (ID_key == NULL);
1625 
1626     if (!strncmp(type, "PG", 2) || !strncmp(type, "CO", 2)) {
1627         hts_log_warning("Removing PG or CO lines is not supported!");
1628         return -1;
1629     }
1630 
1631     sam_hrec_type_t *type_found = sam_hrecs_find_type_id(hrecs, type, ID_key, ID_value);
1632     if (!type_found) { // remove all line of this type
1633         khint_t k = kh_get(sam_hrecs_t, hrecs->h, TYPEKEY(type));
1634         if (k == kh_end(hrecs->h))
1635             return 0;
1636         type_found =  kh_val(hrecs->h, k);
1637         if (!type_found)
1638             return 0;
1639         remove_all = 1;
1640     }
1641 
1642     step = type_found->next;
1643     while (step != type_found) {
1644         sam_hrec_type_t *to_remove = step;
1645         step = step->next;
1646         ret &= sam_hrecs_remove_line(hrecs, type, to_remove);
1647     }
1648 
1649     if (remove_all)
1650         ret &= sam_hrecs_remove_line(hrecs, type, type_found);
1651 
1652     if (!ret && hrecs->dirty)
1653         redact_header_text(bh);
1654 
1655     return 0;
1656 }
1657 
sam_hdr_remove_lines(sam_hdr_t * bh,const char * type,const char * id,void * vrh)1658 int sam_hdr_remove_lines(sam_hdr_t *bh, const char *type, const char *id, void *vrh) {
1659     sam_hrecs_t *hrecs;
1660     rmhash_t *rh = (rmhash_t *)vrh;
1661 
1662     if (!bh || !type)
1663         return -1;
1664     if (!rh) // remove all lines
1665         return sam_hdr_remove_except(bh, type, NULL, NULL);
1666     if (!id)
1667         return -1;
1668 
1669     if (!(hrecs = bh->hrecs)) {
1670         if (sam_hdr_fill_hrecs(bh) != 0)
1671             return -1;
1672         hrecs = bh->hrecs;
1673     }
1674 
1675     khint_t k = kh_get(sam_hrecs_t, hrecs->h, TYPEKEY(type));
1676     if (k == kh_end(hrecs->h)) // nothing to remove from
1677         return 0;
1678 
1679     sam_hrec_type_t *head = kh_val(hrecs->h, k);
1680     if (!head) {
1681         hts_log_error("Header inconsistency");
1682         return -1;
1683     }
1684 
1685     int ret = 0;
1686     sam_hrec_type_t *step = head->next;
1687     while (step != head) {
1688         sam_hrec_tag_t *tag = sam_hrecs_find_key(step, id, NULL);
1689         if (tag && tag->str && tag->len >= 3) {
1690            k = kh_get(rm, rh, tag->str+3);
1691            if (k == kh_end(rh)) { // value is not in the hash table, so remove
1692                sam_hrec_type_t *to_remove = step;
1693                step = step->next;
1694                ret |= sam_hrecs_remove_line(hrecs, type, to_remove);
1695            } else {
1696                step = step->next;
1697            }
1698         } else { // tag is not on the line, so skip to next line
1699             step = step->next;
1700         }
1701     }
1702 
1703     // process the first line
1704     sam_hrec_tag_t * tag = sam_hrecs_find_key(head, id, NULL);
1705     if (tag && tag->str && tag->len >= 3) {
1706        k = kh_get(rm, rh, tag->str+3);
1707        if (k == kh_end(rh)) { // value is not in the hash table, so remove
1708            sam_hrec_type_t *to_remove = head;
1709            head = head->next;
1710            ret |= sam_hrecs_remove_line(hrecs, type, to_remove);
1711        }
1712     }
1713 
1714     if (!ret && hrecs->dirty)
1715         redact_header_text(bh);
1716 
1717     return ret;
1718 }
1719 
sam_hdr_count_lines(sam_hdr_t * bh,const char * type)1720 int sam_hdr_count_lines(sam_hdr_t *bh, const char *type) {
1721     int count;
1722     sam_hrec_type_t *first_ty, *itr_ty;
1723 
1724     if (!bh || !type)
1725         return -1;
1726 
1727     if (!bh->hrecs) {
1728         if (sam_hdr_fill_hrecs(bh) != 0)
1729             return -1;
1730     }
1731 
1732     // Deal with types that have counts
1733     switch (type[0]) {
1734     case 'S':
1735         if (type[1] == 'Q')
1736             return bh->hrecs->nref;
1737         break;
1738     case 'R':
1739         if (type[1] == 'G')
1740             return bh->hrecs->nrg;
1741         break;
1742     case 'P':
1743         if (type[1] == 'G')
1744             return bh->hrecs->npg;
1745         break;
1746     default:
1747         break;
1748     }
1749 
1750     first_ty = sam_hrecs_find_type_id(bh->hrecs, type, NULL, NULL);
1751     if (!first_ty)
1752         return 0;
1753 
1754     count = 1;
1755     for (itr_ty = first_ty->next;
1756          itr_ty && itr_ty != first_ty; itr_ty = itr_ty->next) {
1757         count++;
1758     }
1759 
1760     return count;
1761 }
1762 
sam_hdr_line_index(sam_hdr_t * bh,const char * type,const char * key)1763 int sam_hdr_line_index(sam_hdr_t *bh,
1764                        const char *type,
1765                        const char *key) {
1766     sam_hrecs_t *hrecs;
1767     if (!bh || !type || !key)
1768         return -2;
1769 
1770     if (!(hrecs = bh->hrecs)) {
1771         if (sam_hdr_fill_hrecs(bh) != 0)
1772             return -2;
1773         hrecs = bh->hrecs;
1774     }
1775 
1776     khint_t k;
1777     int idx = -1;
1778     switch (type[0]) {
1779     case 'S':
1780         if (type[1] == 'Q') {
1781             k = kh_get(m_s2i, hrecs->ref_hash, key);
1782             if (k != kh_end(hrecs->ref_hash))
1783                 idx = kh_val(hrecs->ref_hash, k);
1784         } else {
1785             hts_log_warning("Type '%s' not supported. Only @SQ, @RG and @PG lines are indexed", type);
1786         }
1787         break;
1788     case 'R':
1789         if (type[1] == 'G') {
1790             k = kh_get(m_s2i, hrecs->rg_hash, key);
1791             if (k != kh_end(hrecs->rg_hash))
1792                 idx = kh_val(hrecs->rg_hash, k);
1793         } else {
1794             hts_log_warning("Type '%s' not supported. Only @SQ, @RG and @PG lines are indexed", type);
1795         }
1796         break;
1797     case 'P':
1798         if (type[1] == 'G') {
1799             k = kh_get(m_s2i, hrecs->pg_hash, key);
1800             if (k != kh_end(hrecs->pg_hash))
1801                 idx = kh_val(hrecs->pg_hash, k);
1802         } else {
1803             hts_log_warning("Type '%s' not supported. Only @SQ, @RG and @PG lines are indexed", type);
1804         }
1805         break;
1806     default:
1807         hts_log_warning("Type '%s' not supported. Only @SQ, @RG and @PG lines are indexed", type);
1808     }
1809 
1810     return idx;
1811 }
1812 
sam_hdr_line_name(sam_hdr_t * bh,const char * type,int pos)1813 const char *sam_hdr_line_name(sam_hdr_t *bh,
1814                               const char *type,
1815                               int pos) {
1816     sam_hrecs_t *hrecs;
1817     if (!bh || !type || pos < 0)
1818         return NULL;
1819 
1820     if (!(hrecs = bh->hrecs)) {
1821         if (sam_hdr_fill_hrecs(bh) != 0)
1822             return NULL;
1823         hrecs = bh->hrecs;
1824     }
1825 
1826     switch (type[0]) {
1827     case 'S':
1828         if (type[1] == 'Q') {
1829             if (pos < hrecs->nref)
1830                 return hrecs->ref[pos].name;
1831         } else {
1832             hts_log_warning("Type '%s' not supported. Only @SQ, @RG and @PG lines are indexed", type);
1833         }
1834         break;
1835     case 'R':
1836         if (type[1] == 'G') {
1837             if (pos < hrecs->nrg)
1838                 return hrecs->rg[pos].name;
1839         } else {
1840             hts_log_warning("Type '%s' not supported. Only @SQ, @RG and @PG lines are indexed", type);
1841         }
1842         break;
1843     case 'P':
1844         if (type[1] == 'G') {
1845             if (pos < hrecs->npg)
1846                 return hrecs->pg[pos].name;
1847         } else {
1848             hts_log_warning("Type '%s' not supported. Only @SQ, @RG and @PG lines are indexed", type);
1849         }
1850         break;
1851     default:
1852         hts_log_warning("Type '%s' not supported. Only @SQ, @RG and @PG lines are indexed", type);
1853     }
1854 
1855     return NULL;
1856 }
1857 
1858 /* ==== Key:val level methods ==== */
1859 
sam_hdr_find_tag_id(sam_hdr_t * bh,const char * type,const char * ID_key,const char * ID_value,const char * key,kstring_t * ks)1860 int sam_hdr_find_tag_id(sam_hdr_t *bh,
1861                      const char *type,
1862                      const char *ID_key,
1863                      const char *ID_value,
1864                      const char *key,
1865                      kstring_t *ks) {
1866     sam_hrecs_t *hrecs;
1867     if (!bh || !type || !key)
1868         return -2;
1869 
1870     if (!(hrecs = bh->hrecs)) {
1871         if (sam_hdr_fill_hrecs(bh) != 0)
1872             return -2;
1873         hrecs = bh->hrecs;
1874     }
1875 
1876     sam_hrec_type_t *ty = sam_hrecs_find_type_id(hrecs, type, ID_key, ID_value);
1877     if (!ty)
1878         return -1;
1879 
1880     sam_hrec_tag_t *tag = sam_hrecs_find_key(ty, key, NULL);
1881     if (!tag || !tag->str || tag->len < 4)
1882         return -1;
1883 
1884     ks->l = 0;
1885     if (kputsn(tag->str+3, tag->len-3, ks) == EOF) {
1886         return -2;
1887     }
1888 
1889     return 0;
1890 }
1891 
sam_hdr_find_tag_pos(sam_hdr_t * bh,const char * type,int pos,const char * key,kstring_t * ks)1892 int sam_hdr_find_tag_pos(sam_hdr_t *bh,
1893                      const char *type,
1894                      int pos,
1895                      const char *key,
1896                      kstring_t *ks) {
1897     sam_hrecs_t *hrecs;
1898     if (!bh || !type || !key)
1899         return -2;
1900 
1901     if (!(hrecs = bh->hrecs)) {
1902         if (sam_hdr_fill_hrecs(bh) != 0)
1903             return -2;
1904         hrecs = bh->hrecs;
1905     }
1906 
1907     sam_hrec_type_t *ty = sam_hrecs_find_type_pos(hrecs, type, pos);
1908     if (!ty)
1909         return -1;
1910 
1911     sam_hrec_tag_t *tag = sam_hrecs_find_key(ty, key, NULL);
1912     if (!tag || !tag->str || tag->len < 4)
1913         return -1;
1914 
1915     ks->l = 0;
1916     if (kputsn(tag->str+3, tag->len-3, ks) == EOF) {
1917         return -2;
1918     }
1919 
1920     return 0;
1921 }
1922 
sam_hdr_remove_tag_id(sam_hdr_t * bh,const char * type,const char * ID_key,const char * ID_value,const char * key)1923 int sam_hdr_remove_tag_id(sam_hdr_t *bh,
1924         const char *type,
1925         const char *ID_key,
1926         const char *ID_value,
1927         const char *key) {
1928     sam_hrecs_t *hrecs;
1929     if (!bh || !type || !key)
1930         return -1;
1931 
1932     if (!(hrecs = bh->hrecs)) {
1933         if (sam_hdr_fill_hrecs(bh) != 0)
1934             return -1;
1935         hrecs = bh->hrecs;
1936     }
1937 
1938     sam_hrec_type_t *ty = sam_hrecs_find_type_id(hrecs, type, ID_key, ID_value);
1939     if (!ty)
1940         return -1;
1941 
1942     int ret = sam_hrecs_remove_key(hrecs, ty, key);
1943     if (!ret && hrecs->dirty)
1944         redact_header_text(bh);
1945 
1946     return ret;
1947 }
1948 
1949 /*
1950  * Reconstructs a kstring from the header hash table.
1951  * Returns 0 on success
1952  *        -1 on failure
1953  */
sam_hrecs_rebuild_text(const sam_hrecs_t * hrecs,kstring_t * ks)1954 int sam_hrecs_rebuild_text(const sam_hrecs_t *hrecs, kstring_t *ks) {
1955     ks->l = 0;
1956 
1957     if (!hrecs->h || !hrecs->h->size) {
1958         return kputsn("", 0, ks) >= 0 ? 0 : -1;
1959     }
1960     if (sam_hrecs_rebuild_lines(hrecs, ks) != 0)
1961         return -1;
1962 
1963     return 0;
1964 }
1965 
1966 /*
1967  * Looks up a reference sequence by name and returns the numerical ID.
1968  * Returns -1 if unknown reference; -2 if header could not be parsed.
1969  */
sam_hdr_name2tid(sam_hdr_t * bh,const char * ref)1970 int sam_hdr_name2tid(sam_hdr_t *bh, const char *ref) {
1971     sam_hrecs_t *hrecs;
1972     khint_t k;
1973 
1974     if (!bh)
1975         return -1;
1976 
1977     if (!(hrecs = bh->hrecs)) {
1978         if (sam_hdr_fill_hrecs(bh) != 0)
1979             return -2;
1980         hrecs = bh->hrecs;
1981     }
1982 
1983     if (!hrecs->ref_hash)
1984         return -1;
1985 
1986     k = kh_get(m_s2i, hrecs->ref_hash, ref);
1987     return k == kh_end(hrecs->ref_hash) ? -1 : kh_val(hrecs->ref_hash, k);
1988 }
1989 
sam_hdr_tid2name(const sam_hdr_t * h,int tid)1990 const char *sam_hdr_tid2name(const sam_hdr_t *h, int tid) {
1991     sam_hrecs_t *hrecs;
1992 
1993     if (!h || tid < 0)
1994         return NULL;
1995 
1996     if ((hrecs = h->hrecs) != NULL && tid < hrecs->nref) {
1997         return hrecs->ref[tid].name;
1998     } else {
1999         if (tid < h->n_targets)
2000             return h->target_name[tid];
2001     }
2002 
2003     return NULL;
2004 }
2005 
sam_hdr_tid2len(const sam_hdr_t * h,int tid)2006 hts_pos_t sam_hdr_tid2len(const sam_hdr_t *h, int tid) {
2007     sam_hrecs_t *hrecs;
2008 
2009     if (!h || tid < 0)
2010         return 0;
2011 
2012     if ((hrecs = h->hrecs) != NULL && tid < hrecs->nref) {
2013         return hrecs->ref[tid].len;
2014     } else {
2015         if (tid < h->n_targets) {
2016             if (h->target_len[tid] < UINT32_MAX || !h->sdict) {
2017                 return h->target_len[tid];
2018             } else {
2019                 khash_t(s2i) *long_refs = (khash_t(s2i) *) h->sdict;
2020                 khint_t k = kh_get(s2i, long_refs, h->target_name[tid]);
2021                 if (k < kh_end(long_refs)) {
2022                     return kh_val(long_refs, k);
2023                 } else {
2024                     return UINT32_MAX;
2025                 }
2026             }
2027         }
2028     }
2029 
2030     return 0;
2031 }
2032 
2033 /*
2034  * Fixes any PP links in @PG headers.
2035  * If the entries are in order then this doesn't need doing, but in case
2036  * our header is out of order this goes through the hrecs->pg[] array
2037  * setting the prev_id field.
2038  *
2039  * Note we can have multiple complete chains. This code should identify the
2040  * tails of these chains as these are the entries we have to link to in
2041  * subsequent PP records.
2042  *
2043  * Returns 0 on success
2044  *        -1 on failure (indicating broken PG/PP records)
2045  */
sam_hdr_link_pg(sam_hdr_t * bh)2046 static int sam_hdr_link_pg(sam_hdr_t *bh) {
2047     sam_hrecs_t *hrecs;
2048     int i, j, ret = 0, *new_pg_end;
2049 
2050     if (!bh)
2051         return -1;
2052 
2053     if (!(hrecs = bh->hrecs)) {
2054         if (sam_hdr_fill_hrecs(bh) != 0)
2055             return -1;
2056         hrecs = bh->hrecs;
2057     }
2058 
2059     if (!hrecs->pgs_changed || !hrecs->npg)
2060         return 0;
2061 
2062     hrecs->npg_end_alloc = hrecs->npg;
2063     new_pg_end = realloc(hrecs->pg_end, hrecs->npg * sizeof(*new_pg_end));
2064     if (!new_pg_end)
2065         return -1;
2066     hrecs->pg_end = new_pg_end;
2067     int *chain_size = calloc(hrecs->npg, sizeof(int));
2068     if (!chain_size)
2069         return -1;
2070 
2071     for (i = 0; i < hrecs->npg; i++)
2072         hrecs->pg_end[i] = i;
2073 
2074     for (i = 0; i < hrecs->npg; i++) {
2075         khint_t k;
2076         sam_hrec_tag_t *tag;
2077 
2078         assert(hrecs->pg[i].ty != NULL);
2079         for (tag = hrecs->pg[i].ty->tag; tag; tag = tag->next) {
2080             if (tag->str[0] == 'P' && tag->str[1] == 'P')
2081                 break;
2082         }
2083         if (!tag) {
2084             /* Chain start points */
2085             continue;
2086         }
2087 
2088         k = kh_get(m_s2i, hrecs->pg_hash, tag->str+3);
2089 
2090         if (k == kh_end(hrecs->pg_hash)) {
2091             hts_log_warning("PG line with PN:%s has a PP link to missing program '%s'",
2092                     hrecs->pg[i].name, tag->str+3);
2093             continue;
2094         }
2095 
2096         hrecs->pg[i].prev_id = hrecs->pg[kh_val(hrecs->pg_hash, k)].id;
2097         hrecs->pg_end[kh_val(hrecs->pg_hash, k)] = -1;
2098         chain_size[i] = chain_size[kh_val(hrecs->pg_hash, k)]+1;
2099     }
2100 
2101     for (i = j = 0; i < hrecs->npg; i++) {
2102         if (hrecs->pg_end[i] != -1 && chain_size[i] > 0)
2103             hrecs->pg_end[j++] = hrecs->pg_end[i];
2104     }
2105     /* Only leafs? Choose the last one! */
2106     if (!j && hrecs->npg_end > 0) {
2107         hrecs->pg_end[0] = hrecs->pg_end[hrecs->npg_end-1];
2108         j = 1;
2109     }
2110 
2111     hrecs->npg_end = j;
2112     hrecs->pgs_changed = 0;
2113 
2114     /* mark as dirty or empty for rebuild */
2115     hrecs->dirty = 1;
2116     redact_header_text(bh);
2117     free(chain_size);
2118 
2119     return ret;
2120 }
2121 
2122 /*
2123  * Returns a unique ID from a base name.
2124  *
2125  * The value returned is valid until the next call to
2126  * this function.
2127  */
sam_hdr_pg_id(sam_hdr_t * bh,const char * name)2128 const char *sam_hdr_pg_id(sam_hdr_t *bh, const char *name) {
2129     sam_hrecs_t *hrecs;
2130     size_t name_len;
2131     const size_t name_extra = 17;
2132     if (!bh || !name)
2133         return NULL;
2134 
2135     if (!(hrecs = bh->hrecs)) {
2136         if (sam_hdr_fill_hrecs(bh) != 0)
2137             return NULL;
2138         hrecs = bh->hrecs;
2139     }
2140 
2141     khint_t k = kh_get(m_s2i, hrecs->pg_hash, name);
2142     if (k == kh_end(hrecs->pg_hash))
2143         return name;
2144 
2145     name_len = strlen(name);
2146     if (name_len > 1000) name_len = 1000;
2147     if (hrecs->ID_buf_sz < name_len + name_extra) {
2148         char *new_ID_buf = realloc(hrecs->ID_buf, name_len + name_extra);
2149         if (new_ID_buf == NULL)
2150             return NULL;
2151         hrecs->ID_buf = new_ID_buf;
2152         hrecs->ID_buf_sz = name_len + name_extra;
2153     }
2154 
2155     do {
2156         snprintf(hrecs->ID_buf, hrecs->ID_buf_sz, "%.1000s.%d", name, hrecs->ID_cnt++);
2157         k = kh_get(m_s2i, hrecs->pg_hash, hrecs->ID_buf);
2158     } while (k != kh_end(hrecs->pg_hash));
2159 
2160     return hrecs->ID_buf;
2161 }
2162 
2163 /*
2164  * Add an @PG line.
2165  *
2166  * If we wish complete control over this use sam_hdr_add_line() directly. This
2167  * function uses that, but attempts to do a lot of tedious house work for
2168  * you too.
2169  *
2170  * - It will generate a suitable ID if the supplied one clashes.
2171  * - It will generate multiple @PG records if we have multiple PG chains.
2172  *
2173  * Call it as per sam_hdr_add_line() with a series of key,value pairs ending
2174  * in NULL.
2175  *
2176  * Returns 0 on success
2177  *        -1 on failure
2178  */
sam_hdr_add_pg(sam_hdr_t * bh,const char * name,...)2179 int sam_hdr_add_pg(sam_hdr_t *bh, const char *name, ...) {
2180     sam_hrecs_t *hrecs;
2181     const char *specified_id = NULL, *specified_pn = NULL, *specified_pp = NULL;
2182     const char *key, *val;
2183     if (!bh)
2184         return -1;
2185 
2186     if (!(hrecs = bh->hrecs)) {
2187         if (sam_hdr_fill_hrecs(bh) != 0)
2188             return -1;
2189         hrecs = bh->hrecs;
2190     }
2191 
2192     bh->hrecs->pgs_changed = 1;
2193     if (sam_hdr_link_pg(bh) < 0) {
2194         hts_log_error("Error linking @PG lines");
2195         return -1;
2196     }
2197 
2198     va_list args;
2199     // Check for ID / PN / PP tags in varargs list
2200     va_start(args, name);
2201     while ((key = va_arg(args, const char *)) != NULL) {
2202         val = va_arg(args, const char *);
2203         if (!val) break;
2204         if (strcmp(key, "PN") == 0 && *val != '\0')
2205             specified_pn = val;
2206         else if (strcmp(key, "PP") == 0 && *val != '\0')
2207             specified_pp = val;
2208         else if (strcmp(key, "ID") == 0 && *val != '\0')
2209             specified_id = val;
2210     }
2211     va_end(args);
2212 
2213     if (specified_id && hrecs->pg_hash) {
2214         khint_t k = kh_get(m_s2i, hrecs->pg_hash, specified_id);
2215         if (k != kh_end(hrecs->pg_hash)) {
2216             hts_log_error("Header @PG ID:%s already present", specified_id);
2217             return -1;
2218         }
2219     }
2220 
2221     if (specified_pp && hrecs->pg_hash) {
2222         khint_t k = kh_get(m_s2i, hrecs->pg_hash, specified_pp);
2223         if (k == kh_end(hrecs->pg_hash)) {
2224             hts_log_error("Header @PG ID:%s referred to by PP tag not present",
2225                           specified_pp);
2226             return -1;
2227         }
2228     }
2229 
2230     if (!specified_pp && hrecs->npg_end) {
2231         /* Copy ends array to avoid us looping while modifying it */
2232         int *end = malloc(hrecs->npg_end * sizeof(int));
2233         int i, nends = hrecs->npg_end;
2234 
2235         if (!end)
2236             return -1;
2237 
2238         memcpy(end, hrecs->pg_end, nends * sizeof(*end));
2239 
2240         for (i = 0; i < nends; i++) {
2241             const char *id = !specified_id ? sam_hdr_pg_id(bh, name) : "";
2242             if (!id) {
2243                 free(end);
2244                 return -1;
2245             }
2246             va_start(args, name);
2247             if (-1 == sam_hrecs_vadd(hrecs, "PG", args,
2248                                      "ID", id,
2249                                      "PN", !specified_pn ? name : "",
2250                                      "PP", hrecs->pg[end[i]].name,
2251                                      NULL)) {
2252                 free(end);
2253                 return  -1;
2254             }
2255             va_end(args);
2256         }
2257 
2258         free(end);
2259     } else {
2260         const char *id = !specified_id ? sam_hdr_pg_id(bh, name) : "";
2261         if (!id)
2262             return -1;
2263         va_start(args, name);
2264         if (-1 == sam_hrecs_vadd(hrecs, "PG", args,
2265                                  "ID", id,
2266                                  "PN", !specified_pn ? name : "",
2267                                  NULL))
2268             return -1;
2269         va_end(args);
2270     }
2271 
2272     hrecs->dirty = 1;
2273     redact_header_text(bh);
2274 
2275     return 0;
2276 }
2277 
2278 /*! Increments a reference count on bh.
2279  *
2280  * This permits multiple files to share the same header, all calling
2281  * sam_hdr_destroy when done, without causing errors for other open files.
2282  */
sam_hdr_incr_ref(sam_hdr_t * bh)2283 void sam_hdr_incr_ref(sam_hdr_t *bh) {
2284     if (!bh)
2285         return;
2286     bh->ref_count++;
2287 }
2288 
2289 /* ==== Internal methods ==== */
2290 
2291 /*
2292  * Creates an empty SAM header.  Allocates space for the SAM header
2293  * structures (hash tables) ready to be populated.
2294  *
2295  * Returns a sam_hrecs_t struct on success (free with sam_hrecs_free())
2296  *         NULL on failure
2297  */
sam_hrecs_new()2298 sam_hrecs_t *sam_hrecs_new() {
2299     sam_hrecs_t *hrecs = calloc(1, sizeof(*hrecs));
2300 
2301     if (!hrecs)
2302         return NULL;
2303 
2304     hrecs->h = kh_init(sam_hrecs_t);
2305     if (!hrecs->h)
2306         goto err;
2307 
2308     hrecs->ID_cnt = 1;
2309 
2310     hrecs->nref = 0;
2311     hrecs->ref_sz = 0;
2312     hrecs->ref  = NULL;
2313     if (!(hrecs->ref_hash = kh_init(m_s2i)))
2314         goto err;
2315     hrecs->refs_changed = -1;
2316 
2317     hrecs->nrg = 0;
2318     hrecs->rg_sz = 0;
2319     hrecs->rg  = NULL;
2320     if (!(hrecs->rg_hash = kh_init(m_s2i)))
2321         goto err;
2322 
2323     hrecs->npg = 0;
2324     hrecs->pg_sz = 0;
2325     hrecs->pg  = NULL;
2326     hrecs->npg_end = hrecs->npg_end_alloc = 0;
2327     hrecs->pg_end = NULL;
2328     if (!(hrecs->pg_hash = kh_init(m_s2i)))
2329         goto err;
2330 
2331     if (!(hrecs->tag_pool = pool_create(sizeof(sam_hrec_tag_t))))
2332         goto err;
2333 
2334     if (!(hrecs->type_pool = pool_create(sizeof(sam_hrec_type_t))))
2335         goto err;
2336 
2337     if (!(hrecs->str_pool = string_pool_create(65536)))
2338         goto err;
2339 
2340     if (sam_hrecs_init_type_order(hrecs, NULL))
2341         goto err;
2342 
2343     return hrecs;
2344 
2345 err:
2346     if (hrecs->h)
2347         kh_destroy(sam_hrecs_t, hrecs->h);
2348 
2349     if (hrecs->tag_pool)
2350         pool_destroy(hrecs->tag_pool);
2351 
2352     if (hrecs->type_pool)
2353         pool_destroy(hrecs->type_pool);
2354 
2355     if (hrecs->str_pool)
2356         string_pool_destroy(hrecs->str_pool);
2357 
2358     free(hrecs);
2359 
2360     return NULL;
2361 }
2362 #if 0
2363 /*
2364  * Produces a duplicate copy of source and returns it.
2365  * Returns NULL on failure
2366  */
2367 sam_hrecs_t *sam_hrecs_dup(sam_hrecs_t *source) {
2368         return NULL;
2369 }
2370 #endif
2371 /*! Deallocates all storage used by a sam_hrecs_t struct.
2372  *
2373  * This also decrements the header reference count. If after decrementing
2374  * it is still non-zero then the header is assumed to be in use by another
2375  * caller and the free is not done.
2376  *
2377  */
sam_hrecs_free(sam_hrecs_t * hrecs)2378 void sam_hrecs_free(sam_hrecs_t *hrecs) {
2379     if (!hrecs)
2380         return;
2381 
2382     if (hrecs->h)
2383         kh_destroy(sam_hrecs_t, hrecs->h);
2384 
2385     if (hrecs->ref_hash)
2386         kh_destroy(m_s2i, hrecs->ref_hash);
2387 
2388     if (hrecs->ref)
2389         free(hrecs->ref);
2390 
2391     if (hrecs->rg_hash)
2392         kh_destroy(m_s2i, hrecs->rg_hash);
2393 
2394     if (hrecs->rg)
2395         free(hrecs->rg);
2396 
2397     if (hrecs->pg_hash)
2398         kh_destroy(m_s2i, hrecs->pg_hash);
2399 
2400     if (hrecs->pg)
2401         free(hrecs->pg);
2402 
2403     if (hrecs->pg_end)
2404         free(hrecs->pg_end);
2405 
2406     if (hrecs->type_pool)
2407         pool_destroy(hrecs->type_pool);
2408 
2409     if (hrecs->tag_pool)
2410         pool_destroy(hrecs->tag_pool);
2411 
2412     if (hrecs->str_pool)
2413         string_pool_destroy(hrecs->str_pool);
2414 
2415     if (hrecs->type_order)
2416         free(hrecs->type_order);
2417 
2418     if (hrecs->ID_buf)
2419         free(hrecs->ID_buf);
2420 
2421     free(hrecs);
2422 }
2423 
2424 /*
2425  * Internal method already used by the CRAM code
2426  * Returns the first header item matching 'type'. If ID is non-NULL it checks
2427  * for the tag ID: and compares against the specified ID.
2428  *
2429  * Returns NULL if no type/ID is found
2430  */
sam_hrecs_find_type_id(sam_hrecs_t * hrecs,const char * type,const char * ID_key,const char * ID_value)2431 sam_hrec_type_t *sam_hrecs_find_type_id(sam_hrecs_t *hrecs, const char *type,
2432                                      const char *ID_key, const char *ID_value) {
2433     if (!hrecs || !type)
2434         return NULL;
2435     sam_hrec_type_t *t1, *t2;
2436     khint_t k;
2437 
2438     /* Special case for types we have prebuilt hashes on */
2439     if (ID_key) {
2440         if (!ID_value)
2441             return NULL;
2442 
2443         if (type[0]   == 'S' && type[1]   == 'Q' &&
2444             ID_key[0] == 'S' && ID_key[1] == 'N') {
2445             k = kh_get(m_s2i, hrecs->ref_hash, ID_value);
2446             return k != kh_end(hrecs->ref_hash)
2447                 ? hrecs->ref[kh_val(hrecs->ref_hash, k)].ty
2448                 : NULL;
2449         }
2450 
2451         if (type[0]   == 'R' && type[1]   == 'G' &&
2452             ID_key[0] == 'I' && ID_key[1] == 'D') {
2453             k = kh_get(m_s2i, hrecs->rg_hash, ID_value);
2454             return k != kh_end(hrecs->rg_hash)
2455                 ? hrecs->rg[kh_val(hrecs->rg_hash, k)].ty
2456                 : NULL;
2457         }
2458 
2459         if (type[0]   == 'P' && type[1]   == 'G' &&
2460             ID_key[0] == 'I' && ID_key[1] == 'D') {
2461             k = kh_get(m_s2i, hrecs->pg_hash, ID_value);
2462             return k != kh_end(hrecs->pg_hash)
2463                 ? hrecs->pg[kh_val(hrecs->pg_hash, k)].ty
2464                 : NULL;
2465         }
2466     }
2467 
2468     k = kh_get(sam_hrecs_t, hrecs->h, TYPEKEY(type));
2469     if (k == kh_end(hrecs->h))
2470         return NULL;
2471 
2472     if (!ID_key)
2473         return kh_val(hrecs->h, k);
2474 
2475     t1 = t2 = kh_val(hrecs->h, k);
2476     do {
2477         sam_hrec_tag_t *tag;
2478         for (tag = t1->tag; tag; tag = tag->next) {
2479             if (tag->str[0] == ID_key[0] && tag->str[1] == ID_key[1]) {
2480                 const char *cp1 = tag->str+3;
2481                 const char *cp2 = ID_value;
2482                 while (*cp1 && *cp1 == *cp2)
2483                     cp1++, cp2++;
2484                 if (*cp2 || *cp1)
2485                     continue;
2486                 return t1;
2487             }
2488         }
2489         t1 = t1->next;
2490     } while (t1 != t2);
2491 
2492     return NULL;
2493 }
2494 
2495 /*
2496  * Adds or updates tag key,value pairs in a header line.
2497  * Eg for adding M5 tags to @SQ lines or updating sort order for the
2498  * @HD line.
2499  *
2500  * va_list contains multiple key,value pairs ending in NULL.
2501  *
2502  * Returns 0 on success
2503  *        -1 on failure
2504  */
sam_hrecs_vupdate(sam_hrecs_t * hrecs,sam_hrec_type_t * type,va_list ap)2505 int sam_hrecs_vupdate(sam_hrecs_t *hrecs, sam_hrec_type_t *type, va_list ap) {
2506     if (!hrecs)
2507         return -1;
2508 
2509     for (;;) {
2510         char *k, *v, *str;
2511         sam_hrec_tag_t *tag, *prev = NULL;
2512 
2513         if (!(k = (char *)va_arg(ap, char *)))
2514             break;
2515         if (!(v = va_arg(ap, char *)))
2516             v = "";
2517 
2518         tag = sam_hrecs_find_key(type, k, &prev);
2519         if (!tag) {
2520             if (!(tag = pool_alloc(hrecs->tag_pool)))
2521                 return -1;
2522             if (prev)
2523                 prev->next = tag;
2524             else
2525                 type->tag = tag;
2526 
2527             tag->next = NULL;
2528         }
2529 
2530         tag->len = 3 + strlen(v);
2531         str = string_alloc(hrecs->str_pool, tag->len+1);
2532         if (!str)
2533             return -1;
2534 
2535         if (snprintf(str, tag->len+1, "%2.2s:%s", k, v) < 0)
2536             return -1;
2537 
2538         tag->str = str;
2539     }
2540 
2541     hrecs->dirty = 1; //mark text as dirty and force a rebuild
2542 
2543     return 0;
2544 }
2545 
2546 /*
2547  * Adds or updates tag key,value pairs in a header line.
2548  * Eg for adding M5 tags to @SQ lines or updating sort order for the
2549  * @HD line.
2550  *
2551  * Specify multiple key,value pairs ending in NULL.
2552  *
2553  * Returns 0 on success
2554  *        -1 on failure
2555  */
sam_hrecs_update(sam_hrecs_t * hrecs,sam_hrec_type_t * type,...)2556 static int sam_hrecs_update(sam_hrecs_t *hrecs, sam_hrec_type_t *type, ...) {
2557     va_list args;
2558     int res;
2559     va_start(args, type);
2560     res = sam_hrecs_vupdate(hrecs, type, args);
2561     va_end(args);
2562     return res;
2563 }
2564 
2565 /*
2566  * Looks for a specific key in a single sam header line identified by *type.
2567  * If prev is non-NULL it also fills this out with the previous tag, to
2568  * permit use in key removal. *prev is set to NULL when the tag is the first
2569  * key in the list. When a tag isn't found, prev (if non NULL) will be the last
2570  * tag in the existing list.
2571  *
2572  * Returns the tag pointer on success
2573  *         NULL on failure
2574  */
sam_hrecs_find_key(sam_hrec_type_t * type,const char * key,sam_hrec_tag_t ** prev)2575 sam_hrec_tag_t *sam_hrecs_find_key(sam_hrec_type_t *type,
2576                                    const char *key,
2577                                    sam_hrec_tag_t **prev) {
2578     sam_hrec_tag_t *tag, *p = NULL;
2579     if (!type)
2580         return NULL;
2581 
2582     for (tag = type->tag; tag; p = tag, tag = tag->next) {
2583         if (tag->str[0] == key[0] && tag->str[1] == key[1]) {
2584             if (prev)
2585                 *prev = p;
2586             return tag;
2587         }
2588     }
2589 
2590     if (prev)
2591         *prev = p;
2592 
2593     return NULL;
2594 }
2595 
sam_hrecs_remove_key(sam_hrecs_t * hrecs,sam_hrec_type_t * type,const char * key)2596 int sam_hrecs_remove_key(sam_hrecs_t *hrecs,
2597                          sam_hrec_type_t *type,
2598                          const char *key) {
2599     sam_hrec_tag_t *tag, *prev;
2600     if (!hrecs)
2601         return -1;
2602     tag = sam_hrecs_find_key(type, key, &prev);
2603     if (!tag)
2604         return 0; // Not there anyway
2605 
2606     if (type->type == TYPEKEY("SQ") && tag->str[0] == 'A' && tag->str[1] == 'N') {
2607         assert(tag->len >= 3);
2608         sam_hrec_tag_t *sn_tag = sam_hrecs_find_key(type, "SN", NULL);
2609         if (sn_tag) {
2610             assert(sn_tag->len >= 3);
2611             khint_t k = kh_get(m_s2i, hrecs->ref_hash, sn_tag->str + 3);
2612             if (k != kh_end(hrecs->ref_hash))
2613                 sam_hrecs_remove_ref_altnames(hrecs, kh_val(hrecs->ref_hash, k), tag->str + 3);
2614         }
2615     }
2616 
2617     if (!prev) { //first tag
2618         type->tag = tag->next;
2619     } else {
2620         prev->next = tag->next;
2621     }
2622     pool_free(hrecs->tag_pool, tag);
2623     hrecs->dirty = 1; //mark text as dirty and force a rebuild
2624 
2625     return 1;
2626 }
2627 
2628 /*
2629  * Looks up a read-group by name and returns a pointer to the start of the
2630  * associated tag list.
2631  *
2632  * Returns NULL on failure
2633  */
sam_hrecs_find_rg(sam_hrecs_t * hrecs,const char * rg)2634 sam_hrec_rg_t *sam_hrecs_find_rg(sam_hrecs_t *hrecs, const char *rg) {
2635     khint_t k = kh_get(m_s2i, hrecs->rg_hash, rg);
2636     return k == kh_end(hrecs->rg_hash)
2637         ? NULL
2638         : &hrecs->rg[kh_val(hrecs->rg_hash, k)];
2639 }
2640 
2641 #if DEBUG_HEADER
sam_hrecs_dump(sam_hrecs_t * hrecs)2642 void sam_hrecs_dump(sam_hrecs_t *hrecs) {
2643     khint_t k;
2644     int i;
2645 
2646     printf("===DUMP===\n");
2647     for (k = kh_begin(hrecs->h); k != kh_end(hrecs->h); k++) {
2648         sam_hrec_type_t *t1, *t2;
2649         char c[2];
2650         int idx = 0;
2651 
2652         if (!kh_exist(hrecs->h, k))
2653             continue;
2654 
2655         t1 = t2 = kh_val(hrecs->h, k);
2656         c[0] = kh_key(hrecs->h, k)>>8;
2657         c[1] = kh_key(hrecs->h, k)&0xff;
2658         printf("Type %.2s\n", c);
2659 
2660         do {
2661             sam_hrec_tag_t *tag;
2662             printf(">>>%d ", idx++);
2663             for (tag = t1->tag; tag; tag=tag->next) {
2664                 if (strncmp(c, "CO", 2))
2665                     printf("\"%.2s\":\"%.*s\"\t", tag->str, tag->len-3, tag->str+3);
2666                 else
2667                     printf("%s", tag->str);
2668             }
2669             putchar('\n');
2670             t1 = t1->next;
2671         } while (t1 != t2);
2672     }
2673 
2674     /* Dump out PG chains */
2675     printf("\n@PG chains:\n");
2676     for (i = 0; i < hrecs->npg_end; i++) {
2677         int j;
2678         printf("  %d:", i);
2679         for (j = hrecs->pg_end[i]; j != -1; j = hrecs->pg[j].prev_id) {
2680             printf("%s%d(%.*s)",
2681                    j == hrecs->pg_end[i] ? " " : "->",
2682                    j, hrecs->pg[j].name_len, hrecs->pg[j].name);
2683         }
2684         printf("\n");
2685     }
2686 
2687     puts("===END DUMP===");
2688 }
2689 #endif
2690 
2691 /*
2692  * Returns the sort order:
2693  */
sam_hrecs_sort_order(sam_hrecs_t * hrecs)2694 enum sam_sort_order sam_hrecs_sort_order(sam_hrecs_t *hrecs) {
2695     khint_t k;
2696     enum sam_sort_order so;
2697 
2698     so = ORDER_UNKNOWN;
2699     k = kh_get(sam_hrecs_t, hrecs->h, TYPEKEY("HD"));
2700     if (k != kh_end(hrecs->h)) {
2701         sam_hrec_type_t *ty = kh_val(hrecs->h, k);
2702         sam_hrec_tag_t *tag;
2703         for (tag = ty->tag; tag; tag = tag->next) {
2704             if (tag->str[0] == 'S' && tag->str[1] == 'O') {
2705                 if (strcmp(tag->str+3, "unsorted") == 0)
2706                     so = ORDER_UNSORTED;
2707                 else if (strcmp(tag->str+3, "queryname") == 0)
2708                     so = ORDER_NAME;
2709                 else if (strcmp(tag->str+3, "coordinate") == 0)
2710                     so = ORDER_COORD;
2711                 else if (strcmp(tag->str+3, "unknown") != 0)
2712                     hts_log_error("Unknown sort order field: %s", tag->str+3);
2713             }
2714         }
2715     }
2716 
2717     return so;
2718 }
2719 
sam_hrecs_group_order(sam_hrecs_t * hrecs)2720 enum sam_group_order sam_hrecs_group_order(sam_hrecs_t *hrecs) {
2721     khint_t k;
2722     enum sam_group_order go;
2723 
2724     go = ORDER_NONE;
2725     k = kh_get(sam_hrecs_t, hrecs->h, TYPEKEY("HD"));
2726     if (k != kh_end(hrecs->h)) {
2727         sam_hrec_type_t *ty = kh_val(hrecs->h, k);
2728         sam_hrec_tag_t *tag;
2729         for (tag = ty->tag; tag; tag = tag->next) {
2730             if (tag->str[0] == 'G' && tag->str[1] == 'O') {
2731                 if (strcmp(tag->str+3, "query") == 0)
2732                     go = ORDER_QUERY;
2733                 else if (strcmp(tag->str+3, "reference") == 0)
2734                     go = ORDER_REFERENCE;
2735             }
2736         }
2737     }
2738 
2739     return go;
2740 }
2741