1 /*
2 Copyright (c) 2018-2020 Genome Research Ltd.
3 Authors: James Bonfield <jkb@sanger.ac.uk>, Valeriu Ohan <vo2@sanger.ac.uk>
4
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
7
8 1. Redistributions of source code must retain the above copyright notice,
9 this list of conditions and the following disclaimer.
10
11 2. Redistributions in binary form must reproduce the above copyright notice,
12 this list of conditions and the following disclaimer in the documentation
13 and/or other materials provided with the distribution.
14
15 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16 Institute nor the names of its contributors may be used to endorse or promote
17 products derived from this software without specific prior written permission.
18
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31 #define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
32 #include <config.h>
33
34 #include <string.h>
35 #include <assert.h>
36 #include <errno.h>
37 #include "textutils_internal.h"
38 #include "header.h"
39
40 // Hash table for removing multiple lines from the header
41 KHASH_SET_INIT_STR(rm)
42 // Used for long refs in SAM files
43 KHASH_DECLARE(s2i, kh_cstr_t, int64_t)
44
45 typedef khash_t(rm) rmhash_t;
46
47 static int sam_hdr_link_pg(sam_hdr_t *bh);
48
49 static int sam_hrecs_vupdate(sam_hrecs_t *hrecs, sam_hrec_type_t *type, va_list ap);
50 static int sam_hrecs_update(sam_hrecs_t *hrecs, sam_hrec_type_t *type, ...);
51
52
53 #define MAX_ERROR_QUOTE 320 // Prevent over-long error messages
sam_hrecs_error(const char * msg,const char * line,size_t len,size_t lno)54 static void sam_hrecs_error(const char *msg, const char *line, size_t len, size_t lno) {
55 int j;
56
57 if (len > MAX_ERROR_QUOTE)
58 len = MAX_ERROR_QUOTE;
59 for (j = 0; j < len && line[j] != '\n'; j++)
60 ;
61 hts_log_error("%s at line %zd: \"%.*s\"", msg, lno, j, line);
62 }
63
64 /* ==== Static methods ==== */
65
sam_hrecs_init_type_order(sam_hrecs_t * hrecs,char * type_list)66 static int sam_hrecs_init_type_order(sam_hrecs_t *hrecs, char *type_list) {
67 if (!hrecs)
68 return -1;
69
70 if (!type_list) {
71 hrecs->type_count = 5;
72 hrecs->type_order = calloc(hrecs->type_count, 3);
73 if (!hrecs->type_order)
74 return -1;
75 memcpy(hrecs->type_order[0], "HD", 2);
76 memcpy(hrecs->type_order[1], "SQ", 2);
77 memcpy(hrecs->type_order[2], "RG", 2);
78 memcpy(hrecs->type_order[3], "PG", 2);
79 memcpy(hrecs->type_order[4], "CO", 2);
80 }
81
82 return 0;
83 }
84
sam_hrecs_add_ref_altnames(sam_hrecs_t * hrecs,int nref,const char * list)85 static int sam_hrecs_add_ref_altnames(sam_hrecs_t *hrecs, int nref, const char *list) {
86 const char *token;
87 ks_tokaux_t aux;
88
89 if (!list)
90 return 0;
91
92 for (token = kstrtok(list, ",", &aux); token; token = kstrtok(NULL, NULL, &aux)) {
93 if (aux.p == token)
94 continue;
95
96 char *name = string_ndup(hrecs->str_pool, token, aux.p - token);
97 if (!name)
98 return -1;
99 int r;
100 khint_t k = kh_put(m_s2i, hrecs->ref_hash, name, &r);
101 if (r < 0) return -1;
102
103 if (r > 0)
104 kh_val(hrecs->ref_hash, k) = nref;
105 else if (kh_val(hrecs->ref_hash, k) != nref)
106 hts_log_warning("Duplicate entry AN:\"%s\" in sam header", name);
107 }
108
109 return 0;
110 }
111
sam_hrecs_remove_ref_altnames(sam_hrecs_t * hrecs,int expected,const char * list)112 static void sam_hrecs_remove_ref_altnames(sam_hrecs_t *hrecs, int expected, const char *list) {
113 const char *token, *sn;
114 ks_tokaux_t aux;
115 kstring_t str = KS_INITIALIZE;
116
117 if (expected < 0 || expected >= hrecs->nref)
118 return;
119 sn = hrecs->ref[expected].name;
120
121 for (token = kstrtok(list, ",", &aux); token; token = kstrtok(NULL, NULL, &aux)) {
122 kputsn(token, aux.p - token, ks_clear(&str));
123 khint_t k = kh_get(m_s2i, hrecs->ref_hash, str.s);
124 if (k != kh_end(hrecs->ref_hash)
125 && kh_val(hrecs->ref_hash, k) == expected
126 && strcmp(sn, str.s) != 0)
127 kh_del(m_s2i, hrecs->ref_hash, k);
128 }
129
130 free(str.s);
131 }
132
133 /* Updates the hash tables in the sam_hrecs_t structure.
134 *
135 * Returns 0 on success;
136 * -1 on failure
137 */
sam_hrecs_update_hashes(sam_hrecs_t * hrecs,khint32_t type,sam_hrec_type_t * h_type)138 static int sam_hrecs_update_hashes(sam_hrecs_t *hrecs,
139 khint32_t type,
140 sam_hrec_type_t *h_type) {
141 /* Add to reference hash? */
142 if (type == TYPEKEY("SQ")) {
143 sam_hrec_tag_t *tag = h_type->tag;
144 int nref = hrecs->nref;
145 const char *name = NULL;
146 const char *altnames = NULL;
147 hts_pos_t len = -1;
148 int r;
149 khint_t k;
150
151 while (tag) {
152 if (tag->str[0] == 'S' && tag->str[1] == 'N') {
153 assert(tag->len >= 3);
154 name = tag->str+3;
155 } else if (tag->str[0] == 'L' && tag->str[1] == 'N') {
156 assert(tag->len >= 3);
157 len = strtoll(tag->str+3, NULL, 10);
158 } else if (tag->str[0] == 'A' && tag->str[1] == 'N') {
159 assert(tag->len >= 3);
160 altnames = tag->str+3;
161 }
162 tag = tag->next;
163 }
164
165 if (!name) {
166 hts_log_error("Header includes @SQ line with no SN: tag");
167 return -1; // SN should be present, according to spec.
168 }
169
170 if (len == -1) {
171 hts_log_error("Header includes @SQ line \"%s\" with no LN: tag",
172 name);
173 return -1; // LN should be present, according to spec.
174 }
175
176 // Seen already?
177 k = kh_get(m_s2i, hrecs->ref_hash, name);
178 if (k < kh_end(hrecs->ref_hash)) {
179 nref = kh_val(hrecs->ref_hash, k);
180 int ref_changed_flag = 0;
181
182 // Check for hash entry added by sam_hrecs_refs_from_targets_array()
183 if (hrecs->ref[nref].ty == NULL) {
184 // Attach header line to existing stub entry.
185 hrecs->ref[nref].ty = h_type;
186 // Check lengths match; correct if not.
187 if (len != hrecs->ref[nref].len) {
188 char tmp[32];
189 snprintf(tmp, sizeof(tmp), "%" PRIhts_pos,
190 hrecs->ref[nref].len);
191 if (sam_hrecs_update(hrecs, h_type, "LN", tmp, NULL) < 0)
192 return -1;
193 ref_changed_flag = 1;
194 }
195 if (sam_hrecs_add_ref_altnames(hrecs, nref, altnames) < 0)
196 return -1;
197
198 if (ref_changed_flag && (hrecs->refs_changed < 0 || hrecs->refs_changed > nref))
199 hrecs->refs_changed = nref;
200 return 0;
201 }
202
203 // Check to see if an existing entry is being updated
204 if (hrecs->ref[nref].ty == h_type) {
205 if (hrecs->ref[nref].len != len) {
206 hrecs->ref[nref].len = len;
207 ref_changed_flag = 1;
208 }
209 if (!hrecs->ref[nref].name || strcmp(hrecs->ref[nref].name, name)) {
210 hrecs->ref[nref].name = name;
211 ref_changed_flag = 1;
212 }
213 if (sam_hrecs_add_ref_altnames(hrecs, nref, altnames) < 0)
214 return -1;
215
216 if (ref_changed_flag && (hrecs->refs_changed < 0 || hrecs->refs_changed > nref))
217 hrecs->refs_changed = nref;
218 return 0;
219 }
220
221 // If here, the name is a duplicate.
222 // Check to see if it matches the SN: tag from the earlier record.
223 if (strcmp(hrecs->ref[nref].name, name) == 0) {
224 hts_log_error("Duplicate entry \"%s\" in sam header",
225 name);
226 return -1;
227 }
228
229 // Clash with an already-seen altname
230 // As SN: should be preferred to AN: add this as a new
231 // record and update the hash entry to point to it.
232 hts_log_warning("Ref name SN:\"%s\" is a duplicate of an existing AN key", name);
233 nref = hrecs->nref;
234 }
235
236 if (nref == hrecs->ref_sz) {
237 size_t new_sz = hrecs->ref_sz >= 4 ? hrecs->ref_sz + (hrecs->ref_sz / 4) : 32;
238 sam_hrec_sq_t *new_ref = realloc(hrecs->ref, sizeof(*hrecs->ref) * new_sz);
239 if (!new_ref)
240 return -1;
241 hrecs->ref = new_ref;
242 hrecs->ref_sz = new_sz;
243 }
244
245 hrecs->ref[nref].name = name;
246 hrecs->ref[nref].len = len;
247 hrecs->ref[nref].ty = h_type;
248
249 k = kh_put(m_s2i, hrecs->ref_hash, hrecs->ref[nref].name, &r);
250 if (-1 == r) return -1;
251 kh_val(hrecs->ref_hash, k) = nref;
252
253 if (sam_hrecs_add_ref_altnames(hrecs, nref, altnames) < 0)
254 return -1;
255
256 if (hrecs->refs_changed < 0 || hrecs->refs_changed > hrecs->nref)
257 hrecs->refs_changed = hrecs->nref;
258 hrecs->nref++;
259 }
260
261 /* Add to read-group hash? */
262 if (type == TYPEKEY("RG")) {
263 sam_hrec_tag_t *tag = sam_hrecs_find_key(h_type, "ID", NULL);
264 int nrg = hrecs->nrg, r;
265 khint_t k;
266
267 if (!tag) {
268 hts_log_error("Header includes @RG line with no ID: tag");
269 return -1; // ID should be present, according to spec.
270 }
271 assert(tag->str && tag->len >= 3);
272
273 // Seen already?
274 k = kh_get(m_s2i, hrecs->rg_hash, tag->str + 3);
275 if (k < kh_end(hrecs->rg_hash)) {
276 nrg = kh_val(hrecs->rg_hash, k);
277 assert(hrecs->rg[nrg].ty != NULL);
278 if (hrecs->rg[nrg].ty != h_type) {
279 hts_log_warning("Duplicate entry \"%s\" in sam header",
280 tag->str + 3);
281 } else {
282 hrecs->rg[nrg].name = tag->str + 3;
283 hrecs->rg[nrg].name_len = tag->len - 3;
284 }
285 return 0;
286 }
287
288 if (nrg == hrecs->rg_sz) {
289 size_t new_sz = hrecs->rg_sz >= 4 ? hrecs->rg_sz + hrecs->rg_sz / 4 : 4;
290 sam_hrec_rg_t *new_rg = realloc(hrecs->rg, sizeof(*hrecs->rg) * new_sz);
291 if (!new_rg)
292 return -1;
293 hrecs->rg = new_rg;
294 hrecs->rg_sz = new_sz;
295 }
296
297 hrecs->rg[nrg].name = tag->str + 3;
298 hrecs->rg[nrg].name_len = tag->len - 3;
299 hrecs->rg[nrg].ty = h_type;
300 hrecs->rg[nrg].id = nrg;
301
302 k = kh_put(m_s2i, hrecs->rg_hash, hrecs->rg[nrg].name, &r);
303 if (-1 == r) return -1;
304 kh_val(hrecs->rg_hash, k) = nrg;
305
306 hrecs->nrg++;
307 }
308
309 /* Add to program hash? */
310 if (type == TYPEKEY("PG")) {
311 sam_hrec_tag_t *tag;
312 sam_hrec_pg_t *new_pg;
313 int npg = hrecs->npg;
314
315 if (npg == hrecs->pg_sz) {
316 size_t new_sz = hrecs->pg_sz >= 4 ? hrecs->pg_sz + hrecs->pg_sz / 4 : 4;
317 new_pg = realloc(hrecs->pg, sizeof(*hrecs->pg) * new_sz);
318 if (!new_pg)
319 return -1;
320 hrecs->pg = new_pg;
321 hrecs->pg_sz = new_sz;
322 }
323
324 tag = h_type->tag;
325 hrecs->pg[npg].name = NULL;
326 hrecs->pg[npg].name_len = 0;
327 hrecs->pg[npg].ty = h_type;
328 hrecs->pg[npg].id = npg;
329 hrecs->pg[npg].prev_id = -1;
330
331 while (tag) {
332 if (tag->str[0] == 'I' && tag->str[1] == 'D') {
333 /* Avoid duplicate ID tags coming from other applications */
334 if (!hrecs->pg[npg].name) {
335 assert(tag->len >= 3);
336 hrecs->pg[npg].name = tag->str + 3;
337 hrecs->pg[npg].name_len = tag->len - 3;
338 } else {
339 hts_log_warning("PG line with multiple ID tags. The first encountered was preferred - ID:%s", hrecs->pg[npg].name);
340 }
341 } else if (tag->str[0] == 'P' && tag->str[1] == 'P') {
342 // Resolve later if needed
343 khint_t k;
344 k = kh_get(m_s2i, hrecs->pg_hash, tag->str+3);
345
346 if (k != kh_end(hrecs->pg_hash)) {
347 int p_id = kh_val(hrecs->pg_hash, k);
348 hrecs->pg[npg].prev_id = hrecs->pg[p_id].id;
349
350 /* Unmark previous entry as a PG termination */
351 if (hrecs->npg_end > 0 &&
352 hrecs->pg_end[hrecs->npg_end-1] == p_id) {
353 hrecs->npg_end--;
354 } else {
355 int i;
356 for (i = 0; i < hrecs->npg_end; i++) {
357 if (hrecs->pg_end[i] == p_id) {
358 memmove(&hrecs->pg_end[i], &hrecs->pg_end[i+1],
359 (hrecs->npg_end-i-1)*sizeof(*hrecs->pg_end));
360 hrecs->npg_end--;
361 }
362 }
363 }
364 } else {
365 hrecs->pg[npg].prev_id = -1;
366 }
367 }
368 tag = tag->next;
369 }
370
371 if (hrecs->pg[npg].name) {
372 khint_t k;
373 int r;
374 k = kh_put(m_s2i, hrecs->pg_hash, hrecs->pg[npg].name, &r);
375 if (-1 == r) return -1;
376 kh_val(hrecs->pg_hash, k) = npg;
377 } else {
378 return -1; // ID should be present, according to spec.
379 }
380
381 /* Add to npg_end[] array. Remove later if we find a PP line */
382 if (hrecs->npg_end >= hrecs->npg_end_alloc) {
383 int *new_pg_end;
384 int new_alloc = hrecs->npg_end_alloc ? hrecs->npg_end_alloc*2 : 4;
385
386 new_pg_end = realloc(hrecs->pg_end, new_alloc * sizeof(int));
387 if (!new_pg_end)
388 return -1;
389 hrecs->npg_end_alloc = new_alloc;
390 hrecs->pg_end = new_pg_end;
391 }
392 hrecs->pg_end[hrecs->npg_end++] = npg;
393
394 hrecs->npg++;
395 }
396
397 return 0;
398 }
399
sam_hrecs_remove_hash_entry(sam_hrecs_t * hrecs,khint32_t type,sam_hrec_type_t * h_type)400 static int sam_hrecs_remove_hash_entry(sam_hrecs_t *hrecs, khint32_t type, sam_hrec_type_t *h_type) {
401 if (!hrecs || !h_type)
402 return -1;
403
404 sam_hrec_tag_t *tag;
405 const char *key = NULL;
406 khint_t k;
407
408 /* Remove name and any alternative names from reference hash */
409 if (type == TYPEKEY("SQ")) {
410 const char *altnames = NULL;
411
412 tag = h_type->tag;
413
414 while (tag) {
415 if (tag->str[0] == 'S' && tag->str[1] == 'N') {
416 assert(tag->len >= 3);
417 key = tag->str + 3;
418 } else if (tag->str[0] == 'A' && tag->str[1] == 'N') {
419 assert(tag->len >= 3);
420 altnames = tag->str + 3;
421 }
422 tag = tag->next;
423 }
424
425 if (key) {
426 k = kh_get(m_s2i, hrecs->ref_hash, key);
427 if (k != kh_end(hrecs->ref_hash)) {
428 int idx = kh_val(hrecs->ref_hash, k);
429 if (idx + 1 < hrecs->nref)
430 memmove(&hrecs->ref[idx], &hrecs->ref[idx+1],
431 sizeof(sam_hrec_sq_t)*(hrecs->nref - idx - 1));
432 if (altnames)
433 sam_hrecs_remove_ref_altnames(hrecs, idx, altnames);
434 kh_del(m_s2i, hrecs->ref_hash, k);
435 hrecs->nref--;
436 if (hrecs->refs_changed < 0 || hrecs->refs_changed > idx)
437 hrecs->refs_changed = idx;
438 for (k = 0; k < kh_end(hrecs->ref_hash); k++) {
439 if (kh_exist(hrecs->ref_hash, k)
440 && kh_value(hrecs->ref_hash, k) > idx) {
441 kh_value(hrecs->ref_hash, k)--;
442 }
443 }
444 }
445 }
446 }
447
448 /* Remove from read-group hash */
449 if (type == TYPEKEY("RG")) {
450 tag = h_type->tag;
451
452 while (tag) {
453 if (tag->str[0] == 'I' && tag->str[1] == 'D') {
454 assert(tag->len >= 3);
455 key = tag->str + 3;
456 k = kh_get(m_s2i, hrecs->rg_hash, key);
457 if (k != kh_end(hrecs->rg_hash)) {
458 int idx = kh_val(hrecs->rg_hash, k);
459 if (idx + 1 < hrecs->nrg)
460 memmove(&hrecs->rg[idx], &hrecs->rg[idx+1], sizeof(sam_hrec_rg_t)*(hrecs->nrg - idx - 1));
461 kh_del(m_s2i, hrecs->rg_hash, k);
462 hrecs->nrg--;
463 for (k = 0; k < kh_end(hrecs->rg_hash); k++) {
464 if (kh_exist(hrecs->rg_hash, k)
465 && kh_value(hrecs->rg_hash, k) > idx) {
466 kh_value(hrecs->rg_hash, k)--;
467 }
468 }
469 }
470 break;
471 }
472 tag = tag->next;
473 }
474 }
475
476 return 0;
477 }
478
479 /** Add a header record to the global line ordering
480 *
481 * If @p after is not NULL, the new record will be inserted after this one,
482 * otherwise it will go at the end.
483 *
484 * An exception is an HD record, which will always be put first unless
485 * one is already present.
486 */
sam_hrecs_global_list_add(sam_hrecs_t * hrecs,sam_hrec_type_t * h_type,sam_hrec_type_t * after)487 static void sam_hrecs_global_list_add(sam_hrecs_t *hrecs,
488 sam_hrec_type_t *h_type,
489 sam_hrec_type_t *after) {
490 const khint32_t hd_type = TYPEKEY("HD");
491 int update_first_line = 0;
492
493 // First line seen
494 if (!hrecs->first_line) {
495 hrecs->first_line = h_type->global_next = h_type->global_prev = h_type;
496 return;
497 }
498
499 // @HD goes at the top (unless there's one already)
500 if (h_type->type == hd_type && hrecs->first_line->type != hd_type) {
501 after = hrecs->first_line->global_prev;
502 update_first_line = 1;
503 }
504
505 // If no instructions given, put it at the end
506 if (!after)
507 after = hrecs->first_line->global_prev;
508
509 h_type->global_prev = after;
510 h_type->global_next = after->global_next;
511 h_type->global_prev->global_next = h_type;
512 h_type->global_next->global_prev = h_type;
513
514 if (update_first_line)
515 hrecs->first_line = h_type;
516 }
517
518 /*! Add header record with a va_list interface.
519 *
520 * Adds a single record to a SAM header.
521 *
522 * This takes a header record type, a va_list argument and one or more
523 * key,value pairs, ending with the NULL key.
524 *
525 * Eg. sam_hrecs_vadd(h, "SQ", args, "ID", "foo", "LN", "100", NULL).
526 *
527 * The purpose of the additional va_list parameter is to permit other
528 * varargs functions to call this while including their own additional
529 * parameters; an example is in sam_hdr_add_pg().
530 *
531 * Note: this function invokes va_arg at least once, making the value
532 * of ap indeterminate after the return. The caller should call
533 * va_start/va_end before/after calling this function or use va_copy.
534 *
535 * @return
536 * Returns >= 0 on success;
537 * -1 on failure
538 */
sam_hrecs_vadd(sam_hrecs_t * hrecs,const char * type,va_list ap,...)539 static int sam_hrecs_vadd(sam_hrecs_t *hrecs, const char *type, va_list ap, ...) {
540 va_list args;
541 sam_hrec_type_t *h_type;
542 sam_hrec_tag_t *h_tag, *last=NULL;
543 int new;
544 khint32_t type_i = TYPEKEY(type), k;
545
546 if (!strncmp(type, "HD", 2) && (h_type = sam_hrecs_find_type_id(hrecs, "HD", NULL, NULL)))
547 return sam_hrecs_vupdate(hrecs, h_type, ap);
548
549 if (!(h_type = pool_alloc(hrecs->type_pool)))
550 return -1;
551 k = kh_put(sam_hrecs_t, hrecs->h, type_i, &new);
552 if (new < 0)
553 return -1;
554
555 h_type->type = type_i;
556
557 // Form the ring, either with self or other lines of this type
558 if (!new) {
559 sam_hrec_type_t *t = kh_val(hrecs->h, k), *p;
560 p = t->prev;
561
562 assert(p->next == t);
563 p->next = h_type;
564 h_type->prev = p;
565
566 t->prev = h_type;
567 h_type->next = t;
568 } else {
569 kh_val(hrecs->h, k) = h_type;
570 h_type->prev = h_type->next = h_type;
571 }
572 h_type->tag = NULL;
573
574 // Add to global line ordering after any existing line of the same type,
575 // or at the end if no line of this type exists yet.
576 sam_hrecs_global_list_add(hrecs, h_type, !new ? h_type->prev : NULL);
577
578 // Check linked-list invariants
579 assert(h_type->prev->next == h_type);
580 assert(h_type->next->prev == h_type);
581 assert(h_type->global_prev->global_next == h_type);
582 assert(h_type->global_next->global_prev == h_type);
583
584 // Any ... varargs
585 va_start(args, ap);
586 for (;;) {
587 char *key, *val = NULL, *str;
588
589 if (!(key = (char *)va_arg(args, char *)))
590 break;
591 if (strncmp(type, "CO", 2) && !(val = (char *)va_arg(args, char *)))
592 break;
593 if (*val == '\0')
594 continue;
595
596 if (!(h_tag = pool_alloc(hrecs->tag_pool)))
597 return -1;
598
599 if (strncmp(type, "CO", 2)) {
600 h_tag->len = 3 + strlen(val);
601 str = string_alloc(hrecs->str_pool, h_tag->len+1);
602 if (!str || snprintf(str, h_tag->len+1, "%2.2s:%s", key, val) < 0)
603 return -1;
604 h_tag->str = str;
605 } else {
606 h_tag->len = strlen(key);
607 h_tag->str = string_ndup(hrecs->str_pool, key, h_tag->len);
608 if (!h_tag->str)
609 return -1;
610 }
611
612 h_tag->next = NULL;
613 if (last)
614 last->next = h_tag;
615 else
616 h_type->tag = h_tag;
617
618 last = h_tag;
619 }
620 va_end(args);
621
622 // Plus the specified va_list params
623 for (;;) {
624 char *key, *val = NULL, *str;
625
626 if (!(key = (char *)va_arg(ap, char *)))
627 break;
628 if (strncmp(type, "CO", 2) && !(val = (char *)va_arg(ap, char *)))
629 break;
630
631 if (!(h_tag = pool_alloc(hrecs->tag_pool)))
632 return -1;
633
634 if (strncmp(type, "CO", 2)) {
635 h_tag->len = 3 + strlen(val);
636 str = string_alloc(hrecs->str_pool, h_tag->len+1);
637 if (!str || snprintf(str, h_tag->len+1, "%2.2s:%s", key, val) < 0)
638 return -1;
639 h_tag->str = str;
640 } else {
641 h_tag->len = strlen(key);
642 h_tag->str = string_ndup(hrecs->str_pool, key, h_tag->len);
643 if (!h_tag->str)
644 return -1;
645 }
646
647 h_tag->next = NULL;
648 if (last)
649 last->next = h_tag;
650 else
651 h_type->tag = h_tag;
652
653 last = h_tag;
654 }
655
656 if (-1 == sam_hrecs_update_hashes(hrecs, TYPEKEY(type), h_type))
657 return -1;
658
659 if (!strncmp(type, "PG", 2))
660 hrecs->pgs_changed = 1;
661
662 hrecs->dirty = 1;
663
664 return 0;
665 }
666
667 // As sam_hrecs_vadd(), but without the extra va_list parameter
sam_hrecs_add(sam_hrecs_t * hrecs,const char * type,...)668 static int sam_hrecs_add(sam_hrecs_t *hrecs, const char *type, ...) {
669 va_list args;
670 int res;
671 va_start(args, type);
672 res = sam_hrecs_vadd(hrecs, type, args, NULL);
673 va_end(args);
674 return res;
675 }
676
677 /*
678 * Function for deallocating a list of tags
679 */
680
sam_hrecs_free_tags(sam_hrecs_t * hrecs,sam_hrec_tag_t * tag)681 static void sam_hrecs_free_tags(sam_hrecs_t *hrecs, sam_hrec_tag_t *tag) {
682 if (!hrecs || !tag)
683 return;
684 if (tag->next)
685 sam_hrecs_free_tags(hrecs, tag->next);
686
687 pool_free(hrecs->tag_pool, tag);
688 }
689
sam_hrecs_remove_line(sam_hrecs_t * hrecs,const char * type_name,sam_hrec_type_t * type_found)690 static int sam_hrecs_remove_line(sam_hrecs_t *hrecs, const char *type_name, sam_hrec_type_t *type_found) {
691 if (!hrecs || !type_name || !type_found)
692 return -1;
693
694 khint32_t itype = TYPEKEY(type_name);
695 khint_t k = kh_get(sam_hrecs_t, hrecs->h, itype);
696 if (k == kh_end(hrecs->h))
697 return -1;
698
699 // Remove from global list (remembering it could be the only line)
700 if (hrecs->first_line == type_found) {
701 hrecs->first_line = (type_found->global_next != type_found
702 ? type_found->global_next : NULL);
703 }
704 type_found->global_next->global_prev = type_found->global_prev;
705 type_found->global_prev->global_next = type_found->global_next;
706
707 /* single element in the list */
708 if (type_found->prev == type_found || type_found->next == type_found) {
709 kh_del(sam_hrecs_t, hrecs->h, k);
710 } else {
711 type_found->prev->next = type_found->next;
712 type_found->next->prev = type_found->prev;
713 if (kh_val(hrecs->h, k) == type_found) { //first element
714 kh_val(hrecs->h, k) = type_found->next;
715 }
716 }
717
718 if (!strncmp(type_name, "SQ", 2) || !strncmp(type_name, "RG", 2))
719 sam_hrecs_remove_hash_entry(hrecs, itype, type_found);
720
721 sam_hrecs_free_tags(hrecs, type_found->tag);
722 pool_free(hrecs->type_pool, type_found);
723
724 hrecs->dirty = 1;
725
726 return 0;
727 }
728
729 // Paste together a line from the parsed data structures
build_header_line(const sam_hrec_type_t * ty,kstring_t * ks)730 static int build_header_line(const sam_hrec_type_t *ty, kstring_t *ks) {
731 sam_hrec_tag_t *tag;
732 int r = 0;
733 char c[2]= { ty->type >> 8, ty->type & 0xff };
734
735 r |= (kputc_('@', ks) == EOF);
736 r |= (kputsn(c, 2, ks) == EOF);
737 for (tag = ty->tag; tag; tag = tag->next) {
738 r |= (kputc_('\t', ks) == EOF);
739 r |= (kputsn(tag->str, tag->len, ks) == EOF);
740 }
741
742 return r;
743 }
744
sam_hrecs_rebuild_lines(const sam_hrecs_t * hrecs,kstring_t * ks)745 static int sam_hrecs_rebuild_lines(const sam_hrecs_t *hrecs, kstring_t *ks) {
746 const sam_hrec_type_t *t1, *t2;
747
748 if (!hrecs->first_line)
749 return kputsn("", 0, ks) >= 0 ? 0 : -1;
750
751 t1 = t2 = hrecs->first_line;
752 do {
753 if (build_header_line(t1, ks) != 0)
754 return -1;
755 if (kputc('\n', ks) < 0)
756 return -1;
757
758 t1 = t1->global_next;
759 } while (t1 != t2);
760
761 return 0;
762 }
763
sam_hrecs_parse_lines(sam_hrecs_t * hrecs,const char * hdr,size_t len)764 static int sam_hrecs_parse_lines(sam_hrecs_t *hrecs, const char *hdr, size_t len) {
765 size_t i, lno;
766
767 if (!hrecs || len > SSIZE_MAX)
768 return -1;
769
770 if (!len)
771 len = strlen(hdr);
772
773 if (len < 3) {
774 if (len == 0 || *hdr == '\0') return 0;
775 sam_hrecs_error("Header line too short", hdr, len, 1);
776 return -1;
777 }
778
779 for (i = 0, lno = 1; i < len - 3 && hdr[i] != '\0'; i++, lno++) {
780 khint32_t type;
781 khint_t k;
782
783 int l_start = i, new;
784 sam_hrec_type_t *h_type;
785 sam_hrec_tag_t *h_tag, *last;
786
787 if (hdr[i] != '@') {
788 sam_hrecs_error("Header line does not start with '@'",
789 &hdr[l_start], len - l_start, lno);
790 return -1;
791 }
792
793 if (!isalpha_c(hdr[i+1]) || !isalpha_c(hdr[i+2])) {
794 sam_hrecs_error("Header line does not have a two character key",
795 &hdr[l_start], len - l_start, lno);
796 return -1;
797 }
798 type = TYPEKEY(&hdr[i+1]);
799
800 i += 3;
801 if (i == len || hdr[i] == '\n')
802 continue;
803
804 // Add the header line type
805 if (!(h_type = pool_alloc(hrecs->type_pool)))
806 return -1;
807 k = kh_put(sam_hrecs_t, hrecs->h, type, &new);
808 if (new < 0)
809 return -1;
810
811 h_type->type = type;
812
813 // Add to end of global list
814 sam_hrecs_global_list_add(hrecs, h_type, NULL);
815
816 // Form the ring, either with self or other lines of this type
817 if (!new) {
818 sam_hrec_type_t *t = kh_val(hrecs->h, k), *p;
819 p = t->prev;
820
821 assert(p->next == t);
822 p->next = h_type;
823 h_type->prev = p;
824
825 t->prev = h_type;
826 h_type->next = t;
827 } else {
828 kh_val(hrecs->h, k) = h_type;
829 h_type->prev = h_type->next = h_type;
830 }
831
832 // Parse the tags on this line
833 last = NULL;
834 if (type == TYPEKEY("CO")) {
835 size_t j;
836
837 if (i == len || hdr[i] != '\t') {
838 sam_hrecs_error("Missing tab",
839 &hdr[l_start], len - l_start, lno);
840 return -1;
841 }
842
843 for (j = ++i; j < len && hdr[j] != '\0' && hdr[j] != '\n'; j++)
844 ;
845
846 if (!(h_type->tag = h_tag = pool_alloc(hrecs->tag_pool)))
847 return -1;
848 h_tag->str = string_ndup(hrecs->str_pool, &hdr[i], j-i);
849 h_tag->len = j-i;
850 h_tag->next = NULL;
851 if (!h_tag->str)
852 return -1;
853
854 i = j;
855
856 } else {
857 do {
858 size_t j;
859
860 if (i == len || hdr[i] != '\t') {
861 sam_hrecs_error("Missing tab",
862 &hdr[l_start], len - l_start, lno);
863 return -1;
864 }
865
866 for (j = ++i; j < len && hdr[j] != '\0' && hdr[j] != '\n' && hdr[j] != '\t'; j++)
867 ;
868
869 if (j - i < 3 || hdr[i + 2] != ':') {
870 sam_hrecs_error("Malformed key:value pair",
871 &hdr[l_start], len - l_start, lno);
872 return -1;
873 }
874
875 if (!(h_tag = pool_alloc(hrecs->tag_pool)))
876 return -1;
877 h_tag->str = string_ndup(hrecs->str_pool, &hdr[i], j-i);
878 h_tag->len = j-i;
879 h_tag->next = NULL;
880 if (!h_tag->str)
881 return -1;
882
883 if (last)
884 last->next = h_tag;
885 else
886 h_type->tag = h_tag;
887
888 last = h_tag;
889 i = j;
890 } while (i < len && hdr[i] != '\0' && hdr[i] != '\n');
891 }
892
893 /* Update RG/SQ hashes */
894 if (-1 == sam_hrecs_update_hashes(hrecs, type, h_type))
895 return -1;
896 }
897
898 return 0;
899 }
900
901 /*! Update sam_hdr_t target_name and target_len arrays
902 *
903 * @return 0 on success; -1 on failure
904 */
sam_hdr_update_target_arrays(sam_hdr_t * bh,const sam_hrecs_t * hrecs,int refs_changed)905 int sam_hdr_update_target_arrays(sam_hdr_t *bh, const sam_hrecs_t *hrecs,
906 int refs_changed) {
907 if (!bh || !hrecs)
908 return -1;
909
910 if (refs_changed < 0)
911 return 0;
912
913 // Grow arrays if necessary
914 if (bh->n_targets < hrecs->nref) {
915 char **new_names = realloc(bh->target_name,
916 hrecs->nref * sizeof(*new_names));
917 if (!new_names)
918 return -1;
919 bh->target_name = new_names;
920 uint32_t *new_lens = realloc(bh->target_len,
921 hrecs->nref * sizeof(*new_lens));
922 if (!new_lens)
923 return -1;
924 bh->target_len = new_lens;
925 }
926
927 // Update names and lengths where changed
928 // hrecs->refs_changed is the first ref that has been updated, so ones
929 // before that can be skipped.
930 int i;
931 khint_t k;
932 khash_t(s2i) *long_refs = (khash_t(s2i) *) bh->sdict;
933 for (i = refs_changed; i < hrecs->nref; i++) {
934 if (i >= bh->n_targets
935 || strcmp(bh->target_name[i], hrecs->ref[i].name) != 0) {
936 if (i < bh->n_targets)
937 free(bh->target_name[i]);
938 bh->target_name[i] = strdup(hrecs->ref[i].name);
939 if (!bh->target_name[i])
940 return -1;
941 }
942 if (hrecs->ref[i].len < UINT32_MAX) {
943 bh->target_len[i] = hrecs->ref[i].len;
944
945 if (!long_refs)
946 continue;
947
948 // Check if we have an old length, if so remove it.
949 k = kh_get(s2i, long_refs, bh->target_name[i]);
950 if (k < kh_end(long_refs))
951 kh_del(s2i, long_refs, k);
952 } else {
953 bh->target_len[i] = UINT32_MAX;
954 if (bh->hrecs != hrecs) {
955 // Called from sam_hdr_dup; need to add sdict entries
956 if (!long_refs) {
957 if (!(bh->sdict = long_refs = kh_init(s2i)))
958 return -1;
959 }
960
961 // Add / update length
962 int absent;
963 k = kh_put(s2i, long_refs, bh->target_name[i], &absent);
964 if (absent < 0)
965 return -1;
966 kh_val(long_refs, k) = hrecs->ref[i].len;
967 }
968 }
969 }
970
971 // Free up any names that have been removed
972 for (; i < bh->n_targets; i++) {
973 if (long_refs) {
974 k = kh_get(s2i, long_refs, bh->target_name[i]);
975 if (k < kh_end(long_refs))
976 kh_del(s2i, long_refs, k);
977 }
978 free(bh->target_name[i]);
979 }
980
981 bh->n_targets = hrecs->nref;
982 return 0;
983 }
984
rebuild_target_arrays(sam_hdr_t * bh)985 static int rebuild_target_arrays(sam_hdr_t *bh) {
986 if (!bh || !bh->hrecs)
987 return -1;
988
989 sam_hrecs_t *hrecs = bh->hrecs;
990 if (hrecs->refs_changed < 0)
991 return 0;
992
993 if (sam_hdr_update_target_arrays(bh, hrecs, hrecs->refs_changed) != 0)
994 return -1;
995
996 hrecs->refs_changed = -1;
997 return 0;
998 }
999
1000 /// Populate hrecs refs array from header target_name, target_len arrays
1001 /**
1002 * @return 0 on success; -1 on failure
1003 *
1004 * Pre-fills the refs hash from the target arrays. For BAM files this
1005 * will ensure that they are in the correct order as the target arrays
1006 * are the canonical source for converting target ids to names and lengths.
1007 *
1008 * The added entries do not link to a header line. sam_hrecs_update_hashes()
1009 * will add the links later for lines found in the text header.
1010 *
1011 * This should be called before the text header is parsed.
1012 */
sam_hrecs_refs_from_targets_array(sam_hrecs_t * hrecs,const sam_hdr_t * bh)1013 static int sam_hrecs_refs_from_targets_array(sam_hrecs_t *hrecs,
1014 const sam_hdr_t *bh) {
1015 int32_t tid = 0;
1016
1017 if (!hrecs || !bh)
1018 return -1;
1019
1020 // This should always be called before parsing the text header
1021 // so the ref array should start off empty, and we don't have to try
1022 // to reconcile any existing data.
1023 if (hrecs->nref > 0) {
1024 hts_log_error("Called with non-empty ref array");
1025 return -1;
1026 }
1027
1028 if (hrecs->ref_sz < bh->n_targets) {
1029 sam_hrec_sq_t *new_ref = realloc(hrecs->ref,
1030 bh->n_targets * sizeof(*new_ref));
1031 if (!new_ref)
1032 return -1;
1033
1034 hrecs->ref = new_ref;
1035 hrecs->ref_sz = bh->n_targets;
1036 }
1037
1038 for (tid = 0; tid < bh->n_targets; tid++) {
1039 khint_t k;
1040 int r;
1041 hrecs->ref[tid].name = string_dup(hrecs->str_pool, bh->target_name[tid]);
1042 if (!hrecs->ref[tid].name) goto fail;
1043 if (bh->target_len[tid] < UINT32_MAX || !bh->sdict) {
1044 hrecs->ref[tid].len = bh->target_len[tid];
1045 } else {
1046 khash_t(s2i) *long_refs = (khash_t(s2i) *) bh->sdict;
1047 k = kh_get(s2i, long_refs, hrecs->ref[tid].name);
1048 if (k < kh_end(long_refs)) {
1049 hrecs->ref[tid].len = kh_val(long_refs, k);
1050 } else {
1051 hrecs->ref[tid].len = UINT32_MAX;
1052 }
1053 }
1054 hrecs->ref[tid].ty = NULL;
1055 k = kh_put(m_s2i, hrecs->ref_hash, hrecs->ref[tid].name, &r);
1056 if (r < 0) goto fail;
1057 if (r == 0) {
1058 hts_log_error("Duplicate entry \"%s\" in target list",
1059 hrecs->ref[tid].name);
1060 return -1;
1061 } else {
1062 kh_val(hrecs->ref_hash, k) = tid;
1063 }
1064 }
1065 hrecs->nref = bh->n_targets;
1066 return 0;
1067
1068 fail: {
1069 int32_t i;
1070 hts_log_error("%s", strerror(errno));
1071 for (i = 0; i < tid; i++) {
1072 khint_t k;
1073 if (!hrecs->ref[i].name) continue;
1074 k = kh_get(m_s2i, hrecs->ref_hash, hrecs->ref[tid].name);
1075 if (k < kh_end(hrecs->ref_hash)) kh_del(m_s2i, hrecs->ref_hash, k);
1076 }
1077 hrecs->nref = 0;
1078 return -1;
1079 }
1080 }
1081
1082 /*
1083 * Add SQ header records for any references in the hrecs->ref array that
1084 * were added by sam_hrecs_refs_from_targets_array() but have not
1085 * been linked to an @SQ line by sam_hrecs_update_hashes() yet.
1086 *
1087 * This may be needed either because:
1088 *
1089 * - A bam file was read that had entries in its refs list with no
1090 * corresponding @SQ line.
1091 *
1092 * - A program constructed a sam_hdr_t which has target_name and target_len
1093 * array entries with no corresponding @SQ line in text.
1094 */
add_stub_ref_sq_lines(sam_hrecs_t * hrecs)1095 static int add_stub_ref_sq_lines(sam_hrecs_t *hrecs) {
1096 int tid;
1097 char len[32];
1098
1099 for (tid = 0; tid < hrecs->nref; tid++) {
1100 if (hrecs->ref[tid].ty == NULL) {
1101 snprintf(len, sizeof(len), "%"PRIhts_pos, hrecs->ref[tid].len);
1102 if (sam_hrecs_add(hrecs, "SQ",
1103 "SN", hrecs->ref[tid].name,
1104 "LN", len, NULL) != 0)
1105 return -1;
1106
1107 // Check that the stub has actually been filled
1108 if(hrecs->ref[tid].ty == NULL) {
1109 hts_log_error("Reference stub with tid=%d, name=\"%s\", len=%"PRIhts_pos" could not be filled",
1110 tid, hrecs->ref[tid].name, hrecs->ref[tid].len);
1111 return -1;
1112 }
1113 }
1114 }
1115 return 0;
1116 }
1117
sam_hdr_fill_hrecs(sam_hdr_t * bh)1118 int sam_hdr_fill_hrecs(sam_hdr_t *bh) {
1119 sam_hrecs_t *hrecs = sam_hrecs_new();
1120
1121 if (!hrecs)
1122 return -1;
1123
1124 if (bh->target_name && bh->target_len && bh->n_targets > 0) {
1125 if (sam_hrecs_refs_from_targets_array(hrecs, bh) != 0) {
1126 sam_hrecs_free(hrecs);
1127 return -1;
1128 }
1129 }
1130
1131 // Parse existing header text
1132 if (bh->text && bh->l_text > 0) {
1133 if (sam_hrecs_parse_lines(hrecs, bh->text, bh->l_text) != 0) {
1134 sam_hrecs_free(hrecs);
1135 return -1;
1136 }
1137 }
1138
1139 if (add_stub_ref_sq_lines(hrecs) < 0) {
1140 sam_hrecs_free(hrecs);
1141 return -1;
1142 }
1143
1144 bh->hrecs = hrecs;
1145
1146 if (hrecs->refs_changed >= 0 && rebuild_target_arrays(bh) != 0)
1147 return -1;
1148
1149 return 0;
1150 }
1151
1152 /** Remove outdated header text
1153
1154 @param bh BAM header
1155
1156 This is called when API functions have changed the header so that the
1157 text version is no longer valid.
1158 */
redact_header_text(sam_hdr_t * bh)1159 static void redact_header_text(sam_hdr_t *bh) {
1160 assert(bh->hrecs && bh->hrecs->dirty);
1161 bh->l_text = 0;
1162 free(bh->text);
1163 bh->text = NULL;
1164 }
1165
1166 /** Find nth header record of a given type
1167
1168 @param type Header type (SQ, RG etc.)
1169 @param idx 0-based index
1170
1171 @return sam_hrec_type_t pointer to the record on success
1172 NULL if no record exists with the given type and index
1173 */
1174
sam_hrecs_find_type_pos(sam_hrecs_t * hrecs,const char * type,int idx)1175 static sam_hrec_type_t *sam_hrecs_find_type_pos(sam_hrecs_t *hrecs,
1176 const char *type, int idx) {
1177 sam_hrec_type_t *first, *itr;
1178
1179 if (idx < 0)
1180 return NULL;
1181
1182 if (type[0] == 'S' && type[1] == 'Q')
1183 return idx < hrecs->nref ? hrecs->ref[idx].ty : NULL;
1184
1185 if (type[0] == 'R' && type[1] == 'G')
1186 return idx < hrecs->nrg ? hrecs->rg[idx].ty : NULL;
1187
1188 if (type[0] == 'P' && type[1] == 'G')
1189 return idx < hrecs->npg ? hrecs->pg[idx].ty : NULL;
1190
1191 first = itr = sam_hrecs_find_type_id(hrecs, type, NULL, NULL);
1192 if (!first)
1193 return NULL;
1194
1195 while (idx > 0) {
1196 itr = itr->next;
1197 if (itr == first)
1198 break;
1199 --idx;
1200 }
1201
1202 return idx == 0 ? itr : NULL;
1203 }
1204
1205 /* ==== Public methods ==== */
1206
sam_hdr_length(sam_hdr_t * bh)1207 size_t sam_hdr_length(sam_hdr_t *bh) {
1208 if (!bh || -1 == sam_hdr_rebuild(bh))
1209 return SIZE_MAX;
1210
1211 return bh->l_text;
1212 }
1213
sam_hdr_str(sam_hdr_t * bh)1214 const char *sam_hdr_str(sam_hdr_t *bh) {
1215 if (!bh || -1 == sam_hdr_rebuild(bh))
1216 return NULL;
1217
1218 return bh->text;
1219 }
1220
sam_hdr_nref(const sam_hdr_t * bh)1221 int sam_hdr_nref(const sam_hdr_t *bh) {
1222 if (!bh)
1223 return -1;
1224
1225 return bh->hrecs ? bh->hrecs->nref : bh->n_targets;
1226 }
1227
1228 /*
1229 * Reconstructs the text representation from the header hash table.
1230 * Returns 0 on success
1231 * -1 on failure
1232 */
sam_hdr_rebuild(sam_hdr_t * bh)1233 int sam_hdr_rebuild(sam_hdr_t *bh) {
1234 sam_hrecs_t *hrecs;
1235 if (!bh)
1236 return -1;
1237
1238 if (!(hrecs = bh->hrecs))
1239 return bh->text ? 0 : -1;
1240
1241 if (hrecs->refs_changed >= 0) {
1242 if (rebuild_target_arrays(bh) < 0) {
1243 hts_log_error("Header target array rebuild has failed");
1244 return -1;
1245 }
1246 }
1247
1248 /* If header text wasn't changed or header is empty, don't rebuild it. */
1249 if (!hrecs->dirty)
1250 return 0;
1251
1252 if (hrecs->pgs_changed && sam_hdr_link_pg(bh) < 0) {
1253 hts_log_error("Linking @PG lines has failed");
1254 return -1;
1255 }
1256
1257 kstring_t ks = KS_INITIALIZE;
1258 if (sam_hrecs_rebuild_text(hrecs, &ks) != 0) {
1259 ks_free(&ks);
1260 hts_log_error("Header text rebuild has failed");
1261 return -1;
1262 }
1263
1264 hrecs->dirty = 0;
1265
1266 /* Sync */
1267 free(bh->text);
1268 bh->l_text = ks_len(&ks);
1269 bh->text = ks_release(&ks);
1270
1271 return 0;
1272 }
1273
1274 /*
1275 * Appends a formatted line to an existing SAM header.
1276 * Line is a full SAM header record, eg "@SQ\tSN:foo\tLN:100", with
1277 * optional new-line. If it contains more than 1 line then multiple lines
1278 * will be added in order.
1279 *
1280 * Input text is of maximum length len or as terminated earlier by a NUL.
1281 * len may be 0 if unknown, in which case lines must be NUL-terminated.
1282 *
1283 * Returns 0 on success
1284 * -1 on failure
1285 */
sam_hdr_add_lines(sam_hdr_t * bh,const char * lines,size_t len)1286 int sam_hdr_add_lines(sam_hdr_t *bh, const char *lines, size_t len) {
1287 sam_hrecs_t *hrecs;
1288
1289 if (!bh || !lines)
1290 return -1;
1291
1292 if (len == 0 && *lines == '\0')
1293 return 0;
1294
1295 if (!(hrecs = bh->hrecs)) {
1296 if (sam_hdr_fill_hrecs(bh) != 0)
1297 return -1;
1298 hrecs = bh->hrecs;
1299 }
1300
1301 if (sam_hrecs_parse_lines(hrecs, lines, len) != 0)
1302 return -1;
1303
1304 if (hrecs->refs_changed >= 0 && rebuild_target_arrays(bh) != 0)
1305 return -1;
1306
1307 hrecs->dirty = 1;
1308 redact_header_text(bh);
1309
1310 return 0;
1311 }
1312
1313 /*
1314 * Adds a single line to a SAM header.
1315 * Specify type and one or more key,value pairs, ending with the NULL key.
1316 * Eg. sam_hdr_add_line(h, "SQ", "ID", "foo", "LN", "100", NULL).
1317 *
1318 * Returns 0 on success
1319 * -1 on failure
1320 */
sam_hdr_add_line(sam_hdr_t * bh,const char * type,...)1321 int sam_hdr_add_line(sam_hdr_t *bh, const char *type, ...) {
1322 va_list args;
1323 sam_hrecs_t *hrecs;
1324
1325 if (!bh || !type)
1326 return -1;
1327
1328 if (!(hrecs = bh->hrecs)) {
1329 if (sam_hdr_fill_hrecs(bh) != 0)
1330 return -1;
1331 hrecs = bh->hrecs;
1332 }
1333
1334 va_start(args, type);
1335 int ret = sam_hrecs_vadd(hrecs, type, args, NULL);
1336 va_end(args);
1337
1338 if (ret == 0) {
1339 if (hrecs->refs_changed >= 0 && rebuild_target_arrays(bh) != 0)
1340 return -1;
1341
1342 if (hrecs->dirty)
1343 redact_header_text(bh);
1344 }
1345
1346 return ret;
1347 }
1348
1349 /*
1350 * Returns a complete line of formatted text for a specific head type/ID
1351 * combination. If ID_key is NULL then it returns the first line of the specified
1352 * type.
1353 */
sam_hdr_find_line_id(sam_hdr_t * bh,const char * type,const char * ID_key,const char * ID_val,kstring_t * ks)1354 int sam_hdr_find_line_id(sam_hdr_t *bh, const char *type,
1355 const char *ID_key, const char *ID_val, kstring_t *ks) {
1356 sam_hrecs_t *hrecs;
1357 if (!bh || !type)
1358 return -2;
1359
1360 if (!(hrecs = bh->hrecs)) {
1361 if (sam_hdr_fill_hrecs(bh) != 0)
1362 return -2;
1363 hrecs = bh->hrecs;
1364 }
1365
1366 sam_hrec_type_t *ty = sam_hrecs_find_type_id(hrecs, type, ID_key, ID_val);
1367 if (!ty)
1368 return -1;
1369
1370 ks->l = 0;
1371 if (build_header_line(ty, ks) < 0) {
1372 return -2;
1373 }
1374
1375 return 0;
1376 }
1377
sam_hdr_find_line_pos(sam_hdr_t * bh,const char * type,int pos,kstring_t * ks)1378 int sam_hdr_find_line_pos(sam_hdr_t *bh, const char *type,
1379 int pos, kstring_t *ks) {
1380 sam_hrecs_t *hrecs;
1381 if (!bh || !type)
1382 return -2;
1383
1384 if (!(hrecs = bh->hrecs)) {
1385 if (sam_hdr_fill_hrecs(bh) != 0)
1386 return -2;
1387 hrecs = bh->hrecs;
1388 }
1389
1390 sam_hrec_type_t *ty = sam_hrecs_find_type_pos(hrecs, type, pos);
1391 if (!ty)
1392 return -1;
1393
1394 ks->l = 0;
1395 if (build_header_line(ty, ks) < 0) {
1396 return -2;
1397 }
1398
1399 return 0;
1400 }
1401
1402 /*
1403 * Remove a line from the header by specifying a tag:value that uniquely
1404 * identifies a line, i.e. the @SQ line containing "SN:ref1".
1405 * @SQ line is uniquely identified by SN tag.
1406 * @RG line is uniquely identified by ID tag.
1407 * @PG line is uniquely identified by ID tag.
1408 *
1409 * Returns 0 on success and -1 on error
1410 */
1411
sam_hdr_remove_line_id(sam_hdr_t * bh,const char * type,const char * ID_key,const char * ID_value)1412 int sam_hdr_remove_line_id(sam_hdr_t *bh, const char *type, const char *ID_key, const char *ID_value) {
1413 sam_hrecs_t *hrecs;
1414 if (!bh || !type)
1415 return -1;
1416
1417 if (!(hrecs = bh->hrecs)) {
1418 if (sam_hdr_fill_hrecs(bh) != 0)
1419 return -1;
1420 hrecs = bh->hrecs;
1421 }
1422
1423 if (!strncmp(type, "PG", 2)) {
1424 hts_log_warning("Removing PG lines is not supported!");
1425 return -1;
1426 }
1427
1428 sam_hrec_type_t *type_found = sam_hrecs_find_type_id(hrecs, type, ID_key, ID_value);
1429 if (!type_found)
1430 return 0;
1431
1432 int ret = sam_hrecs_remove_line(hrecs, type, type_found);
1433 if (ret == 0) {
1434 if (hrecs->refs_changed >= 0 && rebuild_target_arrays(bh) != 0)
1435 return -1;
1436
1437 if (hrecs->dirty)
1438 redact_header_text(bh);
1439 }
1440
1441 return ret;
1442 }
1443
1444 /*
1445 * Remove a line from the header by specifying the position in the type
1446 * group, i.e. 3rd @SQ line.
1447 *
1448 * Returns 0 on success and -1 on error
1449 */
1450
sam_hdr_remove_line_pos(sam_hdr_t * bh,const char * type,int position)1451 int sam_hdr_remove_line_pos(sam_hdr_t *bh, const char *type, int position) {
1452 sam_hrecs_t *hrecs;
1453 if (!bh || !type || position <= 0)
1454 return -1;
1455
1456 if (!(hrecs = bh->hrecs)) {
1457 if (sam_hdr_fill_hrecs(bh) != 0)
1458 return -1;
1459 hrecs = bh->hrecs;
1460 }
1461
1462 if (!strncmp(type, "PG", 2)) {
1463 hts_log_warning("Removing PG lines is not supported!");
1464 return -1;
1465 }
1466
1467 sam_hrec_type_t *type_found = sam_hrecs_find_type_pos(hrecs, type,
1468 position);
1469 if (!type_found)
1470 return -1;
1471
1472 int ret = sam_hrecs_remove_line(hrecs, type, type_found);
1473 if (ret == 0) {
1474 if (hrecs->refs_changed >= 0 && rebuild_target_arrays(bh) != 0)
1475 return -1;
1476
1477 if (hrecs->dirty)
1478 redact_header_text(bh);
1479 }
1480
1481 return ret;
1482 }
1483
1484 /*
1485 * Check if sam_hdr_update_line() is being used to change the name of
1486 * a record, and if the new name is going to clash with an existing one.
1487 *
1488 * If ap includes repeated keys, we go with the last one as sam_hrecs_vupdate()
1489 * will go through them all and leave the final one in place.
1490 *
1491 * Returns 0 if the name does not change
1492 * 1 if the name changes but does not clash
1493 * -1 if the name changes and the new one is already in use
1494 */
check_for_name_update(sam_hrecs_t * hrecs,sam_hrec_type_t * rec,va_list ap,const char ** old_name,const char ** new_name,char id_tag_out[3],khash_t (m_s2i)** hash_out)1495 static int check_for_name_update(sam_hrecs_t *hrecs, sam_hrec_type_t *rec,
1496 va_list ap, const char **old_name,
1497 const char **new_name,
1498 char id_tag_out[3],
1499 khash_t(m_s2i) **hash_out) {
1500 char *key, *val;
1501 const char *id_tag;
1502 sam_hrec_tag_t *tag, *prev;
1503 khash_t(m_s2i) *hash;
1504 khint_t k;
1505 int ret = 0;
1506
1507 if (rec->type == TYPEKEY("SQ")) {
1508 id_tag = "SN"; hash = hrecs->ref_hash;
1509 } else if (rec->type == TYPEKEY("RG")) {
1510 id_tag = "ID"; hash = hrecs->rg_hash;
1511 } else if (rec->type == TYPEKEY("PG")) {
1512 id_tag = "ID"; hash = hrecs->pg_hash;
1513 } else {
1514 return 0;
1515 }
1516
1517 memcpy(id_tag_out, id_tag, 3);
1518 *hash_out = hash;
1519
1520 tag = sam_hrecs_find_key(rec, id_tag, &prev);
1521 if (!tag)
1522 return 0;
1523 assert(tag->len >= 3);
1524 *old_name = tag->str + 3;
1525
1526 while ((key = va_arg(ap, char *)) != NULL) {
1527 val = va_arg(ap, char *);
1528 if (!val) val = "";
1529 if (strcmp(key, id_tag) != 0) continue;
1530 if (strcmp(val, tag->str + 3) == 0) { ret = 0; continue; }
1531 k = kh_get(m_s2i, hash, val);
1532 ret = k < kh_end(hash) ? -1 : 1;
1533 *new_name = val;
1534 }
1535 return ret;
1536 }
1537
sam_hdr_update_line(sam_hdr_t * bh,const char * type,const char * ID_key,const char * ID_value,...)1538 int sam_hdr_update_line(sam_hdr_t *bh, const char *type,
1539 const char *ID_key, const char *ID_value, ...) {
1540 sam_hrecs_t *hrecs;
1541 if (!bh)
1542 return -1;
1543
1544 if (!(hrecs = bh->hrecs)) {
1545 if (sam_hdr_fill_hrecs(bh) != 0)
1546 return -1;
1547 hrecs = bh->hrecs;
1548 }
1549
1550 int ret, rename;
1551 sam_hrec_type_t *ty = sam_hrecs_find_type_id(hrecs, type, ID_key, ID_value);
1552 if (!ty)
1553 return -1;
1554
1555 va_list args;
1556 const char *old_name = "?", *new_name = "?";
1557 char id_tag[3];
1558 khash_t(m_s2i) *hash = NULL;
1559 va_start(args, ID_value);
1560 rename = check_for_name_update(hrecs, ty, args,
1561 &old_name, &new_name, id_tag, &hash);
1562 va_end(args);
1563 if (rename < 0) {
1564 hts_log_error("Cannot rename @%s \"%s\" to \"%s\" : already exists",
1565 type, old_name, new_name);
1566 return -1;
1567 }
1568 if (rename > 0 && TYPEKEY(type) == TYPEKEY("PG")) {
1569 // This is just too complicated
1570 hts_log_error("Renaming @PG records is not supported");
1571 return -1;
1572 }
1573 va_start(args, ID_value);
1574 ret = sam_hrecs_vupdate(hrecs, ty, args);
1575 va_end(args);
1576
1577 if (ret)
1578 return ret;
1579
1580 // TODO Account for @SQ-AN altnames
1581
1582 if (rename) {
1583 // Adjust the hash table to point to the new name
1584 // sam_hrecs_update_hashes() should sort out everything else
1585 khint_t k = kh_get(m_s2i, hash, old_name);
1586 sam_hrec_tag_t *new_tag = sam_hrecs_find_key(ty, id_tag, NULL);
1587 int r, pos;
1588 assert(k < kh_end(hash)); // Or we wouldn't have found it earlier
1589 assert(new_tag && new_tag->str); // id_tag should exist
1590 assert(new_tag->len > 3);
1591 pos = kh_val(hash, k);
1592 kh_del(m_s2i, hash, k);
1593 k = kh_put(m_s2i, hash, new_tag->str + 3, &r);
1594 if (r < 1) {
1595 hts_log_error("Failed to rename item in hash table");
1596 return -1;
1597 }
1598 kh_val(hash, k) = pos;
1599 }
1600
1601 ret = sam_hrecs_update_hashes(hrecs, TYPEKEY(type), ty);
1602
1603 if (!ret && hrecs->refs_changed >= 0)
1604 ret = rebuild_target_arrays(bh);
1605
1606 if (!ret && hrecs->dirty)
1607 redact_header_text(bh);
1608
1609 return ret;
1610 }
1611
sam_hdr_remove_except(sam_hdr_t * bh,const char * type,const char * ID_key,const char * ID_value)1612 int sam_hdr_remove_except(sam_hdr_t *bh, const char *type, const char *ID_key, const char *ID_value) {
1613 sam_hrecs_t *hrecs;
1614 if (!bh || !type)
1615 return -1;
1616
1617 if (!(hrecs = bh->hrecs)) {
1618 if (sam_hdr_fill_hrecs(bh) != 0)
1619 return -1;
1620 hrecs = bh->hrecs;
1621 }
1622
1623 sam_hrec_type_t *step;
1624 int ret = 1, remove_all = (ID_key == NULL);
1625
1626 if (!strncmp(type, "PG", 2) || !strncmp(type, "CO", 2)) {
1627 hts_log_warning("Removing PG or CO lines is not supported!");
1628 return -1;
1629 }
1630
1631 sam_hrec_type_t *type_found = sam_hrecs_find_type_id(hrecs, type, ID_key, ID_value);
1632 if (!type_found) { // remove all line of this type
1633 khint_t k = kh_get(sam_hrecs_t, hrecs->h, TYPEKEY(type));
1634 if (k == kh_end(hrecs->h))
1635 return 0;
1636 type_found = kh_val(hrecs->h, k);
1637 if (!type_found)
1638 return 0;
1639 remove_all = 1;
1640 }
1641
1642 step = type_found->next;
1643 while (step != type_found) {
1644 sam_hrec_type_t *to_remove = step;
1645 step = step->next;
1646 ret &= sam_hrecs_remove_line(hrecs, type, to_remove);
1647 }
1648
1649 if (remove_all)
1650 ret &= sam_hrecs_remove_line(hrecs, type, type_found);
1651
1652 if (!ret && hrecs->dirty)
1653 redact_header_text(bh);
1654
1655 return 0;
1656 }
1657
sam_hdr_remove_lines(sam_hdr_t * bh,const char * type,const char * id,void * vrh)1658 int sam_hdr_remove_lines(sam_hdr_t *bh, const char *type, const char *id, void *vrh) {
1659 sam_hrecs_t *hrecs;
1660 rmhash_t *rh = (rmhash_t *)vrh;
1661
1662 if (!bh || !type)
1663 return -1;
1664 if (!rh) // remove all lines
1665 return sam_hdr_remove_except(bh, type, NULL, NULL);
1666 if (!id)
1667 return -1;
1668
1669 if (!(hrecs = bh->hrecs)) {
1670 if (sam_hdr_fill_hrecs(bh) != 0)
1671 return -1;
1672 hrecs = bh->hrecs;
1673 }
1674
1675 khint_t k = kh_get(sam_hrecs_t, hrecs->h, TYPEKEY(type));
1676 if (k == kh_end(hrecs->h)) // nothing to remove from
1677 return 0;
1678
1679 sam_hrec_type_t *head = kh_val(hrecs->h, k);
1680 if (!head) {
1681 hts_log_error("Header inconsistency");
1682 return -1;
1683 }
1684
1685 int ret = 0;
1686 sam_hrec_type_t *step = head->next;
1687 while (step != head) {
1688 sam_hrec_tag_t *tag = sam_hrecs_find_key(step, id, NULL);
1689 if (tag && tag->str && tag->len >= 3) {
1690 k = kh_get(rm, rh, tag->str+3);
1691 if (k == kh_end(rh)) { // value is not in the hash table, so remove
1692 sam_hrec_type_t *to_remove = step;
1693 step = step->next;
1694 ret |= sam_hrecs_remove_line(hrecs, type, to_remove);
1695 } else {
1696 step = step->next;
1697 }
1698 } else { // tag is not on the line, so skip to next line
1699 step = step->next;
1700 }
1701 }
1702
1703 // process the first line
1704 sam_hrec_tag_t * tag = sam_hrecs_find_key(head, id, NULL);
1705 if (tag && tag->str && tag->len >= 3) {
1706 k = kh_get(rm, rh, tag->str+3);
1707 if (k == kh_end(rh)) { // value is not in the hash table, so remove
1708 sam_hrec_type_t *to_remove = head;
1709 head = head->next;
1710 ret |= sam_hrecs_remove_line(hrecs, type, to_remove);
1711 }
1712 }
1713
1714 if (!ret && hrecs->dirty)
1715 redact_header_text(bh);
1716
1717 return ret;
1718 }
1719
sam_hdr_count_lines(sam_hdr_t * bh,const char * type)1720 int sam_hdr_count_lines(sam_hdr_t *bh, const char *type) {
1721 int count;
1722 sam_hrec_type_t *first_ty, *itr_ty;
1723
1724 if (!bh || !type)
1725 return -1;
1726
1727 if (!bh->hrecs) {
1728 if (sam_hdr_fill_hrecs(bh) != 0)
1729 return -1;
1730 }
1731
1732 // Deal with types that have counts
1733 switch (type[0]) {
1734 case 'S':
1735 if (type[1] == 'Q')
1736 return bh->hrecs->nref;
1737 break;
1738 case 'R':
1739 if (type[1] == 'G')
1740 return bh->hrecs->nrg;
1741 break;
1742 case 'P':
1743 if (type[1] == 'G')
1744 return bh->hrecs->npg;
1745 break;
1746 default:
1747 break;
1748 }
1749
1750 first_ty = sam_hrecs_find_type_id(bh->hrecs, type, NULL, NULL);
1751 if (!first_ty)
1752 return 0;
1753
1754 count = 1;
1755 for (itr_ty = first_ty->next;
1756 itr_ty && itr_ty != first_ty; itr_ty = itr_ty->next) {
1757 count++;
1758 }
1759
1760 return count;
1761 }
1762
sam_hdr_line_index(sam_hdr_t * bh,const char * type,const char * key)1763 int sam_hdr_line_index(sam_hdr_t *bh,
1764 const char *type,
1765 const char *key) {
1766 sam_hrecs_t *hrecs;
1767 if (!bh || !type || !key)
1768 return -2;
1769
1770 if (!(hrecs = bh->hrecs)) {
1771 if (sam_hdr_fill_hrecs(bh) != 0)
1772 return -2;
1773 hrecs = bh->hrecs;
1774 }
1775
1776 khint_t k;
1777 int idx = -1;
1778 switch (type[0]) {
1779 case 'S':
1780 if (type[1] == 'Q') {
1781 k = kh_get(m_s2i, hrecs->ref_hash, key);
1782 if (k != kh_end(hrecs->ref_hash))
1783 idx = kh_val(hrecs->ref_hash, k);
1784 } else {
1785 hts_log_warning("Type '%s' not supported. Only @SQ, @RG and @PG lines are indexed", type);
1786 }
1787 break;
1788 case 'R':
1789 if (type[1] == 'G') {
1790 k = kh_get(m_s2i, hrecs->rg_hash, key);
1791 if (k != kh_end(hrecs->rg_hash))
1792 idx = kh_val(hrecs->rg_hash, k);
1793 } else {
1794 hts_log_warning("Type '%s' not supported. Only @SQ, @RG and @PG lines are indexed", type);
1795 }
1796 break;
1797 case 'P':
1798 if (type[1] == 'G') {
1799 k = kh_get(m_s2i, hrecs->pg_hash, key);
1800 if (k != kh_end(hrecs->pg_hash))
1801 idx = kh_val(hrecs->pg_hash, k);
1802 } else {
1803 hts_log_warning("Type '%s' not supported. Only @SQ, @RG and @PG lines are indexed", type);
1804 }
1805 break;
1806 default:
1807 hts_log_warning("Type '%s' not supported. Only @SQ, @RG and @PG lines are indexed", type);
1808 }
1809
1810 return idx;
1811 }
1812
sam_hdr_line_name(sam_hdr_t * bh,const char * type,int pos)1813 const char *sam_hdr_line_name(sam_hdr_t *bh,
1814 const char *type,
1815 int pos) {
1816 sam_hrecs_t *hrecs;
1817 if (!bh || !type || pos < 0)
1818 return NULL;
1819
1820 if (!(hrecs = bh->hrecs)) {
1821 if (sam_hdr_fill_hrecs(bh) != 0)
1822 return NULL;
1823 hrecs = bh->hrecs;
1824 }
1825
1826 switch (type[0]) {
1827 case 'S':
1828 if (type[1] == 'Q') {
1829 if (pos < hrecs->nref)
1830 return hrecs->ref[pos].name;
1831 } else {
1832 hts_log_warning("Type '%s' not supported. Only @SQ, @RG and @PG lines are indexed", type);
1833 }
1834 break;
1835 case 'R':
1836 if (type[1] == 'G') {
1837 if (pos < hrecs->nrg)
1838 return hrecs->rg[pos].name;
1839 } else {
1840 hts_log_warning("Type '%s' not supported. Only @SQ, @RG and @PG lines are indexed", type);
1841 }
1842 break;
1843 case 'P':
1844 if (type[1] == 'G') {
1845 if (pos < hrecs->npg)
1846 return hrecs->pg[pos].name;
1847 } else {
1848 hts_log_warning("Type '%s' not supported. Only @SQ, @RG and @PG lines are indexed", type);
1849 }
1850 break;
1851 default:
1852 hts_log_warning("Type '%s' not supported. Only @SQ, @RG and @PG lines are indexed", type);
1853 }
1854
1855 return NULL;
1856 }
1857
1858 /* ==== Key:val level methods ==== */
1859
sam_hdr_find_tag_id(sam_hdr_t * bh,const char * type,const char * ID_key,const char * ID_value,const char * key,kstring_t * ks)1860 int sam_hdr_find_tag_id(sam_hdr_t *bh,
1861 const char *type,
1862 const char *ID_key,
1863 const char *ID_value,
1864 const char *key,
1865 kstring_t *ks) {
1866 sam_hrecs_t *hrecs;
1867 if (!bh || !type || !key)
1868 return -2;
1869
1870 if (!(hrecs = bh->hrecs)) {
1871 if (sam_hdr_fill_hrecs(bh) != 0)
1872 return -2;
1873 hrecs = bh->hrecs;
1874 }
1875
1876 sam_hrec_type_t *ty = sam_hrecs_find_type_id(hrecs, type, ID_key, ID_value);
1877 if (!ty)
1878 return -1;
1879
1880 sam_hrec_tag_t *tag = sam_hrecs_find_key(ty, key, NULL);
1881 if (!tag || !tag->str || tag->len < 4)
1882 return -1;
1883
1884 ks->l = 0;
1885 if (kputsn(tag->str+3, tag->len-3, ks) == EOF) {
1886 return -2;
1887 }
1888
1889 return 0;
1890 }
1891
sam_hdr_find_tag_pos(sam_hdr_t * bh,const char * type,int pos,const char * key,kstring_t * ks)1892 int sam_hdr_find_tag_pos(sam_hdr_t *bh,
1893 const char *type,
1894 int pos,
1895 const char *key,
1896 kstring_t *ks) {
1897 sam_hrecs_t *hrecs;
1898 if (!bh || !type || !key)
1899 return -2;
1900
1901 if (!(hrecs = bh->hrecs)) {
1902 if (sam_hdr_fill_hrecs(bh) != 0)
1903 return -2;
1904 hrecs = bh->hrecs;
1905 }
1906
1907 sam_hrec_type_t *ty = sam_hrecs_find_type_pos(hrecs, type, pos);
1908 if (!ty)
1909 return -1;
1910
1911 sam_hrec_tag_t *tag = sam_hrecs_find_key(ty, key, NULL);
1912 if (!tag || !tag->str || tag->len < 4)
1913 return -1;
1914
1915 ks->l = 0;
1916 if (kputsn(tag->str+3, tag->len-3, ks) == EOF) {
1917 return -2;
1918 }
1919
1920 return 0;
1921 }
1922
sam_hdr_remove_tag_id(sam_hdr_t * bh,const char * type,const char * ID_key,const char * ID_value,const char * key)1923 int sam_hdr_remove_tag_id(sam_hdr_t *bh,
1924 const char *type,
1925 const char *ID_key,
1926 const char *ID_value,
1927 const char *key) {
1928 sam_hrecs_t *hrecs;
1929 if (!bh || !type || !key)
1930 return -1;
1931
1932 if (!(hrecs = bh->hrecs)) {
1933 if (sam_hdr_fill_hrecs(bh) != 0)
1934 return -1;
1935 hrecs = bh->hrecs;
1936 }
1937
1938 sam_hrec_type_t *ty = sam_hrecs_find_type_id(hrecs, type, ID_key, ID_value);
1939 if (!ty)
1940 return -1;
1941
1942 int ret = sam_hrecs_remove_key(hrecs, ty, key);
1943 if (!ret && hrecs->dirty)
1944 redact_header_text(bh);
1945
1946 return ret;
1947 }
1948
1949 /*
1950 * Reconstructs a kstring from the header hash table.
1951 * Returns 0 on success
1952 * -1 on failure
1953 */
sam_hrecs_rebuild_text(const sam_hrecs_t * hrecs,kstring_t * ks)1954 int sam_hrecs_rebuild_text(const sam_hrecs_t *hrecs, kstring_t *ks) {
1955 ks->l = 0;
1956
1957 if (!hrecs->h || !hrecs->h->size) {
1958 return kputsn("", 0, ks) >= 0 ? 0 : -1;
1959 }
1960 if (sam_hrecs_rebuild_lines(hrecs, ks) != 0)
1961 return -1;
1962
1963 return 0;
1964 }
1965
1966 /*
1967 * Looks up a reference sequence by name and returns the numerical ID.
1968 * Returns -1 if unknown reference; -2 if header could not be parsed.
1969 */
sam_hdr_name2tid(sam_hdr_t * bh,const char * ref)1970 int sam_hdr_name2tid(sam_hdr_t *bh, const char *ref) {
1971 sam_hrecs_t *hrecs;
1972 khint_t k;
1973
1974 if (!bh)
1975 return -1;
1976
1977 if (!(hrecs = bh->hrecs)) {
1978 if (sam_hdr_fill_hrecs(bh) != 0)
1979 return -2;
1980 hrecs = bh->hrecs;
1981 }
1982
1983 if (!hrecs->ref_hash)
1984 return -1;
1985
1986 k = kh_get(m_s2i, hrecs->ref_hash, ref);
1987 return k == kh_end(hrecs->ref_hash) ? -1 : kh_val(hrecs->ref_hash, k);
1988 }
1989
sam_hdr_tid2name(const sam_hdr_t * h,int tid)1990 const char *sam_hdr_tid2name(const sam_hdr_t *h, int tid) {
1991 sam_hrecs_t *hrecs;
1992
1993 if (!h || tid < 0)
1994 return NULL;
1995
1996 if ((hrecs = h->hrecs) != NULL && tid < hrecs->nref) {
1997 return hrecs->ref[tid].name;
1998 } else {
1999 if (tid < h->n_targets)
2000 return h->target_name[tid];
2001 }
2002
2003 return NULL;
2004 }
2005
sam_hdr_tid2len(const sam_hdr_t * h,int tid)2006 hts_pos_t sam_hdr_tid2len(const sam_hdr_t *h, int tid) {
2007 sam_hrecs_t *hrecs;
2008
2009 if (!h || tid < 0)
2010 return 0;
2011
2012 if ((hrecs = h->hrecs) != NULL && tid < hrecs->nref) {
2013 return hrecs->ref[tid].len;
2014 } else {
2015 if (tid < h->n_targets) {
2016 if (h->target_len[tid] < UINT32_MAX || !h->sdict) {
2017 return h->target_len[tid];
2018 } else {
2019 khash_t(s2i) *long_refs = (khash_t(s2i) *) h->sdict;
2020 khint_t k = kh_get(s2i, long_refs, h->target_name[tid]);
2021 if (k < kh_end(long_refs)) {
2022 return kh_val(long_refs, k);
2023 } else {
2024 return UINT32_MAX;
2025 }
2026 }
2027 }
2028 }
2029
2030 return 0;
2031 }
2032
2033 /*
2034 * Fixes any PP links in @PG headers.
2035 * If the entries are in order then this doesn't need doing, but in case
2036 * our header is out of order this goes through the hrecs->pg[] array
2037 * setting the prev_id field.
2038 *
2039 * Note we can have multiple complete chains. This code should identify the
2040 * tails of these chains as these are the entries we have to link to in
2041 * subsequent PP records.
2042 *
2043 * Returns 0 on success
2044 * -1 on failure (indicating broken PG/PP records)
2045 */
sam_hdr_link_pg(sam_hdr_t * bh)2046 static int sam_hdr_link_pg(sam_hdr_t *bh) {
2047 sam_hrecs_t *hrecs;
2048 int i, j, ret = 0, *new_pg_end;
2049
2050 if (!bh)
2051 return -1;
2052
2053 if (!(hrecs = bh->hrecs)) {
2054 if (sam_hdr_fill_hrecs(bh) != 0)
2055 return -1;
2056 hrecs = bh->hrecs;
2057 }
2058
2059 if (!hrecs->pgs_changed || !hrecs->npg)
2060 return 0;
2061
2062 hrecs->npg_end_alloc = hrecs->npg;
2063 new_pg_end = realloc(hrecs->pg_end, hrecs->npg * sizeof(*new_pg_end));
2064 if (!new_pg_end)
2065 return -1;
2066 hrecs->pg_end = new_pg_end;
2067 int *chain_size = calloc(hrecs->npg, sizeof(int));
2068 if (!chain_size)
2069 return -1;
2070
2071 for (i = 0; i < hrecs->npg; i++)
2072 hrecs->pg_end[i] = i;
2073
2074 for (i = 0; i < hrecs->npg; i++) {
2075 khint_t k;
2076 sam_hrec_tag_t *tag;
2077
2078 assert(hrecs->pg[i].ty != NULL);
2079 for (tag = hrecs->pg[i].ty->tag; tag; tag = tag->next) {
2080 if (tag->str[0] == 'P' && tag->str[1] == 'P')
2081 break;
2082 }
2083 if (!tag) {
2084 /* Chain start points */
2085 continue;
2086 }
2087
2088 k = kh_get(m_s2i, hrecs->pg_hash, tag->str+3);
2089
2090 if (k == kh_end(hrecs->pg_hash)) {
2091 hts_log_warning("PG line with PN:%s has a PP link to missing program '%s'",
2092 hrecs->pg[i].name, tag->str+3);
2093 continue;
2094 }
2095
2096 hrecs->pg[i].prev_id = hrecs->pg[kh_val(hrecs->pg_hash, k)].id;
2097 hrecs->pg_end[kh_val(hrecs->pg_hash, k)] = -1;
2098 chain_size[i] = chain_size[kh_val(hrecs->pg_hash, k)]+1;
2099 }
2100
2101 for (i = j = 0; i < hrecs->npg; i++) {
2102 if (hrecs->pg_end[i] != -1 && chain_size[i] > 0)
2103 hrecs->pg_end[j++] = hrecs->pg_end[i];
2104 }
2105 /* Only leafs? Choose the last one! */
2106 if (!j && hrecs->npg_end > 0) {
2107 hrecs->pg_end[0] = hrecs->pg_end[hrecs->npg_end-1];
2108 j = 1;
2109 }
2110
2111 hrecs->npg_end = j;
2112 hrecs->pgs_changed = 0;
2113
2114 /* mark as dirty or empty for rebuild */
2115 hrecs->dirty = 1;
2116 redact_header_text(bh);
2117 free(chain_size);
2118
2119 return ret;
2120 }
2121
2122 /*
2123 * Returns a unique ID from a base name.
2124 *
2125 * The value returned is valid until the next call to
2126 * this function.
2127 */
sam_hdr_pg_id(sam_hdr_t * bh,const char * name)2128 const char *sam_hdr_pg_id(sam_hdr_t *bh, const char *name) {
2129 sam_hrecs_t *hrecs;
2130 size_t name_len;
2131 const size_t name_extra = 17;
2132 if (!bh || !name)
2133 return NULL;
2134
2135 if (!(hrecs = bh->hrecs)) {
2136 if (sam_hdr_fill_hrecs(bh) != 0)
2137 return NULL;
2138 hrecs = bh->hrecs;
2139 }
2140
2141 khint_t k = kh_get(m_s2i, hrecs->pg_hash, name);
2142 if (k == kh_end(hrecs->pg_hash))
2143 return name;
2144
2145 name_len = strlen(name);
2146 if (name_len > 1000) name_len = 1000;
2147 if (hrecs->ID_buf_sz < name_len + name_extra) {
2148 char *new_ID_buf = realloc(hrecs->ID_buf, name_len + name_extra);
2149 if (new_ID_buf == NULL)
2150 return NULL;
2151 hrecs->ID_buf = new_ID_buf;
2152 hrecs->ID_buf_sz = name_len + name_extra;
2153 }
2154
2155 do {
2156 snprintf(hrecs->ID_buf, hrecs->ID_buf_sz, "%.1000s.%d", name, hrecs->ID_cnt++);
2157 k = kh_get(m_s2i, hrecs->pg_hash, hrecs->ID_buf);
2158 } while (k != kh_end(hrecs->pg_hash));
2159
2160 return hrecs->ID_buf;
2161 }
2162
2163 /*
2164 * Add an @PG line.
2165 *
2166 * If we wish complete control over this use sam_hdr_add_line() directly. This
2167 * function uses that, but attempts to do a lot of tedious house work for
2168 * you too.
2169 *
2170 * - It will generate a suitable ID if the supplied one clashes.
2171 * - It will generate multiple @PG records if we have multiple PG chains.
2172 *
2173 * Call it as per sam_hdr_add_line() with a series of key,value pairs ending
2174 * in NULL.
2175 *
2176 * Returns 0 on success
2177 * -1 on failure
2178 */
sam_hdr_add_pg(sam_hdr_t * bh,const char * name,...)2179 int sam_hdr_add_pg(sam_hdr_t *bh, const char *name, ...) {
2180 sam_hrecs_t *hrecs;
2181 const char *specified_id = NULL, *specified_pn = NULL, *specified_pp = NULL;
2182 const char *key, *val;
2183 if (!bh)
2184 return -1;
2185
2186 if (!(hrecs = bh->hrecs)) {
2187 if (sam_hdr_fill_hrecs(bh) != 0)
2188 return -1;
2189 hrecs = bh->hrecs;
2190 }
2191
2192 bh->hrecs->pgs_changed = 1;
2193 if (sam_hdr_link_pg(bh) < 0) {
2194 hts_log_error("Error linking @PG lines");
2195 return -1;
2196 }
2197
2198 va_list args;
2199 // Check for ID / PN / PP tags in varargs list
2200 va_start(args, name);
2201 while ((key = va_arg(args, const char *)) != NULL) {
2202 val = va_arg(args, const char *);
2203 if (!val) break;
2204 if (strcmp(key, "PN") == 0 && *val != '\0')
2205 specified_pn = val;
2206 else if (strcmp(key, "PP") == 0 && *val != '\0')
2207 specified_pp = val;
2208 else if (strcmp(key, "ID") == 0 && *val != '\0')
2209 specified_id = val;
2210 }
2211 va_end(args);
2212
2213 if (specified_id && hrecs->pg_hash) {
2214 khint_t k = kh_get(m_s2i, hrecs->pg_hash, specified_id);
2215 if (k != kh_end(hrecs->pg_hash)) {
2216 hts_log_error("Header @PG ID:%s already present", specified_id);
2217 return -1;
2218 }
2219 }
2220
2221 if (specified_pp && hrecs->pg_hash) {
2222 khint_t k = kh_get(m_s2i, hrecs->pg_hash, specified_pp);
2223 if (k == kh_end(hrecs->pg_hash)) {
2224 hts_log_error("Header @PG ID:%s referred to by PP tag not present",
2225 specified_pp);
2226 return -1;
2227 }
2228 }
2229
2230 if (!specified_pp && hrecs->npg_end) {
2231 /* Copy ends array to avoid us looping while modifying it */
2232 int *end = malloc(hrecs->npg_end * sizeof(int));
2233 int i, nends = hrecs->npg_end;
2234
2235 if (!end)
2236 return -1;
2237
2238 memcpy(end, hrecs->pg_end, nends * sizeof(*end));
2239
2240 for (i = 0; i < nends; i++) {
2241 const char *id = !specified_id ? sam_hdr_pg_id(bh, name) : "";
2242 if (!id) {
2243 free(end);
2244 return -1;
2245 }
2246 va_start(args, name);
2247 if (-1 == sam_hrecs_vadd(hrecs, "PG", args,
2248 "ID", id,
2249 "PN", !specified_pn ? name : "",
2250 "PP", hrecs->pg[end[i]].name,
2251 NULL)) {
2252 free(end);
2253 return -1;
2254 }
2255 va_end(args);
2256 }
2257
2258 free(end);
2259 } else {
2260 const char *id = !specified_id ? sam_hdr_pg_id(bh, name) : "";
2261 if (!id)
2262 return -1;
2263 va_start(args, name);
2264 if (-1 == sam_hrecs_vadd(hrecs, "PG", args,
2265 "ID", id,
2266 "PN", !specified_pn ? name : "",
2267 NULL))
2268 return -1;
2269 va_end(args);
2270 }
2271
2272 hrecs->dirty = 1;
2273 redact_header_text(bh);
2274
2275 return 0;
2276 }
2277
2278 /*! Increments a reference count on bh.
2279 *
2280 * This permits multiple files to share the same header, all calling
2281 * sam_hdr_destroy when done, without causing errors for other open files.
2282 */
sam_hdr_incr_ref(sam_hdr_t * bh)2283 void sam_hdr_incr_ref(sam_hdr_t *bh) {
2284 if (!bh)
2285 return;
2286 bh->ref_count++;
2287 }
2288
2289 /* ==== Internal methods ==== */
2290
2291 /*
2292 * Creates an empty SAM header. Allocates space for the SAM header
2293 * structures (hash tables) ready to be populated.
2294 *
2295 * Returns a sam_hrecs_t struct on success (free with sam_hrecs_free())
2296 * NULL on failure
2297 */
sam_hrecs_new()2298 sam_hrecs_t *sam_hrecs_new() {
2299 sam_hrecs_t *hrecs = calloc(1, sizeof(*hrecs));
2300
2301 if (!hrecs)
2302 return NULL;
2303
2304 hrecs->h = kh_init(sam_hrecs_t);
2305 if (!hrecs->h)
2306 goto err;
2307
2308 hrecs->ID_cnt = 1;
2309
2310 hrecs->nref = 0;
2311 hrecs->ref_sz = 0;
2312 hrecs->ref = NULL;
2313 if (!(hrecs->ref_hash = kh_init(m_s2i)))
2314 goto err;
2315 hrecs->refs_changed = -1;
2316
2317 hrecs->nrg = 0;
2318 hrecs->rg_sz = 0;
2319 hrecs->rg = NULL;
2320 if (!(hrecs->rg_hash = kh_init(m_s2i)))
2321 goto err;
2322
2323 hrecs->npg = 0;
2324 hrecs->pg_sz = 0;
2325 hrecs->pg = NULL;
2326 hrecs->npg_end = hrecs->npg_end_alloc = 0;
2327 hrecs->pg_end = NULL;
2328 if (!(hrecs->pg_hash = kh_init(m_s2i)))
2329 goto err;
2330
2331 if (!(hrecs->tag_pool = pool_create(sizeof(sam_hrec_tag_t))))
2332 goto err;
2333
2334 if (!(hrecs->type_pool = pool_create(sizeof(sam_hrec_type_t))))
2335 goto err;
2336
2337 if (!(hrecs->str_pool = string_pool_create(65536)))
2338 goto err;
2339
2340 if (sam_hrecs_init_type_order(hrecs, NULL))
2341 goto err;
2342
2343 return hrecs;
2344
2345 err:
2346 if (hrecs->h)
2347 kh_destroy(sam_hrecs_t, hrecs->h);
2348
2349 if (hrecs->tag_pool)
2350 pool_destroy(hrecs->tag_pool);
2351
2352 if (hrecs->type_pool)
2353 pool_destroy(hrecs->type_pool);
2354
2355 if (hrecs->str_pool)
2356 string_pool_destroy(hrecs->str_pool);
2357
2358 free(hrecs);
2359
2360 return NULL;
2361 }
2362 #if 0
2363 /*
2364 * Produces a duplicate copy of source and returns it.
2365 * Returns NULL on failure
2366 */
2367 sam_hrecs_t *sam_hrecs_dup(sam_hrecs_t *source) {
2368 return NULL;
2369 }
2370 #endif
2371 /*! Deallocates all storage used by a sam_hrecs_t struct.
2372 *
2373 * This also decrements the header reference count. If after decrementing
2374 * it is still non-zero then the header is assumed to be in use by another
2375 * caller and the free is not done.
2376 *
2377 */
sam_hrecs_free(sam_hrecs_t * hrecs)2378 void sam_hrecs_free(sam_hrecs_t *hrecs) {
2379 if (!hrecs)
2380 return;
2381
2382 if (hrecs->h)
2383 kh_destroy(sam_hrecs_t, hrecs->h);
2384
2385 if (hrecs->ref_hash)
2386 kh_destroy(m_s2i, hrecs->ref_hash);
2387
2388 if (hrecs->ref)
2389 free(hrecs->ref);
2390
2391 if (hrecs->rg_hash)
2392 kh_destroy(m_s2i, hrecs->rg_hash);
2393
2394 if (hrecs->rg)
2395 free(hrecs->rg);
2396
2397 if (hrecs->pg_hash)
2398 kh_destroy(m_s2i, hrecs->pg_hash);
2399
2400 if (hrecs->pg)
2401 free(hrecs->pg);
2402
2403 if (hrecs->pg_end)
2404 free(hrecs->pg_end);
2405
2406 if (hrecs->type_pool)
2407 pool_destroy(hrecs->type_pool);
2408
2409 if (hrecs->tag_pool)
2410 pool_destroy(hrecs->tag_pool);
2411
2412 if (hrecs->str_pool)
2413 string_pool_destroy(hrecs->str_pool);
2414
2415 if (hrecs->type_order)
2416 free(hrecs->type_order);
2417
2418 if (hrecs->ID_buf)
2419 free(hrecs->ID_buf);
2420
2421 free(hrecs);
2422 }
2423
2424 /*
2425 * Internal method already used by the CRAM code
2426 * Returns the first header item matching 'type'. If ID is non-NULL it checks
2427 * for the tag ID: and compares against the specified ID.
2428 *
2429 * Returns NULL if no type/ID is found
2430 */
sam_hrecs_find_type_id(sam_hrecs_t * hrecs,const char * type,const char * ID_key,const char * ID_value)2431 sam_hrec_type_t *sam_hrecs_find_type_id(sam_hrecs_t *hrecs, const char *type,
2432 const char *ID_key, const char *ID_value) {
2433 if (!hrecs || !type)
2434 return NULL;
2435 sam_hrec_type_t *t1, *t2;
2436 khint_t k;
2437
2438 /* Special case for types we have prebuilt hashes on */
2439 if (ID_key) {
2440 if (!ID_value)
2441 return NULL;
2442
2443 if (type[0] == 'S' && type[1] == 'Q' &&
2444 ID_key[0] == 'S' && ID_key[1] == 'N') {
2445 k = kh_get(m_s2i, hrecs->ref_hash, ID_value);
2446 return k != kh_end(hrecs->ref_hash)
2447 ? hrecs->ref[kh_val(hrecs->ref_hash, k)].ty
2448 : NULL;
2449 }
2450
2451 if (type[0] == 'R' && type[1] == 'G' &&
2452 ID_key[0] == 'I' && ID_key[1] == 'D') {
2453 k = kh_get(m_s2i, hrecs->rg_hash, ID_value);
2454 return k != kh_end(hrecs->rg_hash)
2455 ? hrecs->rg[kh_val(hrecs->rg_hash, k)].ty
2456 : NULL;
2457 }
2458
2459 if (type[0] == 'P' && type[1] == 'G' &&
2460 ID_key[0] == 'I' && ID_key[1] == 'D') {
2461 k = kh_get(m_s2i, hrecs->pg_hash, ID_value);
2462 return k != kh_end(hrecs->pg_hash)
2463 ? hrecs->pg[kh_val(hrecs->pg_hash, k)].ty
2464 : NULL;
2465 }
2466 }
2467
2468 k = kh_get(sam_hrecs_t, hrecs->h, TYPEKEY(type));
2469 if (k == kh_end(hrecs->h))
2470 return NULL;
2471
2472 if (!ID_key)
2473 return kh_val(hrecs->h, k);
2474
2475 t1 = t2 = kh_val(hrecs->h, k);
2476 do {
2477 sam_hrec_tag_t *tag;
2478 for (tag = t1->tag; tag; tag = tag->next) {
2479 if (tag->str[0] == ID_key[0] && tag->str[1] == ID_key[1]) {
2480 const char *cp1 = tag->str+3;
2481 const char *cp2 = ID_value;
2482 while (*cp1 && *cp1 == *cp2)
2483 cp1++, cp2++;
2484 if (*cp2 || *cp1)
2485 continue;
2486 return t1;
2487 }
2488 }
2489 t1 = t1->next;
2490 } while (t1 != t2);
2491
2492 return NULL;
2493 }
2494
2495 /*
2496 * Adds or updates tag key,value pairs in a header line.
2497 * Eg for adding M5 tags to @SQ lines or updating sort order for the
2498 * @HD line.
2499 *
2500 * va_list contains multiple key,value pairs ending in NULL.
2501 *
2502 * Returns 0 on success
2503 * -1 on failure
2504 */
sam_hrecs_vupdate(sam_hrecs_t * hrecs,sam_hrec_type_t * type,va_list ap)2505 int sam_hrecs_vupdate(sam_hrecs_t *hrecs, sam_hrec_type_t *type, va_list ap) {
2506 if (!hrecs)
2507 return -1;
2508
2509 for (;;) {
2510 char *k, *v, *str;
2511 sam_hrec_tag_t *tag, *prev = NULL;
2512
2513 if (!(k = (char *)va_arg(ap, char *)))
2514 break;
2515 if (!(v = va_arg(ap, char *)))
2516 v = "";
2517
2518 tag = sam_hrecs_find_key(type, k, &prev);
2519 if (!tag) {
2520 if (!(tag = pool_alloc(hrecs->tag_pool)))
2521 return -1;
2522 if (prev)
2523 prev->next = tag;
2524 else
2525 type->tag = tag;
2526
2527 tag->next = NULL;
2528 }
2529
2530 tag->len = 3 + strlen(v);
2531 str = string_alloc(hrecs->str_pool, tag->len+1);
2532 if (!str)
2533 return -1;
2534
2535 if (snprintf(str, tag->len+1, "%2.2s:%s", k, v) < 0)
2536 return -1;
2537
2538 tag->str = str;
2539 }
2540
2541 hrecs->dirty = 1; //mark text as dirty and force a rebuild
2542
2543 return 0;
2544 }
2545
2546 /*
2547 * Adds or updates tag key,value pairs in a header line.
2548 * Eg for adding M5 tags to @SQ lines or updating sort order for the
2549 * @HD line.
2550 *
2551 * Specify multiple key,value pairs ending in NULL.
2552 *
2553 * Returns 0 on success
2554 * -1 on failure
2555 */
sam_hrecs_update(sam_hrecs_t * hrecs,sam_hrec_type_t * type,...)2556 static int sam_hrecs_update(sam_hrecs_t *hrecs, sam_hrec_type_t *type, ...) {
2557 va_list args;
2558 int res;
2559 va_start(args, type);
2560 res = sam_hrecs_vupdate(hrecs, type, args);
2561 va_end(args);
2562 return res;
2563 }
2564
2565 /*
2566 * Looks for a specific key in a single sam header line identified by *type.
2567 * If prev is non-NULL it also fills this out with the previous tag, to
2568 * permit use in key removal. *prev is set to NULL when the tag is the first
2569 * key in the list. When a tag isn't found, prev (if non NULL) will be the last
2570 * tag in the existing list.
2571 *
2572 * Returns the tag pointer on success
2573 * NULL on failure
2574 */
sam_hrecs_find_key(sam_hrec_type_t * type,const char * key,sam_hrec_tag_t ** prev)2575 sam_hrec_tag_t *sam_hrecs_find_key(sam_hrec_type_t *type,
2576 const char *key,
2577 sam_hrec_tag_t **prev) {
2578 sam_hrec_tag_t *tag, *p = NULL;
2579 if (!type)
2580 return NULL;
2581
2582 for (tag = type->tag; tag; p = tag, tag = tag->next) {
2583 if (tag->str[0] == key[0] && tag->str[1] == key[1]) {
2584 if (prev)
2585 *prev = p;
2586 return tag;
2587 }
2588 }
2589
2590 if (prev)
2591 *prev = p;
2592
2593 return NULL;
2594 }
2595
sam_hrecs_remove_key(sam_hrecs_t * hrecs,sam_hrec_type_t * type,const char * key)2596 int sam_hrecs_remove_key(sam_hrecs_t *hrecs,
2597 sam_hrec_type_t *type,
2598 const char *key) {
2599 sam_hrec_tag_t *tag, *prev;
2600 if (!hrecs)
2601 return -1;
2602 tag = sam_hrecs_find_key(type, key, &prev);
2603 if (!tag)
2604 return 0; // Not there anyway
2605
2606 if (type->type == TYPEKEY("SQ") && tag->str[0] == 'A' && tag->str[1] == 'N') {
2607 assert(tag->len >= 3);
2608 sam_hrec_tag_t *sn_tag = sam_hrecs_find_key(type, "SN", NULL);
2609 if (sn_tag) {
2610 assert(sn_tag->len >= 3);
2611 khint_t k = kh_get(m_s2i, hrecs->ref_hash, sn_tag->str + 3);
2612 if (k != kh_end(hrecs->ref_hash))
2613 sam_hrecs_remove_ref_altnames(hrecs, kh_val(hrecs->ref_hash, k), tag->str + 3);
2614 }
2615 }
2616
2617 if (!prev) { //first tag
2618 type->tag = tag->next;
2619 } else {
2620 prev->next = tag->next;
2621 }
2622 pool_free(hrecs->tag_pool, tag);
2623 hrecs->dirty = 1; //mark text as dirty and force a rebuild
2624
2625 return 1;
2626 }
2627
2628 /*
2629 * Looks up a read-group by name and returns a pointer to the start of the
2630 * associated tag list.
2631 *
2632 * Returns NULL on failure
2633 */
sam_hrecs_find_rg(sam_hrecs_t * hrecs,const char * rg)2634 sam_hrec_rg_t *sam_hrecs_find_rg(sam_hrecs_t *hrecs, const char *rg) {
2635 khint_t k = kh_get(m_s2i, hrecs->rg_hash, rg);
2636 return k == kh_end(hrecs->rg_hash)
2637 ? NULL
2638 : &hrecs->rg[kh_val(hrecs->rg_hash, k)];
2639 }
2640
2641 #if DEBUG_HEADER
sam_hrecs_dump(sam_hrecs_t * hrecs)2642 void sam_hrecs_dump(sam_hrecs_t *hrecs) {
2643 khint_t k;
2644 int i;
2645
2646 printf("===DUMP===\n");
2647 for (k = kh_begin(hrecs->h); k != kh_end(hrecs->h); k++) {
2648 sam_hrec_type_t *t1, *t2;
2649 char c[2];
2650 int idx = 0;
2651
2652 if (!kh_exist(hrecs->h, k))
2653 continue;
2654
2655 t1 = t2 = kh_val(hrecs->h, k);
2656 c[0] = kh_key(hrecs->h, k)>>8;
2657 c[1] = kh_key(hrecs->h, k)&0xff;
2658 printf("Type %.2s\n", c);
2659
2660 do {
2661 sam_hrec_tag_t *tag;
2662 printf(">>>%d ", idx++);
2663 for (tag = t1->tag; tag; tag=tag->next) {
2664 if (strncmp(c, "CO", 2))
2665 printf("\"%.2s\":\"%.*s\"\t", tag->str, tag->len-3, tag->str+3);
2666 else
2667 printf("%s", tag->str);
2668 }
2669 putchar('\n');
2670 t1 = t1->next;
2671 } while (t1 != t2);
2672 }
2673
2674 /* Dump out PG chains */
2675 printf("\n@PG chains:\n");
2676 for (i = 0; i < hrecs->npg_end; i++) {
2677 int j;
2678 printf(" %d:", i);
2679 for (j = hrecs->pg_end[i]; j != -1; j = hrecs->pg[j].prev_id) {
2680 printf("%s%d(%.*s)",
2681 j == hrecs->pg_end[i] ? " " : "->",
2682 j, hrecs->pg[j].name_len, hrecs->pg[j].name);
2683 }
2684 printf("\n");
2685 }
2686
2687 puts("===END DUMP===");
2688 }
2689 #endif
2690
2691 /*
2692 * Returns the sort order:
2693 */
sam_hrecs_sort_order(sam_hrecs_t * hrecs)2694 enum sam_sort_order sam_hrecs_sort_order(sam_hrecs_t *hrecs) {
2695 khint_t k;
2696 enum sam_sort_order so;
2697
2698 so = ORDER_UNKNOWN;
2699 k = kh_get(sam_hrecs_t, hrecs->h, TYPEKEY("HD"));
2700 if (k != kh_end(hrecs->h)) {
2701 sam_hrec_type_t *ty = kh_val(hrecs->h, k);
2702 sam_hrec_tag_t *tag;
2703 for (tag = ty->tag; tag; tag = tag->next) {
2704 if (tag->str[0] == 'S' && tag->str[1] == 'O') {
2705 if (strcmp(tag->str+3, "unsorted") == 0)
2706 so = ORDER_UNSORTED;
2707 else if (strcmp(tag->str+3, "queryname") == 0)
2708 so = ORDER_NAME;
2709 else if (strcmp(tag->str+3, "coordinate") == 0)
2710 so = ORDER_COORD;
2711 else if (strcmp(tag->str+3, "unknown") != 0)
2712 hts_log_error("Unknown sort order field: %s", tag->str+3);
2713 }
2714 }
2715 }
2716
2717 return so;
2718 }
2719
sam_hrecs_group_order(sam_hrecs_t * hrecs)2720 enum sam_group_order sam_hrecs_group_order(sam_hrecs_t *hrecs) {
2721 khint_t k;
2722 enum sam_group_order go;
2723
2724 go = ORDER_NONE;
2725 k = kh_get(sam_hrecs_t, hrecs->h, TYPEKEY("HD"));
2726 if (k != kh_end(hrecs->h)) {
2727 sam_hrec_type_t *ty = kh_val(hrecs->h, k);
2728 sam_hrec_tag_t *tag;
2729 for (tag = ty->tag; tag; tag = tag->next) {
2730 if (tag->str[0] == 'G' && tag->str[1] == 'O') {
2731 if (strcmp(tag->str+3, "query") == 0)
2732 go = ORDER_QUERY;
2733 else if (strcmp(tag->str+3, "reference") == 0)
2734 go = ORDER_REFERENCE;
2735 }
2736 }
2737 }
2738
2739 return go;
2740 }
2741