1 #include "sam_header.h"
2 #include <stdio.h>
3 #include <string.h>
4 #include <ctype.h>
5 #include <stdlib.h>
6 #include <stdarg.h>
7
8 #include "khash.h"
9 KHASH_MAP_INIT_STR(str, const char *)
10
11 struct _HeaderList
12 {
13 struct _HeaderList *last; // Hack: Used and maintained only by list_append_to_end. Maintained in the root node only.
14 struct _HeaderList *next;
15 void *data;
16 };
17 typedef struct _HeaderList list_t;
18 typedef list_t HeaderDict;
19
20 typedef struct
21 {
22 char key[2];
23 char *value;
24 }
25 HeaderTag;
26
27 typedef struct
28 {
29 char type[2];
30 list_t *tags;
31 }
32 HeaderLine;
33
34 const char *o_hd_tags[] = {"SO","GO",NULL};
35 const char *r_hd_tags[] = {"VN",NULL};
36
37 const char *o_sq_tags[] = {"AS","M5","UR","SP",NULL};
38 const char *r_sq_tags[] = {"SN","LN",NULL};
39 const char *u_sq_tags[] = {"SN",NULL};
40
41 const char *o_rg_tags[] = {"LB","DS","PU","PI","CN","DT","PL",NULL};
42 const char *r_rg_tags[] = {"ID",NULL};
43 const char *u_rg_tags[] = {"ID",NULL};
44
45 const char *o_pg_tags[] = {"VN","CL",NULL};
46 const char *r_pg_tags[] = {"ID",NULL};
47
48 const char *types[] = {"HD","SQ","RG","PG","CO",NULL};
49 const char **optional_tags[] = {o_hd_tags,o_sq_tags,o_rg_tags,o_pg_tags,NULL,NULL};
50 const char **required_tags[] = {r_hd_tags,r_sq_tags,r_rg_tags,r_pg_tags,NULL,NULL};
51 const char **unique_tags[] = {NULL, u_sq_tags,u_rg_tags,NULL,NULL,NULL};
52
53
debug(const char * format,...)54 static void debug(const char *format, ...)
55 {
56 va_list ap;
57 va_start(ap, format);
58 vfprintf(stderr, format, ap);
59 va_end(ap);
60 }
61
62 #if 0
63 // Replaced by list_append_to_end
64 static list_t *list_prepend(list_t *root, void *data)
65 {
66 list_t *l = malloc(sizeof(list_t));
67 l->next = root;
68 l->data = data;
69 return l;
70 }
71 #endif
72
73 // Relies on the root->last being correct. Do not use with the other list_*
74 // routines unless they are fixed to modify root->last as well.
list_append_to_end(list_t * root,void * data)75 static list_t *list_append_to_end(list_t *root, void *data)
76 {
77 list_t *l = malloc(sizeof(list_t));
78 l->last = l;
79 l->next = NULL;
80 l->data = data;
81
82 if ( !root )
83 return l;
84
85 root->last->next = l;
86 root->last = l;
87 return root;
88 }
89
list_append(list_t * root,void * data)90 static list_t *list_append(list_t *root, void *data)
91 {
92 list_t *l = root;
93 while (l && l->next)
94 l = l->next;
95 if ( l )
96 {
97 l->next = malloc(sizeof(list_t));
98 l = l->next;
99 }
100 else
101 {
102 l = malloc(sizeof(list_t));
103 root = l;
104 }
105 l->data = data;
106 l->next = NULL;
107 return root;
108 }
109
list_free(list_t * root)110 static void list_free(list_t *root)
111 {
112 list_t *l = root;
113 while (root)
114 {
115 l = root;
116 root = root->next;
117 free(l);
118 }
119 }
120
121
122
123 // Look for a tag "XY" in a predefined const char *[] array.
tag_exists(const char * tag,const char ** tags)124 static int tag_exists(const char *tag, const char **tags)
125 {
126 int itag=0;
127 if ( !tags ) return -1;
128 while ( tags[itag] )
129 {
130 if ( tags[itag][0]==tag[0] && tags[itag][1]==tag[1] ) return itag;
131 itag++;
132 }
133 return -1;
134 }
135
136
137
138 // Mimics the behaviour of getline, except it returns pointer to the next chunk of the text
139 // or NULL if everything has been read. The lineptr should be freed by the caller. The
140 // newline character is stripped.
nextline(char ** lineptr,size_t * n,const char * text)141 static const char *nextline(char **lineptr, size_t *n, const char *text)
142 {
143 int len;
144 const char *to = text;
145
146 if ( !*to ) return NULL;
147
148 while ( *to && *to!='\n' && *to!='\r' ) to++;
149 len = to - text + 1;
150
151 if ( *to )
152 {
153 // Advance the pointer for the next call
154 if ( *to=='\n' ) to++;
155 else if ( *to=='\r' && *(to+1)=='\n' ) to+=2;
156 }
157 if ( !len )
158 return to;
159
160 if ( !*lineptr )
161 {
162 *lineptr = malloc(len);
163 *n = len;
164 }
165 else if ( *n<len )
166 {
167 *lineptr = realloc(*lineptr, len);
168 *n = len;
169 }
170 if ( !*lineptr ) {
171 debug("[nextline] Insufficient memory!\n");
172 return 0;
173 }
174
175 memcpy(*lineptr,text,len);
176 (*lineptr)[len-1] = 0;
177
178 return to;
179 }
180
181 // name points to "XY", value_from points to the first character of the value string and
182 // value_to points to the last character of the value string.
new_tag(const char * name,const char * value_from,const char * value_to)183 static HeaderTag *new_tag(const char *name, const char *value_from, const char *value_to)
184 {
185 HeaderTag *tag = malloc(sizeof(HeaderTag));
186 int len = value_to-value_from+1;
187
188 tag->key[0] = name[0];
189 tag->key[1] = name[1];
190 tag->value = malloc(len+1);
191 memcpy(tag->value,value_from,len+1);
192 tag->value[len] = 0;
193 return tag;
194 }
195
header_line_has_tag(HeaderLine * hline,const char * key)196 static HeaderTag *header_line_has_tag(HeaderLine *hline, const char *key)
197 {
198 list_t *tags = hline->tags;
199 while (tags)
200 {
201 HeaderTag *tag = tags->data;
202 if ( tag->key[0]==key[0] && tag->key[1]==key[1] ) return tag;
203 tags = tags->next;
204 }
205 return NULL;
206 }
207
208
209 // Return codes:
210 // 0 .. different types or unique tags differ or conflicting tags, cannot be merged
211 // 1 .. all tags identical -> no need to merge, drop one
212 // 2 .. the unique tags match and there are some conflicting tags (same tag, different value) -> error, cannot be merged nor duplicated
213 // 3 .. there are some missing complementary tags and no unique conflict -> can be merged into a single line
sam_header_compare_lines(HeaderLine * hline1,HeaderLine * hline2)214 static int sam_header_compare_lines(HeaderLine *hline1, HeaderLine *hline2)
215 {
216 HeaderTag *t1, *t2;
217
218 if ( hline1->type[0]!=hline2->type[0] || hline1->type[1]!=hline2->type[1] )
219 return 0;
220
221 int itype = tag_exists(hline1->type,types);
222 if ( itype==-1 ) {
223 debug("[sam_header_compare_lines] Unknown type [%c%c]\n", hline1->type[0],hline1->type[1]);
224 return -1; // FIXME (lh3): error; I do not know how this will be handled in Petr's code
225 }
226
227 if ( unique_tags[itype] )
228 {
229 t1 = header_line_has_tag(hline1,unique_tags[itype][0]);
230 t2 = header_line_has_tag(hline2,unique_tags[itype][0]);
231 if ( !t1 || !t2 ) // this should never happen, the unique tags are required
232 return 2;
233
234 if ( strcmp(t1->value,t2->value) )
235 return 0; // the unique tags differ, cannot be merged
236 }
237 if ( !required_tags[itype] && !optional_tags[itype] )
238 {
239 t1 = hline1->tags->data;
240 t2 = hline2->tags->data;
241 if ( !strcmp(t1->value,t2->value) ) return 1; // identical comments
242 return 0;
243 }
244
245 int missing=0, itag=0;
246 while ( required_tags[itype] && required_tags[itype][itag] )
247 {
248 t1 = header_line_has_tag(hline1,required_tags[itype][itag]);
249 t2 = header_line_has_tag(hline2,required_tags[itype][itag]);
250 if ( !t1 && !t2 )
251 return 2; // this should never happen
252 else if ( !t1 || !t2 )
253 missing = 1; // there is some tag missing in one of the hlines
254 else if ( strcmp(t1->value,t2->value) )
255 {
256 if ( unique_tags[itype] )
257 return 2; // the lines have a matching unique tag but have a conflicting tag
258
259 return 0; // the lines contain conflicting tags, cannot be merged
260 }
261 itag++;
262 }
263 itag = 0;
264 while ( optional_tags[itype] && optional_tags[itype][itag] )
265 {
266 t1 = header_line_has_tag(hline1,optional_tags[itype][itag]);
267 t2 = header_line_has_tag(hline2,optional_tags[itype][itag]);
268 if ( !t1 && !t2 )
269 {
270 itag++;
271 continue;
272 }
273 if ( !t1 || !t2 )
274 missing = 1; // there is some tag missing in one of the hlines
275 else if ( strcmp(t1->value,t2->value) )
276 {
277 if ( unique_tags[itype] )
278 return 2; // the lines have a matching unique tag but have a conflicting tag
279
280 return 0; // the lines contain conflicting tags, cannot be merged
281 }
282 itag++;
283 }
284 if ( missing ) return 3; // there are some missing complementary tags with no conflicts, can be merged
285 return 1;
286 }
287
288
sam_header_line_clone(const HeaderLine * hline)289 static HeaderLine *sam_header_line_clone(const HeaderLine *hline)
290 {
291 list_t *tags;
292 HeaderLine *out = malloc(sizeof(HeaderLine));
293 out->type[0] = hline->type[0];
294 out->type[1] = hline->type[1];
295 out->tags = NULL;
296
297 tags = hline->tags;
298 while (tags)
299 {
300 HeaderTag *old = tags->data;
301
302 HeaderTag *new = malloc(sizeof(HeaderTag));
303 new->key[0] = old->key[0];
304 new->key[1] = old->key[1];
305 new->value = strdup(old->value);
306 out->tags = list_append(out->tags, new);
307
308 tags = tags->next;
309 }
310 return out;
311 }
312
sam_header_line_merge_with(HeaderLine * out_hline,const HeaderLine * tmpl_hline)313 static int sam_header_line_merge_with(HeaderLine *out_hline, const HeaderLine *tmpl_hline)
314 {
315 list_t *tmpl_tags;
316
317 if ( out_hline->type[0]!=tmpl_hline->type[0] || out_hline->type[1]!=tmpl_hline->type[1] )
318 return 0;
319
320 tmpl_tags = tmpl_hline->tags;
321 while (tmpl_tags)
322 {
323 HeaderTag *tmpl_tag = tmpl_tags->data;
324 HeaderTag *out_tag = header_line_has_tag(out_hline, tmpl_tag->key);
325 if ( !out_tag )
326 {
327 HeaderTag *tag = malloc(sizeof(HeaderTag));
328 tag->key[0] = tmpl_tag->key[0];
329 tag->key[1] = tmpl_tag->key[1];
330 tag->value = strdup(tmpl_tag->value);
331 out_hline->tags = list_append(out_hline->tags,tag);
332 }
333 tmpl_tags = tmpl_tags->next;
334 }
335 return 1;
336 }
337
338
sam_header_line_parse(const char * headerLine)339 static HeaderLine *sam_header_line_parse(const char *headerLine)
340 {
341 HeaderLine *hline;
342 HeaderTag *tag;
343 const char *from, *to;
344 from = headerLine;
345
346 if ( *from != '@' ) {
347 debug("[sam_header_line_parse] expected '@', got [%s]\n", headerLine);
348 return 0;
349 }
350 to = ++from;
351
352 while (*to && *to!='\t') to++;
353 if ( to-from != 2 ) {
354 debug("[sam_header_line_parse] expected '@XY', got [%s]\nHint: The header tags must be tab-separated.\n", headerLine);
355 return 0;
356 }
357
358 hline = malloc(sizeof(HeaderLine));
359 hline->type[0] = from[0];
360 hline->type[1] = from[1];
361 hline->tags = NULL;
362
363 int itype = tag_exists(hline->type, types);
364
365 from = to;
366 while (*to && *to=='\t') to++;
367 if ( to-from != 1 ) {
368 debug("[sam_header_line_parse] multiple tabs on line [%s] (%d)\n", headerLine,(int)(to-from));
369 return 0;
370 }
371 from = to;
372 while (*from)
373 {
374 while (*to && *to!='\t') to++;
375
376 if ( !required_tags[itype] && !optional_tags[itype] )
377 {
378 // CO is a special case, it can contain anything, including tabs
379 if ( *to ) { to++; continue; }
380 tag = new_tag(" ",from,to-1);
381 }
382 else
383 tag = new_tag(from,from+3,to-1);
384
385 if ( header_line_has_tag(hline,tag->key) )
386 debug("The tag '%c%c' present (at least) twice on line [%s]\n", tag->key[0],tag->key[1], headerLine);
387 hline->tags = list_append(hline->tags, tag);
388
389 from = to;
390 while (*to && *to=='\t') to++;
391 if ( *to && to-from != 1 ) {
392 debug("[sam_header_line_parse] multiple tabs on line [%s] (%d)\n", headerLine,(int)(to-from));
393 return 0;
394 }
395
396 from = to;
397 }
398 return hline;
399 }
400
401
402 // Must be of an existing type, all tags must be recognised and all required tags must be present
sam_header_line_validate(HeaderLine * hline)403 static int sam_header_line_validate(HeaderLine *hline)
404 {
405 list_t *tags;
406 HeaderTag *tag;
407 int itype, itag;
408
409 // Is the type correct?
410 itype = tag_exists(hline->type, types);
411 if ( itype==-1 )
412 {
413 debug("The type [%c%c] not recognised.\n", hline->type[0],hline->type[1]);
414 return 0;
415 }
416
417 // Has all required tags?
418 itag = 0;
419 while ( required_tags[itype] && required_tags[itype][itag] )
420 {
421 if ( !header_line_has_tag(hline,required_tags[itype][itag]) )
422 {
423 debug("The tag [%c%c] required for [%c%c] not present.\n", required_tags[itype][itag][0],required_tags[itype][itag][1],
424 hline->type[0],hline->type[1]);
425 return 0;
426 }
427 itag++;
428 }
429
430 // Are all tags recognised?
431 tags = hline->tags;
432 while ( tags )
433 {
434 tag = tags->data;
435 if ( !tag_exists(tag->key,required_tags[itype]) && !tag_exists(tag->key,optional_tags[itype]) )
436 {
437 debug("Unknown tag [%c%c] for [%c%c].\n", tag->key[0],tag->key[1], hline->type[0],hline->type[1]);
438 return 0;
439 }
440 tags = tags->next;
441 }
442
443 return 1;
444 }
445
446
print_header_line(FILE * fp,HeaderLine * hline)447 static void print_header_line(FILE *fp, HeaderLine *hline)
448 {
449 list_t *tags = hline->tags;
450 HeaderTag *tag;
451
452 fprintf(fp, "@%c%c", hline->type[0],hline->type[1]);
453 while (tags)
454 {
455 tag = tags->data;
456
457 fprintf(fp, "\t");
458 if ( tag->key[0]!=' ' || tag->key[1]!=' ' )
459 fprintf(fp, "%c%c:", tag->key[0],tag->key[1]);
460 fprintf(fp, "%s", tag->value);
461
462 tags = tags->next;
463 }
464 fprintf(fp,"\n");
465 }
466
467
sam_header_line_free(HeaderLine * hline)468 static void sam_header_line_free(HeaderLine *hline)
469 {
470 list_t *tags = hline->tags;
471 while (tags)
472 {
473 HeaderTag *tag = tags->data;
474 free(tag->value);
475 free(tag);
476 tags = tags->next;
477 }
478 list_free(hline->tags);
479 free(hline);
480 }
481
sam_header_free(void * _header)482 void sam_header_free(void *_header)
483 {
484 HeaderDict *header = (HeaderDict*)_header;
485 list_t *hlines = header;
486 while (hlines)
487 {
488 sam_header_line_free(hlines->data);
489 hlines = hlines->next;
490 }
491 list_free(header);
492 }
493
sam_header_clone(const HeaderDict * dict)494 HeaderDict *sam_header_clone(const HeaderDict *dict)
495 {
496 HeaderDict *out = NULL;
497 while (dict)
498 {
499 HeaderLine *hline = dict->data;
500 out = list_append(out, sam_header_line_clone(hline));
501 dict = dict->next;
502 }
503 return out;
504 }
505
506 // Returns a newly allocated string
sam_header_write(const void * _header)507 char *sam_header_write(const void *_header)
508 {
509 const HeaderDict *header = (const HeaderDict*)_header;
510 char *out = NULL;
511 int len=0, nout=0;
512 const list_t *hlines;
513
514 // Calculate the length of the string to allocate
515 hlines = header;
516 while (hlines)
517 {
518 len += 4; // @XY and \n
519
520 HeaderLine *hline = hlines->data;
521 list_t *tags = hline->tags;
522 while (tags)
523 {
524 HeaderTag *tag = tags->data;
525 len += strlen(tag->value) + 1; // \t
526 if ( tag->key[0]!=' ' || tag->key[1]!=' ' )
527 len += strlen(tag->value) + 3; // XY:
528 tags = tags->next;
529 }
530 hlines = hlines->next;
531 }
532
533 nout = 0;
534 out = malloc(len+1);
535 hlines = header;
536 while (hlines)
537 {
538 HeaderLine *hline = hlines->data;
539
540 nout += sprintf(out+nout,"@%c%c",hline->type[0],hline->type[1]);
541
542 list_t *tags = hline->tags;
543 while (tags)
544 {
545 HeaderTag *tag = tags->data;
546 nout += sprintf(out+nout,"\t");
547 if ( tag->key[0]!=' ' || tag->key[1]!=' ' )
548 nout += sprintf(out+nout,"%c%c:", tag->key[0],tag->key[1]);
549 nout += sprintf(out+nout,"%s", tag->value);
550 tags = tags->next;
551 }
552 hlines = hlines->next;
553 nout += sprintf(out+nout,"\n");
554 }
555 out[len] = 0;
556 return out;
557 }
558
sam_header_parse2(const char * headerText)559 void *sam_header_parse2(const char *headerText)
560 {
561 list_t *hlines = NULL;
562 HeaderLine *hline;
563 const char *text;
564 char *buf=NULL;
565 size_t nbuf = 0;
566
567 if ( !headerText )
568 return 0;
569
570 text = headerText;
571 while ( (text=nextline(&buf, &nbuf, text)) )
572 {
573 hline = sam_header_line_parse(buf);
574 if ( hline && sam_header_line_validate(hline) )
575 // With too many (~250,000) reference sequences the header parsing was too slow with list_append.
576 hlines = list_append_to_end(hlines, hline);
577 else
578 {
579 if (hline) sam_header_line_free(hline);
580 sam_header_free(hlines);
581 if ( buf ) free(buf);
582 return NULL;
583 }
584 }
585 if ( buf ) free(buf);
586
587 return hlines;
588 }
589
sam_header2tbl(const void * _dict,char type[2],char key_tag[2],char value_tag[2])590 void *sam_header2tbl(const void *_dict, char type[2], char key_tag[2], char value_tag[2])
591 {
592 const HeaderDict *dict = (const HeaderDict*)_dict;
593 const list_t *l = dict;
594 khash_t(str) *tbl = kh_init(str);
595 khiter_t k;
596 int ret;
597
598 if (_dict == 0) return tbl; // return an empty (not null) hash table
599 while (l)
600 {
601 HeaderLine *hline = l->data;
602 if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] )
603 {
604 l = l->next;
605 continue;
606 }
607
608 HeaderTag *key, *value;
609 key = header_line_has_tag(hline,key_tag);
610 value = header_line_has_tag(hline,value_tag);
611 if ( !key || !value )
612 {
613 l = l->next;
614 continue;
615 }
616
617 k = kh_get(str, tbl, key->value);
618 if ( k != kh_end(tbl) )
619 debug("[sam_header_lookup_table] They key %s not unique.\n", key->value);
620 k = kh_put(str, tbl, key->value, &ret);
621 kh_value(tbl, k) = value->value;
622
623 l = l->next;
624 }
625 return tbl;
626 }
627
sam_header2list(const void * _dict,char type[2],char key_tag[2],int * _n)628 char **sam_header2list(const void *_dict, char type[2], char key_tag[2], int *_n)
629 {
630 const HeaderDict *dict = (const HeaderDict*)_dict;
631 const list_t *l = dict;
632 int max, n;
633 char **ret;
634
635 ret = 0; *_n = max = n = 0;
636 while (l)
637 {
638 HeaderLine *hline = l->data;
639 if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] )
640 {
641 l = l->next;
642 continue;
643 }
644
645 HeaderTag *key;
646 key = header_line_has_tag(hline,key_tag);
647 if ( !key )
648 {
649 l = l->next;
650 continue;
651 }
652
653 if (n == max) {
654 max = max? max<<1 : 4;
655 ret = realloc(ret, max * sizeof(void*));
656 }
657 ret[n++] = key->value;
658
659 l = l->next;
660 }
661 *_n = n;
662 return ret;
663 }
664
sam_tbl_get(void * h,const char * key)665 const char *sam_tbl_get(void *h, const char *key)
666 {
667 khash_t(str) *tbl = (khash_t(str)*)h;
668 khint_t k;
669 k = kh_get(str, tbl, key);
670 return k == kh_end(tbl)? 0 : kh_val(tbl, k);
671 }
672
sam_tbl_size(void * h)673 int sam_tbl_size(void *h)
674 {
675 khash_t(str) *tbl = (khash_t(str)*)h;
676 return h? kh_size(tbl) : 0;
677 }
678
sam_tbl_destroy(void * h)679 void sam_tbl_destroy(void *h)
680 {
681 khash_t(str) *tbl = (khash_t(str)*)h;
682 kh_destroy(str, tbl);
683 }
684
sam_header_merge(int n,const void ** _dicts)685 void *sam_header_merge(int n, const void **_dicts)
686 {
687 const HeaderDict **dicts = (const HeaderDict**)_dicts;
688 HeaderDict *out_dict;
689 int idict, status;
690
691 if ( n<2 ) return NULL;
692
693 out_dict = sam_header_clone(dicts[0]);
694
695 for (idict=1; idict<n; idict++)
696 {
697 const list_t *tmpl_hlines = dicts[idict];
698
699 while ( tmpl_hlines )
700 {
701 list_t *out_hlines = out_dict;
702 int inserted = 0;
703 while ( out_hlines )
704 {
705 status = sam_header_compare_lines(tmpl_hlines->data, out_hlines->data);
706 if ( status==0 )
707 {
708 out_hlines = out_hlines->next;
709 continue;
710 }
711
712 if ( status==2 )
713 {
714 print_header_line(stderr,tmpl_hlines->data);
715 print_header_line(stderr,out_hlines->data);
716 debug("Conflicting lines, cannot merge the headers.\n");
717 return 0;
718 }
719 if ( status==3 )
720 sam_header_line_merge_with(out_hlines->data, tmpl_hlines->data);
721
722 inserted = 1;
723 break;
724 }
725 if ( !inserted )
726 out_dict = list_append(out_dict, sam_header_line_clone(tmpl_hlines->data));
727
728 tmpl_hlines = tmpl_hlines->next;
729 }
730 }
731
732 return out_dict;
733 }
734
735
736