1 #include "sam_header.h"
2 #include <stdio.h>
3 #include <string.h>
4 #include <ctype.h>
5 #include <stdlib.h>
6 #include <stdarg.h>
7 
8 #include "khash.h"
9 KHASH_MAP_INIT_STR(str, const char *)
10 
11 struct _HeaderList
12 {
13     struct _HeaderList *last;   // Hack: Used and maintained only by list_append_to_end. Maintained in the root node only.
14     struct _HeaderList *next;
15     void *data;
16 };
17 typedef struct _HeaderList list_t;
18 typedef list_t HeaderDict;
19 
20 typedef struct
21 {
22     char key[2];
23     char *value;
24 }
25 HeaderTag;
26 
27 typedef struct
28 {
29     char type[2];
30     list_t *tags;
31 }
32 HeaderLine;
33 
34 const char *o_hd_tags[] = {"SO","GO",NULL};
35 const char *r_hd_tags[] = {"VN",NULL};
36 
37 const char *o_sq_tags[] = {"AS","M5","UR","SP",NULL};
38 const char *r_sq_tags[] = {"SN","LN",NULL};
39 const char *u_sq_tags[] = {"SN",NULL};
40 
41 const char *o_rg_tags[] = {"LB","DS","PU","PI","CN","DT","PL",NULL};
42 const char *r_rg_tags[] = {"ID",NULL};
43 const char *u_rg_tags[] = {"ID",NULL};
44 
45 const char *o_pg_tags[] = {"VN","CL",NULL};
46 const char *r_pg_tags[] = {"ID",NULL};
47 
48 const char *types[]          = {"HD","SQ","RG","PG","CO",NULL};
49 const char **optional_tags[] = {o_hd_tags,o_sq_tags,o_rg_tags,o_pg_tags,NULL,NULL};
50 const char **required_tags[] = {r_hd_tags,r_sq_tags,r_rg_tags,r_pg_tags,NULL,NULL};
51 const char **unique_tags[]   = {NULL,     u_sq_tags,u_rg_tags,NULL,NULL,NULL};
52 
53 
debug(const char * format,...)54 static void debug(const char *format, ...)
55 {
56     va_list ap;
57     va_start(ap, format);
58     vfprintf(stderr, format, ap);
59     va_end(ap);
60 }
61 
62 #if 0
63 // Replaced by list_append_to_end
64 static list_t *list_prepend(list_t *root, void *data)
65 {
66     list_t *l = malloc(sizeof(list_t));
67     l->next = root;
68     l->data = data;
69     return l;
70 }
71 #endif
72 
73 // Relies on the root->last being correct. Do not use with the other list_*
74 //  routines unless they are fixed to modify root->last as well.
list_append_to_end(list_t * root,void * data)75 static list_t *list_append_to_end(list_t *root, void *data)
76 {
77     list_t *l = malloc(sizeof(list_t));
78     l->last = l;
79     l->next = NULL;
80     l->data = data;
81 
82     if ( !root )
83         return l;
84 
85     root->last->next = l;
86     root->last = l;
87     return root;
88 }
89 
list_append(list_t * root,void * data)90 static list_t *list_append(list_t *root, void *data)
91 {
92     list_t *l = root;
93     while (l && l->next)
94         l = l->next;
95     if ( l )
96     {
97         l->next = malloc(sizeof(list_t));
98         l = l->next;
99     }
100     else
101     {
102         l = malloc(sizeof(list_t));
103         root = l;
104     }
105     l->data = data;
106     l->next = NULL;
107     return root;
108 }
109 
list_free(list_t * root)110 static void list_free(list_t *root)
111 {
112     list_t *l = root;
113     while (root)
114     {
115         l = root;
116         root = root->next;
117         free(l);
118     }
119 }
120 
121 
122 
123 // Look for a tag "XY" in a predefined const char *[] array.
tag_exists(const char * tag,const char ** tags)124 static int tag_exists(const char *tag, const char **tags)
125 {
126     int itag=0;
127     if ( !tags ) return -1;
128     while ( tags[itag] )
129     {
130         if ( tags[itag][0]==tag[0] && tags[itag][1]==tag[1] ) return itag;
131         itag++;
132     }
133     return -1;
134 }
135 
136 
137 
138 // Mimics the behaviour of getline, except it returns pointer to the next chunk of the text
139 //  or NULL if everything has been read. The lineptr should be freed by the caller. The
140 //  newline character is stripped.
nextline(char ** lineptr,size_t * n,const char * text)141 static const char *nextline(char **lineptr, size_t *n, const char *text)
142 {
143     int len;
144     const char *to = text;
145 
146     if ( !*to ) return NULL;
147 
148     while ( *to && *to!='\n' && *to!='\r' ) to++;
149     len = to - text + 1;
150 
151     if ( *to )
152     {
153         // Advance the pointer for the next call
154         if ( *to=='\n' ) to++;
155         else if ( *to=='\r' && *(to+1)=='\n' ) to+=2;
156     }
157     if ( !len )
158         return to;
159 
160     if ( !*lineptr )
161     {
162         *lineptr = malloc(len);
163         *n = len;
164     }
165     else if ( *n<len )
166     {
167         *lineptr = realloc(*lineptr, len);
168         *n = len;
169     }
170     if ( !*lineptr ) {
171 		debug("[nextline] Insufficient memory!\n");
172 		return 0;
173 	}
174 
175     memcpy(*lineptr,text,len);
176     (*lineptr)[len-1] = 0;
177 
178     return to;
179 }
180 
181 // name points to "XY", value_from points to the first character of the value string and
182 //  value_to points to the last character of the value string.
new_tag(const char * name,const char * value_from,const char * value_to)183 static HeaderTag *new_tag(const char *name, const char *value_from, const char *value_to)
184 {
185     HeaderTag *tag = malloc(sizeof(HeaderTag));
186     int len = value_to-value_from+1;
187 
188     tag->key[0] = name[0];
189     tag->key[1] = name[1];
190     tag->value = malloc(len+1);
191     memcpy(tag->value,value_from,len+1);
192     tag->value[len] = 0;
193     return tag;
194 }
195 
header_line_has_tag(HeaderLine * hline,const char * key)196 static HeaderTag *header_line_has_tag(HeaderLine *hline, const char *key)
197 {
198     list_t *tags = hline->tags;
199     while (tags)
200     {
201         HeaderTag *tag = tags->data;
202         if ( tag->key[0]==key[0] && tag->key[1]==key[1] ) return tag;
203         tags = tags->next;
204     }
205     return NULL;
206 }
207 
208 
209 // Return codes:
210 //   0 .. different types or unique tags differ or conflicting tags, cannot be merged
211 //   1 .. all tags identical -> no need to merge, drop one
212 //   2 .. the unique tags match and there are some conflicting tags (same tag, different value) -> error, cannot be merged nor duplicated
213 //   3 .. there are some missing complementary tags and no unique conflict -> can be merged into a single line
sam_header_compare_lines(HeaderLine * hline1,HeaderLine * hline2)214 static int sam_header_compare_lines(HeaderLine *hline1, HeaderLine *hline2)
215 {
216     HeaderTag *t1, *t2;
217 
218     if ( hline1->type[0]!=hline2->type[0] || hline1->type[1]!=hline2->type[1] )
219         return 0;
220 
221     int itype = tag_exists(hline1->type,types);
222     if ( itype==-1 ) {
223 		debug("[sam_header_compare_lines] Unknown type [%c%c]\n", hline1->type[0],hline1->type[1]);
224 		return -1; // FIXME (lh3): error; I do not know how this will be handled in Petr's code
225 	}
226 
227     if ( unique_tags[itype] )
228     {
229         t1 = header_line_has_tag(hline1,unique_tags[itype][0]);
230         t2 = header_line_has_tag(hline2,unique_tags[itype][0]);
231         if ( !t1 || !t2 ) // this should never happen, the unique tags are required
232             return 2;
233 
234         if ( strcmp(t1->value,t2->value) )
235             return 0;   // the unique tags differ, cannot be merged
236     }
237     if ( !required_tags[itype] && !optional_tags[itype] )
238     {
239         t1 = hline1->tags->data;
240         t2 = hline2->tags->data;
241         if ( !strcmp(t1->value,t2->value) ) return 1; // identical comments
242         return 0;
243     }
244 
245     int missing=0, itag=0;
246     while ( required_tags[itype] && required_tags[itype][itag] )
247     {
248         t1 = header_line_has_tag(hline1,required_tags[itype][itag]);
249         t2 = header_line_has_tag(hline2,required_tags[itype][itag]);
250         if ( !t1 && !t2 )
251             return 2;       // this should never happen
252         else if ( !t1 || !t2 )
253             missing = 1;    // there is some tag missing in one of the hlines
254         else if ( strcmp(t1->value,t2->value) )
255         {
256             if ( unique_tags[itype] )
257                 return 2;   // the lines have a matching unique tag but have a conflicting tag
258 
259             return 0;    // the lines contain conflicting tags, cannot be merged
260         }
261         itag++;
262     }
263     itag = 0;
264     while ( optional_tags[itype] && optional_tags[itype][itag] )
265     {
266         t1 = header_line_has_tag(hline1,optional_tags[itype][itag]);
267         t2 = header_line_has_tag(hline2,optional_tags[itype][itag]);
268         if ( !t1 && !t2 )
269         {
270             itag++;
271             continue;
272         }
273         if ( !t1 || !t2 )
274             missing = 1;    // there is some tag missing in one of the hlines
275         else if ( strcmp(t1->value,t2->value) )
276         {
277             if ( unique_tags[itype] )
278                 return 2;   // the lines have a matching unique tag but have a conflicting tag
279 
280             return 0;   // the lines contain conflicting tags, cannot be merged
281         }
282         itag++;
283     }
284     if ( missing ) return 3;    // there are some missing complementary tags with no conflicts, can be merged
285     return 1;
286 }
287 
288 
sam_header_line_clone(const HeaderLine * hline)289 static HeaderLine *sam_header_line_clone(const HeaderLine *hline)
290 {
291     list_t *tags;
292     HeaderLine *out = malloc(sizeof(HeaderLine));
293     out->type[0] = hline->type[0];
294     out->type[1] = hline->type[1];
295     out->tags = NULL;
296 
297     tags = hline->tags;
298     while (tags)
299     {
300         HeaderTag *old = tags->data;
301 
302         HeaderTag *new = malloc(sizeof(HeaderTag));
303         new->key[0] = old->key[0];
304         new->key[1] = old->key[1];
305         new->value  = strdup(old->value);
306         out->tags = list_append(out->tags, new);
307 
308         tags = tags->next;
309     }
310     return out;
311 }
312 
sam_header_line_merge_with(HeaderLine * out_hline,const HeaderLine * tmpl_hline)313 static int sam_header_line_merge_with(HeaderLine *out_hline, const HeaderLine *tmpl_hline)
314 {
315     list_t *tmpl_tags;
316 
317     if ( out_hline->type[0]!=tmpl_hline->type[0] || out_hline->type[1]!=tmpl_hline->type[1] )
318         return 0;
319 
320     tmpl_tags = tmpl_hline->tags;
321     while (tmpl_tags)
322     {
323         HeaderTag *tmpl_tag = tmpl_tags->data;
324         HeaderTag *out_tag  = header_line_has_tag(out_hline, tmpl_tag->key);
325         if ( !out_tag )
326         {
327             HeaderTag *tag = malloc(sizeof(HeaderTag));
328             tag->key[0] = tmpl_tag->key[0];
329             tag->key[1] = tmpl_tag->key[1];
330             tag->value  = strdup(tmpl_tag->value);
331             out_hline->tags = list_append(out_hline->tags,tag);
332         }
333         tmpl_tags = tmpl_tags->next;
334     }
335     return 1;
336 }
337 
338 
sam_header_line_parse(const char * headerLine)339 static HeaderLine *sam_header_line_parse(const char *headerLine)
340 {
341     HeaderLine *hline;
342     HeaderTag *tag;
343     const char *from, *to;
344     from = headerLine;
345 
346     if ( *from != '@' ) {
347 		debug("[sam_header_line_parse] expected '@', got [%s]\n", headerLine);
348 		return 0;
349 	}
350     to = ++from;
351 
352     while (*to && *to!='\t') to++;
353     if ( to-from != 2 ) {
354 		debug("[sam_header_line_parse] expected '@XY', got [%s]\nHint: The header tags must be tab-separated.\n", headerLine);
355 		return 0;
356 	}
357 
358     hline = malloc(sizeof(HeaderLine));
359     hline->type[0] = from[0];
360     hline->type[1] = from[1];
361     hline->tags = NULL;
362 
363     int itype = tag_exists(hline->type, types);
364 
365     from = to;
366     while (*to && *to=='\t') to++;
367     if ( to-from != 1 ) {
368         debug("[sam_header_line_parse] multiple tabs on line [%s] (%d)\n", headerLine,(int)(to-from));
369 		return 0;
370 	}
371     from = to;
372     while (*from)
373     {
374         while (*to && *to!='\t') to++;
375 
376         if ( !required_tags[itype] && !optional_tags[itype] )
377         {
378             // CO is a special case, it can contain anything, including tabs
379             if ( *to ) { to++; continue; }
380             tag = new_tag("  ",from,to-1);
381         }
382         else
383             tag = new_tag(from,from+3,to-1);
384 
385         if ( header_line_has_tag(hline,tag->key) )
386                 debug("The tag '%c%c' present (at least) twice on line [%s]\n", tag->key[0],tag->key[1], headerLine);
387         hline->tags = list_append(hline->tags, tag);
388 
389         from = to;
390         while (*to && *to=='\t') to++;
391         if ( *to && to-from != 1 ) {
392 			debug("[sam_header_line_parse] multiple tabs on line [%s] (%d)\n", headerLine,(int)(to-from));
393 			return 0;
394 		}
395 
396         from = to;
397     }
398     return hline;
399 }
400 
401 
402 // Must be of an existing type, all tags must be recognised and all required tags must be present
sam_header_line_validate(HeaderLine * hline)403 static int sam_header_line_validate(HeaderLine *hline)
404 {
405     list_t *tags;
406     HeaderTag *tag;
407     int itype, itag;
408 
409     // Is the type correct?
410     itype = tag_exists(hline->type, types);
411     if ( itype==-1 )
412     {
413         debug("The type [%c%c] not recognised.\n", hline->type[0],hline->type[1]);
414         return 0;
415     }
416 
417     // Has all required tags?
418     itag = 0;
419     while ( required_tags[itype] && required_tags[itype][itag] )
420     {
421         if ( !header_line_has_tag(hline,required_tags[itype][itag]) )
422         {
423             debug("The tag [%c%c] required for [%c%c] not present.\n", required_tags[itype][itag][0],required_tags[itype][itag][1],
424                 hline->type[0],hline->type[1]);
425             return 0;
426         }
427         itag++;
428     }
429 
430     // Are all tags recognised?
431     tags = hline->tags;
432     while ( tags )
433     {
434         tag = tags->data;
435         if ( !tag_exists(tag->key,required_tags[itype]) && !tag_exists(tag->key,optional_tags[itype]) )
436         {
437             debug("Unknown tag [%c%c] for [%c%c].\n", tag->key[0],tag->key[1], hline->type[0],hline->type[1]);
438             return 0;
439         }
440         tags = tags->next;
441     }
442 
443     return 1;
444 }
445 
446 
print_header_line(FILE * fp,HeaderLine * hline)447 static void print_header_line(FILE *fp, HeaderLine *hline)
448 {
449     list_t *tags = hline->tags;
450     HeaderTag *tag;
451 
452     fprintf(fp, "@%c%c", hline->type[0],hline->type[1]);
453     while (tags)
454     {
455         tag = tags->data;
456 
457         fprintf(fp, "\t");
458         if ( tag->key[0]!=' ' || tag->key[1]!=' ' )
459             fprintf(fp, "%c%c:", tag->key[0],tag->key[1]);
460         fprintf(fp, "%s", tag->value);
461 
462         tags = tags->next;
463     }
464     fprintf(fp,"\n");
465 }
466 
467 
sam_header_line_free(HeaderLine * hline)468 static void sam_header_line_free(HeaderLine *hline)
469 {
470     list_t *tags = hline->tags;
471     while (tags)
472     {
473         HeaderTag *tag = tags->data;
474         free(tag->value);
475         free(tag);
476         tags = tags->next;
477     }
478     list_free(hline->tags);
479     free(hline);
480 }
481 
sam_header_free(void * _header)482 void sam_header_free(void *_header)
483 {
484 	HeaderDict *header = (HeaderDict*)_header;
485     list_t *hlines = header;
486     while (hlines)
487     {
488         sam_header_line_free(hlines->data);
489         hlines = hlines->next;
490     }
491     list_free(header);
492 }
493 
sam_header_clone(const HeaderDict * dict)494 HeaderDict *sam_header_clone(const HeaderDict *dict)
495 {
496     HeaderDict *out = NULL;
497     while (dict)
498     {
499         HeaderLine *hline = dict->data;
500         out = list_append(out, sam_header_line_clone(hline));
501         dict = dict->next;
502     }
503     return out;
504 }
505 
506 // Returns a newly allocated string
sam_header_write(const void * _header)507 char *sam_header_write(const void *_header)
508 {
509 	const HeaderDict *header = (const HeaderDict*)_header;
510     char *out = NULL;
511     int len=0, nout=0;
512     const list_t *hlines;
513 
514     // Calculate the length of the string to allocate
515     hlines = header;
516     while (hlines)
517     {
518         len += 4;   // @XY and \n
519 
520         HeaderLine *hline = hlines->data;
521         list_t *tags = hline->tags;
522         while (tags)
523         {
524             HeaderTag *tag = tags->data;
525             len += strlen(tag->value) + 1;                  // \t
526             if ( tag->key[0]!=' ' || tag->key[1]!=' ' )
527                 len += strlen(tag->value) + 3;              // XY:
528             tags = tags->next;
529         }
530         hlines = hlines->next;
531     }
532 
533     nout = 0;
534     out  = malloc(len+1);
535     hlines = header;
536     while (hlines)
537     {
538         HeaderLine *hline = hlines->data;
539 
540         nout += sprintf(out+nout,"@%c%c",hline->type[0],hline->type[1]);
541 
542         list_t *tags = hline->tags;
543         while (tags)
544         {
545             HeaderTag *tag = tags->data;
546             nout += sprintf(out+nout,"\t");
547             if ( tag->key[0]!=' ' || tag->key[1]!=' ' )
548                 nout += sprintf(out+nout,"%c%c:", tag->key[0],tag->key[1]);
549             nout += sprintf(out+nout,"%s", tag->value);
550             tags = tags->next;
551         }
552         hlines = hlines->next;
553         nout += sprintf(out+nout,"\n");
554     }
555     out[len] = 0;
556     return out;
557 }
558 
sam_header_parse2(const char * headerText)559 void *sam_header_parse2(const char *headerText)
560 {
561     list_t *hlines = NULL;
562     HeaderLine *hline;
563     const char *text;
564     char *buf=NULL;
565     size_t nbuf = 0;
566 
567     if ( !headerText )
568 		return 0;
569 
570     text = headerText;
571     while ( (text=nextline(&buf, &nbuf, text)) )
572     {
573         hline = sam_header_line_parse(buf);
574         if ( hline && sam_header_line_validate(hline) )
575             // With too many (~250,000) reference sequences the header parsing was too slow with list_append.
576             hlines = list_append_to_end(hlines, hline);
577         else
578         {
579 			if (hline) sam_header_line_free(hline);
580 			sam_header_free(hlines);
581             if ( buf ) free(buf);
582             return NULL;
583         }
584     }
585     if ( buf ) free(buf);
586 
587     return hlines;
588 }
589 
sam_header2tbl(const void * _dict,char type[2],char key_tag[2],char value_tag[2])590 void *sam_header2tbl(const void *_dict, char type[2], char key_tag[2], char value_tag[2])
591 {
592 	const HeaderDict *dict = (const HeaderDict*)_dict;
593     const list_t *l   = dict;
594     khash_t(str) *tbl = kh_init(str);
595     khiter_t k;
596     int ret;
597 
598 	if (_dict == 0) return tbl; // return an empty (not null) hash table
599     while (l)
600     {
601         HeaderLine *hline = l->data;
602         if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] )
603         {
604             l = l->next;
605             continue;
606         }
607 
608         HeaderTag *key, *value;
609         key   = header_line_has_tag(hline,key_tag);
610         value = header_line_has_tag(hline,value_tag);
611         if ( !key || !value )
612         {
613             l = l->next;
614             continue;
615         }
616 
617         k = kh_get(str, tbl, key->value);
618         if ( k != kh_end(tbl) )
619             debug("[sam_header_lookup_table] They key %s not unique.\n", key->value);
620         k = kh_put(str, tbl, key->value, &ret);
621         kh_value(tbl, k) = value->value;
622 
623         l = l->next;
624     }
625     return tbl;
626 }
627 
sam_header2list(const void * _dict,char type[2],char key_tag[2],int * _n)628 char **sam_header2list(const void *_dict, char type[2], char key_tag[2], int *_n)
629 {
630 	const HeaderDict *dict = (const HeaderDict*)_dict;
631     const list_t *l   = dict;
632     int max, n;
633 	char **ret;
634 
635 	ret = 0; *_n = max = n = 0;
636     while (l)
637     {
638         HeaderLine *hline = l->data;
639         if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] )
640         {
641             l = l->next;
642             continue;
643         }
644 
645         HeaderTag *key;
646         key   = header_line_has_tag(hline,key_tag);
647         if ( !key )
648         {
649             l = l->next;
650             continue;
651         }
652 
653 		if (n == max) {
654 			max = max? max<<1 : 4;
655 			ret = realloc(ret, max * sizeof(void*));
656 		}
657 		ret[n++] = key->value;
658 
659         l = l->next;
660     }
661 	*_n = n;
662     return ret;
663 }
664 
sam_tbl_get(void * h,const char * key)665 const char *sam_tbl_get(void *h, const char *key)
666 {
667 	khash_t(str) *tbl = (khash_t(str)*)h;
668 	khint_t k;
669 	k = kh_get(str, tbl, key);
670 	return k == kh_end(tbl)? 0 : kh_val(tbl, k);
671 }
672 
sam_tbl_size(void * h)673 int sam_tbl_size(void *h)
674 {
675 	khash_t(str) *tbl = (khash_t(str)*)h;
676 	return h? kh_size(tbl) : 0;
677 }
678 
sam_tbl_destroy(void * h)679 void sam_tbl_destroy(void *h)
680 {
681 	khash_t(str) *tbl = (khash_t(str)*)h;
682 	kh_destroy(str, tbl);
683 }
684 
sam_header_merge(int n,const void ** _dicts)685 void *sam_header_merge(int n, const void **_dicts)
686 {
687 	const HeaderDict **dicts = (const HeaderDict**)_dicts;
688     HeaderDict *out_dict;
689     int idict, status;
690 
691     if ( n<2 ) return NULL;
692 
693     out_dict = sam_header_clone(dicts[0]);
694 
695     for (idict=1; idict<n; idict++)
696     {
697         const list_t *tmpl_hlines = dicts[idict];
698 
699         while ( tmpl_hlines )
700         {
701             list_t *out_hlines = out_dict;
702             int inserted = 0;
703             while ( out_hlines )
704             {
705                 status = sam_header_compare_lines(tmpl_hlines->data, out_hlines->data);
706                 if ( status==0 )
707                 {
708                     out_hlines = out_hlines->next;
709                     continue;
710                 }
711 
712                 if ( status==2 )
713                 {
714                     print_header_line(stderr,tmpl_hlines->data);
715                     print_header_line(stderr,out_hlines->data);
716                     debug("Conflicting lines, cannot merge the headers.\n");
717 					return 0;
718                 }
719                 if ( status==3 )
720                     sam_header_line_merge_with(out_hlines->data, tmpl_hlines->data);
721 
722                 inserted = 1;
723                 break;
724             }
725             if ( !inserted )
726                 out_dict = list_append(out_dict, sam_header_line_clone(tmpl_hlines->data));
727 
728             tmpl_hlines = tmpl_hlines->next;
729         }
730     }
731 
732     return out_dict;
733 }
734 
735 
736