1 /*
2  * Copyright (c) 2013 Genome Research Ltd.
3  * Author(s): James Bonfield, Rob Davies
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *
8  *    1. Redistributions of source code must retain the above copyright notice,
9  *       this list of conditions and the following disclaimer.
10  *
11  *    2. Redistributions in binary form must reproduce the above
12  *       copyright notice, this list of conditions and the following
13  *       disclaimer in the documentation and/or other materials provided
14  *       with the distribution.
15  *
16  *    3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
17  *    Institute nor the names of its contributors may be used to endorse
18  *    or promote products derived from this software without specific
19  *    prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS
22  * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
24  * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH
25  * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 /*
35  * Author: James Bonfield, Wellcome Trust Sanger Institute. 2013
36  */
37 
38 /*! \file
39  * SAM header parsing.
40  *
41  * These functions can be shared between SAM, BAM and CRAM file
42  * formats as all three internally use the same string encoding for
43  * header fields.
44  *
45  * Consider using the scram() generic API and calling
46  * scram_get_header() to obtain the format-specific pointer to the
47  * SAM_hdr struct.
48  */
49 
50 /*
51  * TODO.
52  *
53  * - Sort order (parse to struct, enum type, updating funcs)
54  * - Removal of lines.
55  * - Updating of lines
56  */
57 
58 #ifndef _SAM_HDR_H_
59 #define _SAM_HDR_H_
60 
61 #ifdef __cplusplus
62 extern "C" {
63 #endif
64 
65 #ifdef HAVE_CONFIG_H
66 #include "io_lib_config.h"
67 #endif
68 
69 #include <stdarg.h>
70 
71 #include "io_lib/dstring.h"
72 #include "io_lib/hash_table.h"
73 #include "io_lib/string_alloc.h"
74 
75 /*
76  * Proposed new SAM header parsing
77 
78 1 @SQ ID:foo LN:100
79 2 @SQ ID:bar LN:200
80 3 @SQ ID:ram LN:300 UR:xyz
81 4 @RG ID:r ...
82 5 @RG ID:s ...
83 
84 Hash table for 2-char keys without dup entries.
85 If dup lines, we form a circular linked list. Ie hash keys = {RG, SQ}.
86 
87 HASH("SQ")--\
88             |
89     (3) <-> 1 <-> 2 <-> 3 <-> (1)
90 
91 HASH("RG")--\
92             |
93     (5) <-> 4 <-> 5 <-> (4)
94 
95 Items stored in the hash values also form their own linked lists:
96 Ie SQ->ID(foo)->LN(100)
97    SQ->ID(bar)->LN(200)
98    SQ->ID(ram)->LN(300)->UR(xyz)
99    RG->ID(r)
100  */
101 
102 /*! A single key:value pair on a header line
103  *
104  * These form a linked list and hold strings. The strings are
105  * allocated from a string_alloc_t pool refeenced in the master
106  * SAM_hdr structure. Do not attempt to free, malloc or manipulate
107  * these strings directly.
108  */
109 typedef struct SAM_hdr_tag_s {
110     struct SAM_hdr_tag_s *next;
111     char *str;
112     int   len;
113 } SAM_hdr_tag;
114 
115 /*! The parsed version of the SAM header string.
116  *
117  * Each header type (SQ, RG, HD, etc) points to its own SAM_hdr_type
118  * struct via the main HashTable h in the SAM_hdr struct.
119  *
120  * These in turn consist of circular bi-directional linked lists (ie
121  * rings) to hold the multiple instances of the same header type
122  * code. For example if we have 5 \@SQ lines the primary hash table
123  * will key on \@SQ pointing to the first SAM_hdr_type and that in turn
124  * will be part of a ring of 5 elements.
125  *
126  * For each SAM_hdr_type structure we also point to a SAM_hdr_tag
127  * structure which holds the tokenised attributes; the tab separated
128  * key:value pairs per line.
129  */
130 typedef struct SAM_hdr_item_s {
131     struct SAM_hdr_item_s *next; // cirular
132     struct SAM_hdr_item_s *prev;
133     SAM_hdr_tag *tag;            // first tag
134     int order;                   // 0 upwards
135 } SAM_hdr_type;
136 
137 /*! Parsed \@SQ lines */
138 typedef struct {
139     char *name;
140     uint32_t len;
141     SAM_hdr_type *ty;
142     SAM_hdr_tag  *tag;
143 } SAM_SQ;
144 
145 /*! Parsed \@RG lines */
146 typedef struct {
147     char *name;
148     SAM_hdr_type *ty;
149     SAM_hdr_tag  *tag;
150     int name_len;
151     int id;           // numerical ID
152 } SAM_RG;
153 
154 /*! Parsed \@PG lines */
155 typedef struct {
156     char *name;
157     SAM_hdr_type *ty;
158     SAM_hdr_tag  *tag;
159     int name_len;
160     int id;           // numerical ID
161     int prev_id;      // -1 if none
162 } SAM_PG;
163 
164 /*! Sort order parsed from @HD line */
165 enum sam_sort_order {
166     ORDER_UNKNOWN  =-1,
167     ORDER_UNSORTED = 0,
168     ORDER_NAME     = 1,
169     ORDER_COORD    = 2,
170   //ORDER_COLLATE  = 3 // maybe one day!
171 };
172 
173 /*! Primary structure for header manipulation
174  *
175  * The initial header text is held in the text dstring_t, but is also
176  * parsed out into SQ, RG and PG arrays. These have a HashTable
177  * associated with each to allow lookup by ID or SN fields instead of
178  * their numeric array indices. Additionally PG has an array to hold
179  * the linked list start points (the last in a PP chain).
180  *
181  * Use the appropriate sam_hdr_* functions to edit the header, and
182  * call sam_hdr_rebuild() any time the textual form needs to be
183  * updated again.
184  */
185 typedef struct {
186     dstring_t *text;          //!< concatenated text, indexed by SAM_hdr_tag
187     HashTable *h;             //!< 2-char IDs, values are SAM_hdr_type
188     string_alloc_t *str_pool; //!< Pool of SAM_hdr_tag->str strings
189     pool_alloc_t   *type_pool;//!< Pool of SAM_hdr_type structs
190     pool_alloc_t   *tag_pool; //!< Pool of SAM_hdr_tag structs
191 
192     // @SQ lines / references
193     int nref;                 //!< Number of \@SQ lines
194     SAM_SQ *ref;              //!< Array of parsed \@SQ lines
195     HashTable *ref_hash;      //!< Hash table indexed by SN: field
196 
197     // @RG lines / read-groups
198     int nrg;                  //!< Number of \@RG lines
199     SAM_RG *rg;               //!< Array of parsed \@RG lines
200     HashTable *rg_hash;	      //!< Hash table indexed by ID: field
201 
202     // @PG lines / programs
203     int npg;                  //!< Number of \@PG lines
204     int npg_end;              //!< Number of terminating \@PG lines
205     int npg_end_alloc;        //!< Size of pg_end field
206     SAM_PG *pg;		      //!< Array of parsed \@PG lines
207     HashTable *pg_hash;	      //!< Hash table indexed by ID: field
208     int *pg_end;              //!< \@PG chain termination IDs
209 
210     // @HD data
211     enum sam_sort_order sort_order; //!< @HD SO: field
212 
213     // Order of first occurence of @?? lines.
214     dstring_t *type_order;
215     int ntypes;
216 
217     // @cond internal
218     char ID_buf[1024];  // temporary buffer
219     int ID_cnt;
220     int ref_count;      // number of uses of this SAM_hdr
221     // @endcond
222 } SAM_hdr;
223 
224 /*! Creates an empty SAM header, ready to be populated.
225  *
226  * @return
227  * Returns a SAM_hdr struct on success (free with sam_hdr_free())
228  *         NULL on failure
229  */
230 SAM_hdr *sam_hdr_new();
231 
232 /*! Tokenises a SAM header into a hash table.
233  *
234  * Also extracts a few bits on specific data types, such as @RG lines.
235  *
236  * @return
237  * Returns a SAM_hdr struct on success (free with sam_hdr_free());
238  *         NULL on failure
239  */
240 #ifdef SAMTOOLS
241 SAM_hdr *sam_hdr_parse_(const char *hdr, int len);
242 #else
243 SAM_hdr *sam_hdr_parse(const char *hdr, int len);
244 #endif
245 
246 
247 /*! Produces a duplicate copy of hdr and returns it.
248  * @return
249  * Returns NULL on failure
250  */
251 SAM_hdr *sam_hdr_dup(SAM_hdr *hdr);
252 
253 
254 /*! Increments a reference count on hdr.
255  *
256  * This permits multiple files to share the same header, all calling
257  * sam_hdr_free when done, without causing errors for other open  files.
258  */
259 void sam_hdr_incr_ref(SAM_hdr *hdr);
260 
261 
262 /*! Increments a reference count on hdr.
263  *
264  * This permits multiple files to share the same header, all calling
265  * sam_hdr_free when done, without causing errors for other open  files.
266  *
267  * If the reference count hits zero then the header is automatically
268  * freed. This makes it a synonym for sam_hdr_free().
269  */
270 void sam_hdr_decr_ref(SAM_hdr *hdr);
271 
272 
273 /*! Deallocates all storage used by a SAM_hdr struct.
274  *
275  * This also decrements the header reference count. If after decrementing
276  * it is still non-zero then the header is assumed to be in use by another
277  * caller and the free is not done.
278  *
279  * This is a synonym for sam_hdr_dec_ref().
280  */
281 void sam_hdr_free(SAM_hdr *hdr);
282 
283 /*! Returns the current length of the SAM_hdr in text form.
284  *
285  * Call sam_hdr_rebuild() first if editing has taken place.
286  */
287 int sam_hdr_length(SAM_hdr *hdr);
288 
289 /*! Returns the string form of the SAM_hdr.
290  *
291  * Call sam_hdr_rebuild() first if editing has taken place.
292  */
293 char *sam_hdr_str(SAM_hdr *hdr);
294 
295 /*! Appends a formatted line to an existing SAM header.
296  *
297  * Line is a full SAM header record, eg "@SQ\tSN:foo\tLN:100", with
298  * optional new-line. If it contains more than 1 line then multiple lines
299  * will be added in order.
300  *
301  * Input text is of maximum length len or as terminated earlier by a NUL.
302  * Len may be 0 if unknown, in which case lines must be NUL-terminated.
303  *
304  * @return
305  * Returns 0 on success;
306  *        -1 on failure
307  */
308 int sam_hdr_add_lines(SAM_hdr *sh, const char *lines, int len);
309 
310 /*! Adds a single line to a SAM header.
311  *
312  * Specify type and one or more key,value pairs, ending with the NULL key.
313  * Eg. sam_hdr_add(h, "SQ", "ID", "foo", "LN", "100", NULL).
314  *
315  * @return
316  * Returns 0 on success;
317  *        -1 on failure
318  */
319 int sam_hdr_add(SAM_hdr *sh, const char *type, ...);
320 
321 /*! Adds a single line to a SAM header.
322  *
323  * This is much like sam_hdr_add() but with the additional va_list
324  * argument. This is followed by specifying type and one or more
325  * key,value pairs, ending with the NULL key.
326  *
327  * Eg. sam_hdr_vadd(h, "SQ", args, "ID", "foo", "LN", "100", NULL).
328  *
329  * The purpose of the additional va_list parameter is to permit other
330  * varargs functions to call this while including their own additional
331  * parameters; an example is in sam_hdr_add_PG().
332  *
333  * @return
334  * Returns 0 on success;
335  *        -1 on failure
336  */
337 int sam_hdr_vadd(SAM_hdr *sh, const char *type, va_list ap, ...);
338 
339 /*!
340  * @return
341  * Returns the first header item matching 'type'. If ID is non-NULL it checks
342  * for the tag ID: and compares against the specified ID.
343  *
344  * Returns NULL if no type/ID is found
345  */
346 SAM_hdr_type *sam_hdr_find(SAM_hdr *hdr, char *type,
347 			   char *ID_key, char *ID_value);
348 
349 /*!
350  *
351  * As per SAM_hdr_type, but returns a complete line of formatted text
352  * for a specific head type/ID combination. If ID is NULL then it returns
353  * the first line of the specified type.
354  *
355  * The returned string is malloced and should be freed by the calling
356  * function with free().
357  *
358  * @return
359  * Returns NULL if no type/ID is found.
360  */
361 char *sam_hdr_find_line(SAM_hdr *hdr, char *type,
362 			char *ID_key, char *ID_value);
363 
364 /*! Looks for a specific key in a single sam header line.
365  *
366  * If prev is non-NULL it also fills this out with the previous tag, to
367  * permit use in key removal. *prev is set to NULL when the tag is the first
368  * key in the list. When a tag isn't found, prev (if non NULL) will be the last
369  * tag in the existing list.
370  *
371  * @return
372  * Returns the tag pointer on success;
373  *         NULL on failure
374  */
375 SAM_hdr_tag *sam_hdr_find_key(SAM_hdr *sh,
376 			      SAM_hdr_type *type,
377 			      char *key,
378 			      SAM_hdr_tag **prev);
379 
380 /*! Adds or updates tag key,value pairs in a header line.
381  *
382  * Eg for adding M5 tags to @SQ lines or updating sort order for the
383  * @HD line (although use the sam_hdr_sort_order() function for
384  * HD manipulation, which is a wrapper around this funuction).
385  *
386  * Specify multiple key,value pairs ending in NULL.
387  *
388  * @return
389  * Returns 0 on success;
390  *        -1 on failure
391  */
392 int sam_hdr_update(SAM_hdr *hdr, SAM_hdr_type *type, ...);
393 
394 /*! Returns the sort order from the @HD SO: field */
395 enum sam_sort_order sam_hdr_sort_order(SAM_hdr *hdr);
396 
397 /*! Reconstructs the dstring from the header hash table.
398  * @return
399  * Returns 0 on success;
400  *        -1 on failure
401  */
402 int sam_hdr_rebuild(SAM_hdr *hdr);
403 
404 /*! Looks up a reference sequence by name and returns the numerical ID.
405  * @return
406  * Returns -1 if unknown reference.
407  */
408 int sam_hdr_name2ref(SAM_hdr *hdr, char *ref);
409 
410 /*! Looks up a read-group by name and returns a pointer to the start of the
411  * associated tag list.
412  *
413  * @return
414  * Returns NULL on failure
415  */
416 SAM_RG *sam_hdr_find_rg(SAM_hdr *hdr, char *rg);
417 
418 /*! Fixes any PP links in @PG headers.
419  *
420  * If the entries are in order then this doesn't need doing, but incase
421  * our header is out of order this goes through the sh->pg[] array
422  * setting the prev_id field.
423  *
424  * @return
425  * Returns 0 on sucess;
426  *        -1 on failure (indicating broken PG/PP records)
427  */
428 int sam_hdr_link_pg(SAM_hdr *hdr);
429 
430 
431 /*! Add an @PG line.
432  *
433  * If we wish complete control over this use sam_hdr_add() directly. This
434  * function uses that, but attempts to do a lot of tedious house work for
435  * you too.
436  *
437  * - It will generate a suitable ID if the supplied one clashes.
438  * - It will generate multiple @PG records if we have multiple PG chains.
439  *
440  * Call it as per sam_hdr_add() with a series of key,value pairs ending
441  * in NULL.
442  *
443  * @return
444  * Returns 0 on success;
445  *        -1 on failure
446  */
447 int sam_hdr_add_PG(SAM_hdr *sh, const char *name, ...);
448 
449 /*!
450  * A function to help with construction of CL tags in @PG records.
451  * Takes an argc, argv pair and returns a single space-separated string.
452  * This string should be deallocated by the calling function.
453  *
454  * @return
455  * Returns malloced char * on success;
456  *         NULL on failure
457  */
458 char *stringify_argv(int argc, char *argv[]);
459 
460 #ifdef __cplusplus
461 }
462 #endif
463 
464 #endif /* _SAM_HDR_H_ */
465