1 /*
2 Copyright (c) 2013-2019 Genome Research Ltd.
3 Authors: James Bonfield <jkb@sanger.ac.uk>, Valeriu Ohan <vo2@sanger.ac.uk>
4 
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
7 
8    1. Redistributions of source code must retain the above copyright notice,
9 this list of conditions and the following disclaimer.
10 
11    2. Redistributions in binary form must reproduce the above copyright notice,
12 this list of conditions and the following disclaimer in the documentation
13 and/or other materials provided with the distribution.
14 
15    3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16 Institute nor the names of its contributors may be used to endorse or promote
17 products derived from this software without specific prior written permission.
18 
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30 
31 /*! \file
32  * SAM header parsing.
33  *
34  * These functions can be shared between SAM, BAM and CRAM file
35  * formats as all three internally use the same string encoding for
36  * header fields.
37  */
38 
39 
40 #ifndef HEADER_H_
41 #define HEADER_H_
42 
43 #include <stdarg.h>
44 
45 #include "cram/string_alloc.h"
46 #include "cram/pooled_alloc.h"
47 
48 #include "htslib/khash.h"
49 #include "htslib/kstring.h"
50 #include "htslib/sam.h"
51 
52 #ifdef __cplusplus
53 extern "C" {
54 #endif
55 
56 /*! Make a single integer out of a two-letter type code */
TYPEKEY(const char * type)57 static inline khint32_t TYPEKEY(const char *type) {
58     unsigned int u0 = (unsigned char) type[0];
59     unsigned int u1 = (unsigned char) type[1];
60     return (u0 << 8) | u1;
61 }
62 
63 /*
64  * Proposed new SAM header parsing
65 
66 1 @SQ ID:foo LN:100
67 2 @SQ ID:bar LN:200
68 3 @SQ ID:ram LN:300 UR:xyz
69 4 @RG ID:r ...
70 5 @RG ID:s ...
71 
72 Hash table for 2-char @keys without dup entries.
73 If dup lines, we form a circular linked list. Ie hash keys = {RG, SQ}.
74 
75 HASH("SQ")--\
76             |
77     (3) <-> 1 <-> 2 <-> 3 <-> (1)
78 
79 HASH("RG")--\
80             |
81     (5) <-> 4 <-> 5 <-> (4)
82 
83 Items stored in the hash values also form their own linked lists:
84 Ie SQ->ID(foo)->LN(100)
85    SQ->ID(bar)->LN(200)
86    SQ->ID(ram)->LN(300)->UR(xyz)
87    RG->ID(r)
88  */
89 
90 /*! A single key:value pair on a header line
91  *
92  * These form a linked list and hold strings. The strings are
93  * allocated from a string_alloc_t pool referenced in the master
94  * sam_hrecs_t structure. Do not attempt to free, malloc or manipulate
95  * these strings directly.
96  */
97 typedef struct sam_hrec_tag_s {
98     struct sam_hrec_tag_s *next;
99     const char *str;
100     int   len;
101 } sam_hrec_tag_t;
102 
103 /*! The parsed version of the SAM header string.
104  *
105  * Each header type (SQ, RG, HD, etc) points to its own sam_hdr_type
106  * struct via the main hash table h in the sam_hrecs_t struct.
107  *
108  * These in turn consist of circular bi-directional linked lists (ie
109  * rings) to hold the multiple instances of the same header type
110  * code. For example if we have 5 \@SQ lines the primary hash table
111  * will key on \@SQ pointing to the first sam_hdr_type and that in turn
112  * will be part of a ring of 5 elements.
113  *
114  * For each sam_hdr_type structure we also point to a sam_hdr_tag
115  * structure which holds the tokenised attributes; the tab separated
116  * key:value pairs per line.
117  */
118 typedef struct sam_hrec_type_s {
119     struct sam_hrec_type_s *next; // circular list of this type
120     struct sam_hrec_type_s *prev; // circular list of this type
121     struct sam_hrec_type_s *global_next; // circular list of all lines
122     struct sam_hrec_type_s *global_prev; // circular list of all lines
123     sam_hrec_tag_t *tag;          // first tag
124     khint32_t type;               // Two-letter type code as an int
125 } sam_hrec_type_t;
126 
127 /*! Parsed \@SQ lines */
128 typedef struct {
129     const char *name;
130     hts_pos_t len;
131     sam_hrec_type_t *ty;
132 } sam_hrec_sq_t;
133 
134 /*! Parsed \@RG lines */
135 typedef struct {
136     const char *name;
137     sam_hrec_type_t *ty;
138     int name_len;
139     int id;           // numerical ID
140 } sam_hrec_rg_t;
141 
142 /*! Parsed \@PG lines */
143 typedef struct {
144     const char *name;
145     sam_hrec_type_t *ty;
146     int name_len;
147     int id;           // numerical ID
148     int prev_id;      // -1 if none
149 } sam_hrec_pg_t;
150 
151 
152 /*! Sort order parsed from @HD line */
153 enum sam_sort_order {
154     ORDER_UNKNOWN  =-1,
155     ORDER_UNSORTED = 0,
156     ORDER_NAME     = 1,
157     ORDER_COORD    = 2
158   //ORDER_COLLATE  = 3 // maybe one day!
159 };
160 
161 enum sam_group_order {
162     ORDER_NONE      =-1,
163     ORDER_QUERY     = 0,
164     ORDER_REFERENCE = 1
165 };
166 
167 KHASH_MAP_INIT_INT(sam_hrecs_t, sam_hrec_type_t*)
168 KHASH_MAP_INIT_STR(m_s2i, int)
169 
170 /*! Primary structure for header manipulation
171  *
172  * The initial header text is held in the text kstring_t, but is also
173  * parsed out into SQ, RG and PG arrays. These have a hash table
174  * associated with each to allow lookup by ID or SN fields instead of
175  * their numeric array indices. Additionally PG has an array to hold
176  * the linked list start points (the last in a PP chain).
177  *
178  * Use the appropriate sam_hdr_* functions to edit the header, and
179  * call sam_hdr_rebuild() any time the textual form needs to be
180  * updated again.
181  */
182 struct sam_hrecs_t {
183     khash_t(sam_hrecs_t) *h;
184     sam_hrec_type_t *first_line; //!< First line (usually @HD)
185     string_alloc_t *str_pool; //!< Pool of sam_hdr_tag->str strings
186     pool_alloc_t   *type_pool;//!< Pool of sam_hdr_type structs
187     pool_alloc_t   *tag_pool; //!< Pool of sam_hdr_tag structs
188 
189     // @SQ lines / references
190     int nref;                  //!< Number of \@SQ lines
191     int ref_sz;                //!< Number of entries available in ref[]
192     sam_hrec_sq_t *ref;        //!< Array of parsed \@SQ lines
193     khash_t(m_s2i) *ref_hash;  //!< Maps SQ SN field to ref[] index
194 
195     // @RG lines / read-groups
196     int nrg;                   //!< Number of \@RG lines
197     int rg_sz;                 //!< number of entries available in rg[]
198     sam_hrec_rg_t *rg;         //!< Array of parsed \@RG lines
199     khash_t(m_s2i) *rg_hash;   //!< Maps RG ID field to rg[] index
200 
201     // @PG lines / programs
202     int npg;                   //!< Number of \@PG lines
203     int pg_sz;                //!< Number of entries available in pg[]
204     int npg_end;               //!< Number of terminating \@PG lines
205     int npg_end_alloc;         //!< Size of pg_end field
206     sam_hrec_pg_t *pg;         //!< Array of parsed \@PG lines
207     khash_t(m_s2i) *pg_hash;   //!< Maps PG ID field to pg[] index
208     int *pg_end;               //!< \@PG chain termination IDs
209 
210     // @cond internal
211     char *ID_buf;             // temporary buffer for sam_hdr_pg_id
212     uint32_t ID_buf_sz;
213     int ID_cnt;
214     // @endcond
215 
216     int dirty;                // marks the header as modified, so it can be rebuilt
217     int refs_changed;         // Index of first changed ref (-1 if unchanged)
218     int pgs_changed;          // New PG line added
219     int type_count;
220     char (*type_order)[3];
221 };
222 
223 /*!
224  * Method for parsing the header text and populating the
225  * internal hash tables. After calling this method, the
226  * parsed representation becomes the single source of truth.
227  *
228  * @param bh    Header structure, previously initialised by a
229  *              sam_hdr_init call
230  * @return      0 on success, -1 on failure
231  */
232 int sam_hdr_fill_hrecs(sam_hdr_t *bh);
233 
234 /*!
235  * Reconstructs the text representation of the header from
236  * the hash table data after a change has been performed on
237  * the header.
238  *
239  * @return  0 on success, -1 on failure
240  */
241 int sam_hdr_rebuild(sam_hdr_t *bh);
242 
243 /*! Creates an empty SAM header, ready to be populated.
244  *
245  * @return
246  * Returns a sam_hrecs_t struct on success (free with sam_hrecs_free())
247  *         NULL on failure
248  */
249 sam_hrecs_t *sam_hrecs_new(void);
250 
251 /*! Produces a duplicate copy of hrecs and returns it.
252  * @return
253  * Returns NULL on failure
254  */
255 sam_hrecs_t *sam_hrecs_dup(sam_hrecs_t *hrecs);
256 
257 /*! Update sam_hdr_t target_name and target_len arrays
258  *
259  *  sam_hdr_t and sam_hrecs_t are specified separately so that sam_hdr_dup
260  *  can use it to construct target arrays from the source header.
261  *
262  *  @return 0 on success; -1 on failure
263  */
264 int sam_hdr_update_target_arrays(sam_hdr_t *bh, const sam_hrecs_t *hrecs,
265                                  int refs_changed);
266 
267 /*! Reconstructs a kstring from the header hash table.
268  *
269  * @return
270  * Returns 0 on success
271  *        -1 on failure
272  */
273 int sam_hrecs_rebuild_text(const sam_hrecs_t *hrecs, kstring_t *ks);
274 
275 /*! Deallocates all storage used by a sam_hrecs_t struct.
276  *
277  * This also decrements the header reference count. If after decrementing
278  * it is still non-zero then the header is assumed to be in use by another
279  * caller and the free is not done.
280  */
281 void sam_hrecs_free(sam_hrecs_t *hrecs);
282 
283 /*!
284  * @return
285  * Returns the first header item matching 'type'. If ID is non-NULL it checks
286  * for the tag ID: and compares against the specified ID.
287  *
288  * Returns NULL if no type/ID is found
289  */
290 sam_hrec_type_t *sam_hrecs_find_type_id(sam_hrecs_t *hrecs, const char *type,
291                                      const char *ID_key, const char *ID_value);
292 
293 sam_hrec_tag_t *sam_hrecs_find_key(sam_hrec_type_t *type,
294                                    const char *key,
295                                    sam_hrec_tag_t **prev);
296 
297 int sam_hrecs_remove_key(sam_hrecs_t *hrecs,
298                          sam_hrec_type_t *type,
299                          const char *key);
300 
301 /*! Looks up a read-group by name and returns a pointer to the start of the
302  * associated tag list.
303  *
304  * @return
305  * Returns NULL on failure
306  */
307 sam_hrec_rg_t *sam_hrecs_find_rg(sam_hrecs_t *hrecs, const char *rg);
308 
309 /*! Returns the sort order from the @HD SO: field */
310 enum sam_sort_order sam_hrecs_sort_order(sam_hrecs_t *hrecs);
311 
312 /*! Returns the group order from the @HD SO: field */
313 enum sam_group_order sam_hrecs_group_order(sam_hrecs_t *hrecs);
314 
315 #ifdef __cplusplus
316 }
317 #endif
318 
319 #endif /* HEADER_H_ */
320