1 /* 2 * Copyright (c) 2013 Genome Research Ltd. 3 * Author(s): James Bonfield, Rob Davies 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions are met: 7 * 8 * 1. Redistributions of source code must retain the above copyright notice, 9 * this list of conditions and the following disclaimer. 10 * 11 * 2. Redistributions in binary form must reproduce the above 12 * copyright notice, this list of conditions and the following 13 * disclaimer in the documentation and/or other materials provided 14 * with the distribution. 15 * 16 * 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger 17 * Institute nor the names of its contributors may be used to endorse 18 * or promote products derived from this software without specific 19 * prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS 22 * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 23 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 24 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH 25 * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 /* 35 * Author: James Bonfield, Wellcome Trust Sanger Institute. 2013 36 */ 37 38 /*! \file 39 * SAM header parsing. 40 * 41 * These functions can be shared between SAM, BAM and CRAM file 42 * formats as all three internally use the same string encoding for 43 * header fields. 44 * 45 * Consider using the scram() generic API and calling 46 * scram_get_header() to obtain the format-specific pointer to the 47 * SAM_hdr struct. 48 */ 49 50 /* 51 * TODO. 52 * 53 * - Sort order (parse to struct, enum type, updating funcs) 54 * - Removal of lines. 55 * - Updating of lines 56 */ 57 58 #ifndef _SAM_HDR_H_ 59 #define _SAM_HDR_H_ 60 61 #ifdef __cplusplus 62 extern "C" { 63 #endif 64 65 #ifdef HAVE_CONFIG_H 66 #include "io_lib_config.h" 67 #endif 68 69 #include <stdarg.h> 70 71 #include "io_lib/dstring.h" 72 #include "io_lib/hash_table.h" 73 #include "io_lib/string_alloc.h" 74 75 /* 76 * Proposed new SAM header parsing 77 78 1 @SQ ID:foo LN:100 79 2 @SQ ID:bar LN:200 80 3 @SQ ID:ram LN:300 UR:xyz 81 4 @RG ID:r ... 82 5 @RG ID:s ... 83 84 Hash table for 2-char keys without dup entries. 85 If dup lines, we form a circular linked list. Ie hash keys = {RG, SQ}. 86 87 HASH("SQ")--\ 88 | 89 (3) <-> 1 <-> 2 <-> 3 <-> (1) 90 91 HASH("RG")--\ 92 | 93 (5) <-> 4 <-> 5 <-> (4) 94 95 Items stored in the hash values also form their own linked lists: 96 Ie SQ->ID(foo)->LN(100) 97 SQ->ID(bar)->LN(200) 98 SQ->ID(ram)->LN(300)->UR(xyz) 99 RG->ID(r) 100 */ 101 102 /*! A single key:value pair on a header line 103 * 104 * These form a linked list and hold strings. The strings are 105 * allocated from a string_alloc_t pool refeenced in the master 106 * SAM_hdr structure. Do not attempt to free, malloc or manipulate 107 * these strings directly. 108 */ 109 typedef struct SAM_hdr_tag_s { 110 struct SAM_hdr_tag_s *next; 111 char *str; 112 int len; 113 } SAM_hdr_tag; 114 115 /*! The parsed version of the SAM header string. 116 * 117 * Each header type (SQ, RG, HD, etc) points to its own SAM_hdr_type 118 * struct via the main HashTable h in the SAM_hdr struct. 119 * 120 * These in turn consist of circular bi-directional linked lists (ie 121 * rings) to hold the multiple instances of the same header type 122 * code. For example if we have 5 \@SQ lines the primary hash table 123 * will key on \@SQ pointing to the first SAM_hdr_type and that in turn 124 * will be part of a ring of 5 elements. 125 * 126 * For each SAM_hdr_type structure we also point to a SAM_hdr_tag 127 * structure which holds the tokenised attributes; the tab separated 128 * key:value pairs per line. 129 */ 130 typedef struct SAM_hdr_item_s { 131 struct SAM_hdr_item_s *next; // cirular 132 struct SAM_hdr_item_s *prev; 133 SAM_hdr_tag *tag; // first tag 134 int order; // 0 upwards 135 } SAM_hdr_type; 136 137 /*! Parsed \@SQ lines */ 138 typedef struct { 139 char *name; 140 uint32_t len; 141 SAM_hdr_type *ty; 142 SAM_hdr_tag *tag; 143 } SAM_SQ; 144 145 /*! Parsed \@RG lines */ 146 typedef struct { 147 char *name; 148 SAM_hdr_type *ty; 149 SAM_hdr_tag *tag; 150 int name_len; 151 int id; // numerical ID 152 } SAM_RG; 153 154 /*! Parsed \@PG lines */ 155 typedef struct { 156 char *name; 157 SAM_hdr_type *ty; 158 SAM_hdr_tag *tag; 159 int name_len; 160 int id; // numerical ID 161 int prev_id; // -1 if none 162 } SAM_PG; 163 164 /*! Sort order parsed from @HD line */ 165 enum sam_sort_order { 166 ORDER_UNKNOWN =-1, 167 ORDER_UNSORTED = 0, 168 ORDER_NAME = 1, 169 ORDER_COORD = 2, 170 //ORDER_COLLATE = 3 // maybe one day! 171 }; 172 173 /*! Primary structure for header manipulation 174 * 175 * The initial header text is held in the text dstring_t, but is also 176 * parsed out into SQ, RG and PG arrays. These have a HashTable 177 * associated with each to allow lookup by ID or SN fields instead of 178 * their numeric array indices. Additionally PG has an array to hold 179 * the linked list start points (the last in a PP chain). 180 * 181 * Use the appropriate sam_hdr_* functions to edit the header, and 182 * call sam_hdr_rebuild() any time the textual form needs to be 183 * updated again. 184 */ 185 typedef struct { 186 dstring_t *text; //!< concatenated text, indexed by SAM_hdr_tag 187 HashTable *h; //!< 2-char IDs, values are SAM_hdr_type 188 string_alloc_t *str_pool; //!< Pool of SAM_hdr_tag->str strings 189 pool_alloc_t *type_pool;//!< Pool of SAM_hdr_type structs 190 pool_alloc_t *tag_pool; //!< Pool of SAM_hdr_tag structs 191 192 // @SQ lines / references 193 int nref; //!< Number of \@SQ lines 194 SAM_SQ *ref; //!< Array of parsed \@SQ lines 195 HashTable *ref_hash; //!< Hash table indexed by SN: field 196 197 // @RG lines / read-groups 198 int nrg; //!< Number of \@RG lines 199 SAM_RG *rg; //!< Array of parsed \@RG lines 200 HashTable *rg_hash; //!< Hash table indexed by ID: field 201 202 // @PG lines / programs 203 int npg; //!< Number of \@PG lines 204 int npg_end; //!< Number of terminating \@PG lines 205 int npg_end_alloc; //!< Size of pg_end field 206 SAM_PG *pg; //!< Array of parsed \@PG lines 207 HashTable *pg_hash; //!< Hash table indexed by ID: field 208 int *pg_end; //!< \@PG chain termination IDs 209 210 // @HD data 211 enum sam_sort_order sort_order; //!< @HD SO: field 212 213 // Order of first occurence of @?? lines. 214 dstring_t *type_order; 215 int ntypes; 216 217 // @cond internal 218 char ID_buf[1024]; // temporary buffer 219 int ID_cnt; 220 int ref_count; // number of uses of this SAM_hdr 221 // @endcond 222 } SAM_hdr; 223 224 /*! Creates an empty SAM header, ready to be populated. 225 * 226 * @return 227 * Returns a SAM_hdr struct on success (free with sam_hdr_free()) 228 * NULL on failure 229 */ 230 SAM_hdr *sam_hdr_new(); 231 232 /*! Tokenises a SAM header into a hash table. 233 * 234 * Also extracts a few bits on specific data types, such as @RG lines. 235 * 236 * @return 237 * Returns a SAM_hdr struct on success (free with sam_hdr_free()); 238 * NULL on failure 239 */ 240 #ifdef SAMTOOLS 241 SAM_hdr *sam_hdr_parse_(const char *hdr, int len); 242 #else 243 SAM_hdr *sam_hdr_parse(const char *hdr, int len); 244 #endif 245 246 247 /*! Produces a duplicate copy of hdr and returns it. 248 * @return 249 * Returns NULL on failure 250 */ 251 SAM_hdr *sam_hdr_dup(SAM_hdr *hdr); 252 253 254 /*! Increments a reference count on hdr. 255 * 256 * This permits multiple files to share the same header, all calling 257 * sam_hdr_free when done, without causing errors for other open files. 258 */ 259 void sam_hdr_incr_ref(SAM_hdr *hdr); 260 261 262 /*! Increments a reference count on hdr. 263 * 264 * This permits multiple files to share the same header, all calling 265 * sam_hdr_free when done, without causing errors for other open files. 266 * 267 * If the reference count hits zero then the header is automatically 268 * freed. This makes it a synonym for sam_hdr_free(). 269 */ 270 void sam_hdr_decr_ref(SAM_hdr *hdr); 271 272 273 /*! Deallocates all storage used by a SAM_hdr struct. 274 * 275 * This also decrements the header reference count. If after decrementing 276 * it is still non-zero then the header is assumed to be in use by another 277 * caller and the free is not done. 278 * 279 * This is a synonym for sam_hdr_dec_ref(). 280 */ 281 void sam_hdr_free(SAM_hdr *hdr); 282 283 /*! Returns the current length of the SAM_hdr in text form. 284 * 285 * Call sam_hdr_rebuild() first if editing has taken place. 286 */ 287 int sam_hdr_length(SAM_hdr *hdr); 288 289 /*! Returns the string form of the SAM_hdr. 290 * 291 * Call sam_hdr_rebuild() first if editing has taken place. 292 */ 293 char *sam_hdr_str(SAM_hdr *hdr); 294 295 /*! Appends a formatted line to an existing SAM header. 296 * 297 * Line is a full SAM header record, eg "@SQ\tSN:foo\tLN:100", with 298 * optional new-line. If it contains more than 1 line then multiple lines 299 * will be added in order. 300 * 301 * Input text is of maximum length len or as terminated earlier by a NUL. 302 * Len may be 0 if unknown, in which case lines must be NUL-terminated. 303 * 304 * @return 305 * Returns 0 on success; 306 * -1 on failure 307 */ 308 int sam_hdr_add_lines(SAM_hdr *sh, const char *lines, int len); 309 310 /*! Adds a single line to a SAM header. 311 * 312 * Specify type and one or more key,value pairs, ending with the NULL key. 313 * Eg. sam_hdr_add(h, "SQ", "ID", "foo", "LN", "100", NULL). 314 * 315 * @return 316 * Returns 0 on success; 317 * -1 on failure 318 */ 319 int sam_hdr_add(SAM_hdr *sh, const char *type, ...); 320 321 /*! Adds a single line to a SAM header. 322 * 323 * This is much like sam_hdr_add() but with the additional va_list 324 * argument. This is followed by specifying type and one or more 325 * key,value pairs, ending with the NULL key. 326 * 327 * Eg. sam_hdr_vadd(h, "SQ", args, "ID", "foo", "LN", "100", NULL). 328 * 329 * The purpose of the additional va_list parameter is to permit other 330 * varargs functions to call this while including their own additional 331 * parameters; an example is in sam_hdr_add_PG(). 332 * 333 * @return 334 * Returns 0 on success; 335 * -1 on failure 336 */ 337 int sam_hdr_vadd(SAM_hdr *sh, const char *type, va_list ap, ...); 338 339 /*! 340 * @return 341 * Returns the first header item matching 'type'. If ID is non-NULL it checks 342 * for the tag ID: and compares against the specified ID. 343 * 344 * Returns NULL if no type/ID is found 345 */ 346 SAM_hdr_type *sam_hdr_find(SAM_hdr *hdr, char *type, 347 char *ID_key, char *ID_value); 348 349 /*! 350 * 351 * As per SAM_hdr_type, but returns a complete line of formatted text 352 * for a specific head type/ID combination. If ID is NULL then it returns 353 * the first line of the specified type. 354 * 355 * The returned string is malloced and should be freed by the calling 356 * function with free(). 357 * 358 * @return 359 * Returns NULL if no type/ID is found. 360 */ 361 char *sam_hdr_find_line(SAM_hdr *hdr, char *type, 362 char *ID_key, char *ID_value); 363 364 /*! Looks for a specific key in a single sam header line. 365 * 366 * If prev is non-NULL it also fills this out with the previous tag, to 367 * permit use in key removal. *prev is set to NULL when the tag is the first 368 * key in the list. When a tag isn't found, prev (if non NULL) will be the last 369 * tag in the existing list. 370 * 371 * @return 372 * Returns the tag pointer on success; 373 * NULL on failure 374 */ 375 SAM_hdr_tag *sam_hdr_find_key(SAM_hdr *sh, 376 SAM_hdr_type *type, 377 char *key, 378 SAM_hdr_tag **prev); 379 380 /*! Adds or updates tag key,value pairs in a header line. 381 * 382 * Eg for adding M5 tags to @SQ lines or updating sort order for the 383 * @HD line (although use the sam_hdr_sort_order() function for 384 * HD manipulation, which is a wrapper around this funuction). 385 * 386 * Specify multiple key,value pairs ending in NULL. 387 * 388 * @return 389 * Returns 0 on success; 390 * -1 on failure 391 */ 392 int sam_hdr_update(SAM_hdr *hdr, SAM_hdr_type *type, ...); 393 394 /*! Returns the sort order from the @HD SO: field */ 395 enum sam_sort_order sam_hdr_sort_order(SAM_hdr *hdr); 396 397 /*! Reconstructs the dstring from the header hash table. 398 * @return 399 * Returns 0 on success; 400 * -1 on failure 401 */ 402 int sam_hdr_rebuild(SAM_hdr *hdr); 403 404 /*! Looks up a reference sequence by name and returns the numerical ID. 405 * @return 406 * Returns -1 if unknown reference. 407 */ 408 int sam_hdr_name2ref(SAM_hdr *hdr, char *ref); 409 410 /*! Looks up a read-group by name and returns a pointer to the start of the 411 * associated tag list. 412 * 413 * @return 414 * Returns NULL on failure 415 */ 416 SAM_RG *sam_hdr_find_rg(SAM_hdr *hdr, char *rg); 417 418 /*! Fixes any PP links in @PG headers. 419 * 420 * If the entries are in order then this doesn't need doing, but incase 421 * our header is out of order this goes through the sh->pg[] array 422 * setting the prev_id field. 423 * 424 * @return 425 * Returns 0 on sucess; 426 * -1 on failure (indicating broken PG/PP records) 427 */ 428 int sam_hdr_link_pg(SAM_hdr *hdr); 429 430 431 /*! Add an @PG line. 432 * 433 * If we wish complete control over this use sam_hdr_add() directly. This 434 * function uses that, but attempts to do a lot of tedious house work for 435 * you too. 436 * 437 * - It will generate a suitable ID if the supplied one clashes. 438 * - It will generate multiple @PG records if we have multiple PG chains. 439 * 440 * Call it as per sam_hdr_add() with a series of key,value pairs ending 441 * in NULL. 442 * 443 * @return 444 * Returns 0 on success; 445 * -1 on failure 446 */ 447 int sam_hdr_add_PG(SAM_hdr *sh, const char *name, ...); 448 449 /*! 450 * A function to help with construction of CL tags in @PG records. 451 * Takes an argc, argv pair and returns a single space-separated string. 452 * This string should be deallocated by the calling function. 453 * 454 * @return 455 * Returns malloced char * on success; 456 * NULL on failure 457 */ 458 char *stringify_argv(int argc, char *argv[]); 459 460 #ifdef __cplusplus 461 } 462 #endif 463 464 #endif /* _SAM_HDR_H_ */ 465