1 /*=========================================================================== 2 * 3 * PUBLIC DOMAIN NOTICE 4 * National Center for Biotechnology Information 5 * 6 * This software/database is a "United States Government Work" under the 7 * terms of the United States Copyright Act. It was written as part of 8 * the author's official duties as a United States Government employee and 9 * thus cannot be copyrighted. This software/database is freely available 10 * to the public for use. The National Library of Medicine and the U.S. 11 * Government have not placed any restriction on its use or reproduction. 12 * 13 * Although all reasonable efforts have been taken to ensure the accuracy 14 * and reliability of the software and data, the NLM and the U.S. 15 * Government do not and cannot warrant the performance or results that 16 * may be obtained by using this software or data. The NLM and the U.S. 17 * Government disclaim all warranties, express or implied, including 18 * warranties of performance, merchantability or fitness for any particular 19 * purpose. 20 * 21 * Please cite the author in any work or product based on this material. 22 * 23 * =========================================================================== 24 * 25 */ 26 27 #ifndef _h_common_reader_ 28 #define _h_common_reader_ 29 30 #ifndef _h_klib_defs_ 31 #include <klib/defs.h> 32 #endif 33 34 #ifdef __cplusplus 35 extern "C" { 36 #endif 37 38 /*-------------------------------------------------------------------------- 39 * forwards 40 */ 41 typedef struct ReaderFile ReaderFile; 42 typedef struct Record Record; 43 typedef struct Sequence Sequence; 44 typedef struct Alignment Alignment; 45 typedef struct CGData CGData; 46 typedef struct Rejected Rejected; 47 typedef struct ReferenceInfo ReferenceInfo; 48 49 /*-------------------------------------------------------------------------- 50 ReaderFile 51 */ 52 rc_t CC ReaderFileAddRef ( const ReaderFile *self ); 53 rc_t CC ReaderFileRelease ( const ReaderFile *self ); 54 55 /* GetRecord 56 * Parses the next record from the source. At the end of the file, rc == 0, *result == 0. 57 */ 58 rc_t CC ReaderFileGetRecord( const ReaderFile *self, const Record** result); 59 60 /* GetPathname 61 * Returns input's pathname, if applicable. 62 */ 63 const char* CC ReaderFileGetPathname ( const ReaderFile *self ); 64 65 /* GetProportionalPosition 66 * get the aproximate proportional position in the input file 67 * this is intended to be useful for computing progress 68 * 69 * NB - does not return rc_t 70 */ 71 float CC ReaderFileGetProportionalPosition ( const ReaderFile *self ); 72 73 /* GetReferenceInfo 74 * 75 */ 76 rc_t CC ReaderFileGetReferenceInfo ( const ReaderFile *self, const ReferenceInfo** result ); 77 78 /*-------------------------------------------------------------------------- 79 Record 80 */ 81 82 /* AddRef 83 * Release 84 */ 85 rc_t CC RecordAddRef ( const Record *self ); 86 rc_t CC RecordRelease ( const Record *self ); 87 88 rc_t CC RecordGetRejected ( const Record *self, const Rejected** result); 89 rc_t CC RecordGetSequence ( const Record *self, const Sequence** result); 90 rc_t CC RecordGetAlignment( const Record *self, const Alignment** result); 91 92 /*-------------------------------------------------------------------------- 93 Sequence 94 */ 95 96 /* AddRef 97 * Release 98 */ 99 rc_t CC SequenceAddRef ( const Sequence *self ); 100 rc_t CC SequenceRelease ( const Sequence *self ); 101 102 /* GetReadLength 103 * get the sequence length 104 * i.e. the number of elements of both sequence and quality 105 * 106 * "length" [ OUT ] - length in bases of query sequence and quality 107 */ 108 rc_t CC SequenceGetReadLength ( const Sequence *self, uint32_t *length ); 109 110 /* GetRead 111 * get the sequence data [0..ReadLength) 112 * caller provides buffer of ReadLength bytes 113 * 114 * "sequence" [ OUT ] - pointer to a buffer of at least ReadLength bytes 115 */ 116 rc_t CC SequenceGetRead( const Sequence *self, char *sequence ); 117 118 /* GetRead2 119 * get the sequence data [0..ReadLength) 120 * caller provides buffer of ReadLength bytes 121 * 122 * "sequence" [ OUT ] - pointer to a buffer of at least ReadLength bytes 123 * 124 * "start" [ IN ] and "stop" [ IN ] - zero-based coordinates, half-closed interval; both have to be within ReadLength 125 */ 126 rc_t CC SequenceGetRead2 ( const Sequence *self, char *sequence, uint32_t start, uint32_t stop); 127 128 enum QualityType { 129 QT_Unknown = 0, 130 QT_Phred, 131 QT_LogOdds 132 }; 133 /* GetQuality 134 * get the raw quality data [0..ReadLength) from OQ if possible else from QUAL 135 * values are unsigned with 0xFF == missing 136 * 137 * "quality" [ OUT ] - return param for quality sequence 138 * held internally, validity is guaranteed for the life of the sequence 139 * 140 * "offset" [ OUT ] - the zero point of quality (33, 64; 0 for binary) 141 * 142 * "qualType" [ OUT ] - quality type (phred, log-odds, unknown) 143 */ 144 rc_t CC SequenceGetQuality(const Sequence *self, const int8_t **quality, uint8_t *offset, int *qualType); 145 146 /* SequenceGetSpotGroup 147 * get the name of the spot group (e.g. accession) 148 * 149 * "name" [ OUT ] - return param for group name 150 * held internally, validity is guaranteed for the life of the sequence 151 * 152 * "length" [ OUT ] - return the number of bytes in "name" 153 */ 154 rc_t CC SequenceGetSpotGroup ( const Sequence *self, const char **name, size_t *length ); 155 156 157 /* SequenceGetSpotName 158 * get the read name and length in bytes 159 * 160 * "name" [ OUT ] - return param for read group name 161 * held internally, validity is guaranteed for the life of the sequence 162 * 163 * "length" [ OUT ] - return the number of bytes in "name" 164 */ 165 rc_t CC SequenceGetSpotName ( const Sequence *self, const char **name, size_t *length ); 166 167 /* IsColorSpace 168 * Does the sequence have colorspace info 169 */ 170 bool CC SequenceIsColorSpace ( const Sequence *self ); 171 172 /* GetCSKey 173 * get the colorspace key 174 * 175 * "cskey" [ OUT ] - return param 176 * 177 * return: if no colorspace info, RC is 0 but the value of cskey is undefined 178 */ 179 rc_t CC SequenceGetCSKey ( const Sequence *self, char cskey[1] ); 180 181 /* GetCSReadLength 182 * get the color space sequence length 183 * i.e. the number of elements of both sequence and quality 184 * 185 * "length" [ OUT ] - length in bases of query sequence and quality 186 */ 187 rc_t CC SequenceGetCSReadLength ( const Sequence *self, uint32_t *length ); 188 189 /* GetCSRead 190 * get the color space sequence data [0..ReadLength) 191 * caller provides buffer of ReadLength bytes 192 * 193 * "sequence" [ OUT ] - pointer to a buffer of at least ReadLength bytes 194 */ 195 rc_t CC SequenceGetCSRead( const Sequence *self, char *sequence ); 196 197 /* GetCSQuality 198 * get the color spaqce sequence's raw quality data [0..ReadLength) from OQ if possible else from QUAL 199 * values are unsigned with 0xFF == missing 200 * 201 * "quality" [ OUT ] - return param for quality sequence 202 * held internally, validity is guaranteed for the life of the sequence 203 * 204 * "offset" [ OUT ] - the zero point of quality (33, 64; 0 for binary) 205 * 206 * "qualType" [ OUT ] - quality type (phred, log-odds, unknown) 207 */ 208 rc_t CC SequenceGetCSQuality(const Sequence *self, const int8_t **quality, uint8_t *offset, int *qualType); 209 210 211 /* WasPaired 212 * true if read number is present and not 0 213 */ 214 bool CC SequenceWasPaired ( const Sequence *self ); 215 216 enum ReadOrientation { 217 ReadOrientationUnknown, 218 ReadOrientationForward, 219 ReadOrientationReverse 220 }; 221 /* SequenceGetOrientationSelf 222 */ 223 int CC SequenceGetOrientationSelf( const Sequence *self ); 224 /* SequenceGetOrientationMate 225 */ 226 int CC SequenceGetOrientationMate( const Sequence *self ); 227 228 /* IsFirst 229 * fastq: read number is present and equal to 1 230 */ 231 bool CC SequenceIsFirst ( const Sequence *self ); 232 /* IsSecond 233 * fastq: read number is present and equal to 2 234 */ 235 bool CC SequenceIsSecond ( const Sequence *self ); 236 /* IsDuplicate 237 * 238 */ 239 bool CC SequenceIsDuplicate( const Sequence *self ); 240 /* IsLowQuality 241 * 242 */ 243 bool CC SequenceIsLowQuality( const Sequence *self ); 244 245 /* RecordGetTI 246 * 247 */ 248 rc_t SequenceGetTI(Sequence const *self, uint64_t *ti); 249 250 /*-------------------------------------------------------------------------- 251 Alignment 252 */ 253 254 /* AddRef 255 * Release 256 */ 257 rc_t CC AlignmentAddRef ( const Alignment *self ); 258 rc_t CC AlignmentRelease ( const Alignment *self ); 259 260 /* GetRefSeqId 261 * get id of reference sequence 262 * pass result into BAMFileGetRefSeqById to get the Reference Sequence record 263 * 264 * "refSeqId" [ OUT ] - zero-based id of reference sequence 265 * returns -1 if set is invalid within BAM ( rc may be zero ) 266 */ 267 rc_t CC AlignmentGetRefSeqId ( const Alignment *self, int32_t *refSeqId ); 268 269 /* GetMateRefSeqId 270 * get id of mate's reference sequence 271 * pass result into BAMFileGetRefSeqById to get the Reference Sequence record 272 * 273 * "refSeqId" [ OUT ] - zero-based id of reference sequence 274 * returns -1 if invalid 275 */ 276 rc_t CC AlignmentGetMateRefSeqId ( const Alignment *self, int32_t *refSeqId ); 277 278 /* GetPosition 279 * get the aligned position on the ref. seq. 280 * 281 * "n" [ IN ] - zero-based position index for cases of multiple alignments 282 * 283 * "pos" [ OUT ] - zero-based position on reference sequence 284 * returns -1 if invalid 285 */ 286 rc_t CC AlignmentGetPosition ( const Alignment *self, int64_t *pos ); 287 288 /* GetMatePosition 289 * starting coordinate of mate's alignment on ref. seq. 290 * 291 * "pos" [ OUT ] - zero-based position on reference sequence 292 * returns -1 if invalid 293 */ 294 rc_t CC AlignmentGetMatePosition ( const Alignment *self, int64_t *pos ); 295 296 /* GetMapQuality 297 * return the quality score of mapping 298 * 299 * "qual" [ OUT ] - return param for quality score 300 */ 301 rc_t CC AlignmentGetMapQuality ( const Alignment *self, uint8_t *qual ); 302 303 /* GetAlignmentDetail 304 * get the alignment details 305 * 306 * "rslt" [ OUT, NULL OKAY ] and "count" [ IN ] - array to hold detail records 307 * 308 * "actual" [ OUT, NULL OKAY ] - number of elements written to "rslt" 309 * required if "rslt" is NULL 310 * 311 * "firstMatch" [ OUT, NULL OKAY ] - zero-based index into "rslt" of the first match to the refSeq 312 * or < 0 if invalid 313 * 314 * "lastMatch" [ OUT, NULL OKAY ] - zero-based index into "rslt" of the last match to the refSeq 315 * or < 0 if invalid 316 */ 317 typedef uint32_t AlignOpType; 318 enum AlignOpTypes 319 { 320 align_Match = 'M', /* 0 */ 321 align_Insert = 'I', /* 1 */ 322 align_Delete = 'D', /* 2 */ 323 align_Skip = 'N', /* 3 */ 324 align_SoftClip = 'S', /* 4 */ 325 align_HardClip = 'H', /* 5 */ 326 align_Padded = 'P', /* 6 */ 327 align_Equal = '=', /* 7 */ 328 align_NotEqual = 'X', /* 8 */ 329 align_Overlap = 'B' /* Complete Genomics extension */ 330 }; 331 332 typedef struct AlignmentDetail AlignmentDetail; 333 struct AlignmentDetail 334 { 335 int64_t refSeq_pos; /* position on refSeq where this alignment region starts or -1 if NA */ 336 int32_t read_pos; /* position on read where this alignment region starts or -1 if NA */ 337 uint32_t length; /* length of alignment region */ 338 AlignOpType type; /* type of alignment */ 339 }; 340 341 rc_t CC AlignmentGetAlignmentDetail ( const Alignment *self, 342 AlignmentDetail *rslt, 343 uint32_t count, 344 uint32_t *actual, 345 int32_t *firstMatch, 346 int32_t *lastMatch ); 347 348 349 /* GetCigarCount 350 * the number of CIGAR elements 351 * a CIGAR element consists of the pair of matching op code and op length 352 * 353 * "n" [ OUT ] - return param for cigar count 354 */ 355 rc_t CC AlignmentGetAlignOpCount ( const Alignment *self, uint32_t *n ); 356 357 358 /* GetInsertSize 359 * distance in bases to start of mate's alignment on ref. seq. 360 * 361 * "size" [ OUT ] - >0 for first in pair, <0 for second 362 */ 363 rc_t CC AlignmentGetInsertSize ( const Alignment *self, int64_t *size ); 364 365 /* GetBAMCigar 366 * 367 */ 368 rc_t CC AlignmentGetBAMCigar(const Alignment *cself, uint32_t const **rslt, uint32_t *length); 369 370 /* IsSecondary 371 * 372 */ 373 bool CC AlignmentIsSecondary( const Alignment *self ); 374 375 376 /* AlignmentGetCG 377 * rc_t == 0, result == 0 if no CG data 378 */ 379 rc_t CC AlignmentGetCGData ( const Alignment *self, const CGData** result); 380 381 /*-------------------------------------------------------------------------- 382 * CGData 383 */ 384 rc_t CC CGDataAddRef ( const CGData *self ); 385 rc_t CC CGDataRelease ( const CGData *self ); 386 387 /* CGGetSeqQual 388 */ 389 rc_t CC CGDataGetSeqQual ( const CGData* self, 390 char sequence[/* 35 */], 391 uint8_t quality[/* 35 */] ); 392 393 /* CGGetCigar 394 */ 395 rc_t CC CGDataGetCigar ( const CGData* self, 396 uint32_t *cigar, 397 uint32_t cig_max, 398 uint32_t *cig_act ); 399 400 /* CGGetAlignGroup 401 */ 402 rc_t CC CGDataGetAlignGroup ( const CGData* self, 403 char buffer[], 404 size_t max_size, 405 size_t *act_size ); 406 407 /*-------------------------------------------------------------------------- 408 * Rejected 409 */ 410 411 /* AddRef 412 * Release 413 */ 414 rc_t CC RejectedAddRef ( const Rejected *self ); 415 rc_t CC RejectedRelease ( const Rejected *self ); 416 417 /* GetError 418 * "text" [ OUT ] - NUL-terminated error message, held internally 419 * "line" [ OUT ] - 1-based line # in the source (0 for binary formats) 420 * "column" [ OUT ] - 1-based column # in the source (offset from the start of the file for binary formats) 421 * "fatal" [ OUT ] - no further parsing should be done (likely an unsupported format) 422 */ 423 rc_t CC RejectedGetError( const Rejected* self, const char** text, uint64_t* line, uint64_t* column, bool* fatal ); 424 425 /* GetData 426 * "data" [ OUT ] - raw input representing the rejected record. held internally 427 * "length" [ OUT ] - size of the data buffer 428 */ 429 rc_t CC RejectedGetData( const Rejected* self, const void** text, size_t* length ); 430 431 /*-------------------------------------------------------------------------- 432 * ReferenceInfo 433 */ 434 typedef struct ReferenceSequence 435 { 436 uint64_t length; 437 const char *name; /* not null unique */ 438 const uint8_t *checksum; 439 } ReferenceSequence; 440 441 typedef struct ReadGroup 442 { 443 const char *name; /* not null unique, accession e.g. SRR001138 */ 444 const char *platform; /* e.g. ILLUMINA */ 445 } ReadGroup; 446 447 rc_t CC ReferenceInfoAddRef ( const ReferenceInfo *self ); 448 rc_t CC ReferenceInfoRelease ( const ReferenceInfo *self ); 449 450 /* GetRefSeqCount 451 * get the number of Reference Sequences refered to in the header 452 * this is not necessarily the number of Reference Sequences referenced 453 * by the alignments 454 */ 455 rc_t CC ReferenceInfoGetRefSeqCount ( const ReferenceInfo *self, uint32_t* count ); 456 457 /* GetRefSeq 458 * get the n'th Ref. Seq. where n is [0..RefSeqCount) 459 * the result is populated with pointers that are good for precisely at long as the ReferenceInfo exists. 460 */ 461 rc_t CC ReferenceInfoGetRefSeq ( const ReferenceInfo *self, uint32_t n, ReferenceSequence *result ); 462 463 /* GetReadGroupCount 464 * get the number of Read Groups (accessions, etc.) refered to in the header 465 * this is not necessarily the number of Read Groups referenced 466 * by the alignments 467 */ 468 rc_t CC ReferenceInfoGetReadGroupCount ( const ReferenceInfo *self, uint32_t *count ); 469 470 /* GetReadGroup 471 * get the n'th Read Group where n is [0..ReadGroupCount) 472 * the result is populated with pointers that are good for precisely at long as the ReferenceInfo exists. 473 */ 474 rc_t CC ReferenceInfoGetReadGroup ( const ReferenceInfo *self, unsigned n, ReadGroup *result ); 475 476 /* GetReadGroupByName 477 * get a Read Group by its name 478 * the result is populated with pointers that are good for precisely at long as the ReferenceInfo exists. 479 */ 480 rc_t CC ReferenceInfoGetReadGroupByName ( const ReferenceInfo *self, const char *name, ReadGroup *result ); 481 482 #ifdef __cplusplus 483 } 484 #endif 485 486 #endif /* _h_common_reader_ */ 487