1 /*===========================================================================
2  *
3  *                            PUBLIC DOMAIN NOTICE
4  *               National Center for Biotechnology Information
5  *
6  *  This software/database is a "United States Government Work" under the
7  *  terms of the United States Copyright Act.  It was written as part of
8  *  the author's official duties as a United States Government employee and
9  *  thus cannot be copyrighted.  This software/database is freely available
10  *  to the public for use. The National Library of Medicine and the U.S.
11  *  Government have not placed any restriction on its use or reproduction.
12  *
13  *  Although all reasonable efforts have been taken to ensure the accuracy
14  *  and reliability of the software and data, the NLM and the U.S.
15  *  Government do not and cannot warrant the performance or results that
16  *  may be obtained by using this software or data. The NLM and the U.S.
17  *  Government disclaim all warranties, express or implied, including
18  *  warranties of performance, merchantability or fitness for any particular
19  *  purpose.
20  *
21  *  Please cite the author in any work or product based on this material.
22  *
23  * ===========================================================================
24  *
25  */
26 
27 #ifndef _h_common_reader_
28 #define _h_common_reader_
29 
30 #ifndef _h_klib_defs_
31 #include <klib/defs.h>
32 #endif
33 
34 #ifdef __cplusplus
35 extern "C" {
36 #endif
37 
38 /*--------------------------------------------------------------------------
39  * forwards
40  */
/* Types used only through pointers in this interface;
 * their struct bodies are not defined in this header.
 */
typedef struct ReaderFile ReaderFile;       /* input that yields Records */
typedef struct Record Record;               /* one parsed unit; see RecordGet* */
typedef struct Sequence Sequence;           /* read data of a Record */
typedef struct Alignment Alignment;         /* alignment data of a Record */
typedef struct CGData CGData;               /* obtained via AlignmentGetCGData */
typedef struct Rejected Rejected;           /* error details for a rejected Record */
typedef struct ReferenceInfo ReferenceInfo; /* reference sequences and read groups */
48 
49 /*--------------------------------------------------------------------------
50  ReaderFile
51  */
52 rc_t CC ReaderFileAddRef ( const ReaderFile *self );
53 rc_t CC ReaderFileRelease ( const ReaderFile *self );
54 
55 /* GetRecord
56  * Parses the next record from the source. At the end of the file, rc == 0, *result == 0.
57  */
58 rc_t CC ReaderFileGetRecord( const ReaderFile *self, const Record** result);
59 
60 /* GetPathname
61  * Returns input's pathname, if applicable.
62  */
63 const char* CC ReaderFileGetPathname ( const ReaderFile *self );
64 
65 /* GetProportionalPosition
66  *  get the aproximate proportional position in the input file
67  *  this is intended to be useful for computing progress
68  *
69  * NB - does not return rc_t
70  */
71 float CC ReaderFileGetProportionalPosition ( const ReaderFile *self );
72 
73 /* GetReferenceInfo
74  *
75  */
76 rc_t CC ReaderFileGetReferenceInfo ( const ReaderFile *self, const ReferenceInfo** result );
77 
78 /*--------------------------------------------------------------------------
79  Record
80  */
81 
82 /* AddRef
83  * Release
84  */
85 rc_t CC RecordAddRef ( const Record *self );
86 rc_t CC RecordRelease ( const Record *self );
87 
88 rc_t CC RecordGetRejected ( const Record *self, const Rejected** result);
89 rc_t CC RecordGetSequence ( const Record *self, const Sequence** result);
90 rc_t CC RecordGetAlignment( const Record *self, const Alignment** result);
91 
92 /*--------------------------------------------------------------------------
93  Sequence
94  */
95 
96 /* AddRef
97  * Release
98  */
99 rc_t CC SequenceAddRef ( const Sequence *self );
100 rc_t CC SequenceRelease ( const Sequence *self );
101 
102 /* GetReadLength
103  *  get the sequence length
104  *  i.e. the number of elements of both sequence and quality
105  *
106  *  "length" [ OUT ] - length in bases of query sequence and quality
107  */
108 rc_t CC SequenceGetReadLength ( const Sequence *self, uint32_t *length );
109 
110 /* GetRead
111  *  get the sequence data [0..ReadLength)
112  *  caller provides buffer of ReadLength bytes
113  *
114  *  "sequence" [ OUT ] - pointer to a buffer of at least ReadLength bytes
115  */
116 rc_t CC SequenceGetRead( const Sequence *self, char *sequence );
117 
118 /* GetRead2
119  *  get the sequence data [0..ReadLength)
120  *  caller provides buffer of ReadLength bytes
121  *
122  *  "sequence" [ OUT ] - pointer to a buffer of at least ReadLength bytes
123  *
124  *  "start" [ IN ] and "stop" [ IN ] - zero-based coordinates, half-closed interval; both have to be within ReadLength
125  */
126 rc_t CC SequenceGetRead2 ( const Sequence *self, char *sequence, uint32_t start, uint32_t stop);
127 
/* QualityType
 *  kinds of quality score (unknown, phred, log-odds);
 *  see the "qualType" parameter of SequenceGetQuality
 */
enum QualityType {
    QT_Unknown = 0,
    QT_Phred   = 1,
    QT_LogOdds = 2
};
133 /* GetQuality
134  *  get the raw quality data [0..ReadLength) from OQ if possible else from QUAL
135  *  values are unsigned with 0xFF == missing
136  *
137  *  "quality" [ OUT ] - return param for quality sequence
138  *   held internally, validity is guaranteed for the life of the sequence
139  *
140  *  "offset" [ OUT ] - the zero point of quality (33, 64; 0 for binary)
141  *
142  *  "qualType" [ OUT ] - quality type (phred, log-odds, unknown)
143  */
144 rc_t CC SequenceGetQuality(const Sequence *self, const int8_t **quality, uint8_t *offset, int *qualType);
145 
146 /* SequenceGetSpotGroup
147  *  get the name of the spot group (e.g. accession)
148  *
149  *  "name" [ OUT ] - return param for group name
150  *   held internally, validity is guaranteed for the life of the sequence
151  *
152  *  "length" [ OUT ] - return the number of bytes in "name"
153  */
154 rc_t CC SequenceGetSpotGroup ( const Sequence *self, const char **name, size_t *length );
155 
156 
157 /* SequenceGetSpotName
158  *  get the read name and length in bytes
159  *
160  *  "name" [ OUT ] - return param for read group name
161  *   held internally, validity is guaranteed for the life of the sequence
162  *
163  *  "length" [ OUT ] - return the number of bytes in "name"
164  */
165 rc_t CC SequenceGetSpotName ( const Sequence *self, const char **name, size_t *length );
166 
167 /* IsColorSpace
168  *  Does the sequence have colorspace info
169  */
170 bool CC SequenceIsColorSpace ( const Sequence *self );
171 
172 /* GetCSKey
173  *  get the colorspace key
174  *
175  *  "cskey" [ OUT ] - return param
176  *
177  *  return: if no colorspace info, RC is 0 but the value of cskey is undefined
178  */
179 rc_t CC SequenceGetCSKey ( const Sequence *self, char cskey[1] );
180 
181 /* GetCSReadLength
182  *  get the color space sequence length
183  *  i.e. the number of elements of both sequence and quality
184  *
185  *  "length" [ OUT ] - length in bases of query sequence and quality
186  */
187 rc_t CC SequenceGetCSReadLength ( const Sequence *self, uint32_t *length );
188 
189 /* GetCSRead
190  *  get the color space sequence data [0..ReadLength)
191  *  caller provides buffer of ReadLength bytes
192  *
193  *  "sequence" [ OUT ] - pointer to a buffer of at least ReadLength bytes
194  */
195 rc_t CC SequenceGetCSRead( const Sequence *self, char *sequence );
196 
197 /* GetCSQuality
198  *  get the color spaqce sequence's raw quality data [0..ReadLength) from OQ if possible else from QUAL
199  *  values are unsigned with 0xFF == missing
200  *
201  *  "quality" [ OUT ] - return param for quality sequence
202  *   held internally, validity is guaranteed for the life of the sequence
203  *
204  *  "offset" [ OUT ] - the zero point of quality (33, 64; 0 for binary)
205  *
206  *  "qualType" [ OUT ] - quality type (phred, log-odds, unknown)
207  */
208 rc_t CC SequenceGetCSQuality(const Sequence *self, const int8_t **quality, uint8_t *offset, int *qualType);
209 
210 
211 /* WasPaired
212  * true if read number is present and not 0
213  */
214 bool CC SequenceWasPaired     ( const Sequence *self );
215 
/* ReadOrientation
 *  orientation of a read: unknown, forward or reverse
 */
enum ReadOrientation {
    ReadOrientationUnknown = 0,
    ReadOrientationForward = 1,
    ReadOrientationReverse = 2
};
221 /* SequenceGetOrientationSelf
222  */
223 int CC SequenceGetOrientationSelf( const Sequence *self );
224 /* SequenceGetOrientationMate
225  */
226 int CC SequenceGetOrientationMate( const Sequence *self );
227 
228 /* IsFirst
229  * fastq: read number is present and equal to 1
230  */
231 bool CC SequenceIsFirst       ( const Sequence *self );
232 /* IsSecond
233  * fastq: read number is present and equal to 2
234  */
235 bool CC SequenceIsSecond      ( const Sequence *self );
236 /* IsDuplicate
237  *
238  */
239 bool CC SequenceIsDuplicate( const Sequence *self );
240 /* IsLowQuality
241  *
242  */
243 bool CC SequenceIsLowQuality( const Sequence *self );
244 
245 /*  RecordGetTI
246  *
247  */
248 rc_t SequenceGetTI(Sequence const *self, uint64_t *ti);
249 
250 /*--------------------------------------------------------------------------
251  Alignment
252  */
253 
254 /* AddRef
255  * Release
256  */
257 rc_t CC AlignmentAddRef ( const Alignment *self );
258 rc_t CC AlignmentRelease ( const Alignment *self );
259 
260 /* GetRefSeqId
261  *  get id of reference sequence
262  *  pass result into BAMFileGetRefSeqById to get the Reference Sequence record
263  *
264  *  "refSeqId" [ OUT ] - zero-based id of reference sequence
265  *   returns -1 if set is invalid within BAM ( rc may be zero )
266  */
267 rc_t CC AlignmentGetRefSeqId ( const Alignment *self, int32_t *refSeqId );
268 
269 /* GetMateRefSeqId
270  *  get id of mate's reference sequence
271  *  pass result into BAMFileGetRefSeqById to get the Reference Sequence record
272  *
273  *  "refSeqId" [ OUT ] - zero-based id of reference sequence
274  *   returns -1 if invalid
275  */
276 rc_t CC AlignmentGetMateRefSeqId ( const Alignment *self, int32_t *refSeqId );
277 
278 /* GetPosition
279  *  get the aligned position on the ref. seq.
280  *
281  *  "n" [ IN ] - zero-based position index for cases of multiple alignments
282  *
283  *  "pos" [ OUT ] - zero-based position on reference sequence
284  *  returns -1 if invalid
285  */
286 rc_t CC AlignmentGetPosition ( const Alignment *self, int64_t *pos );
287 
288 /* GetMatePosition
289  *  starting coordinate of mate's alignment on ref. seq.
290  *
291  *  "pos" [ OUT ] - zero-based position on reference sequence
292  *  returns -1 if invalid
293  */
294 rc_t CC AlignmentGetMatePosition ( const Alignment *self, int64_t *pos );
295 
296 /* GetMapQuality
297  *  return the quality score of mapping
298  *
299  *  "qual" [ OUT ] - return param for quality score
300  */
301 rc_t CC AlignmentGetMapQuality ( const Alignment *self, uint8_t *qual );
302 
303 /* GetAlignmentDetail
304  *  get the alignment details
305  *
306  *  "rslt" [ OUT, NULL OKAY ] and "count" [ IN ] - array to hold detail records
307  *
308  *  "actual" [ OUT, NULL OKAY ] - number of elements written to "rslt"
309  *   required if "rslt" is NULL
310  *
311  *  "firstMatch" [ OUT, NULL OKAY ] - zero-based index into "rslt" of the first match to the refSeq
312  *   or < 0 if invalid
313  *
314  *  "lastMatch" [ OUT, NULL OKAY ] - zero-based index into "rslt" of the last match to the refSeq
315  *   or < 0 if invalid
316  */
/* AlignOpType
 *  storage type of the "type" member of AlignmentDetail
 */
typedef uint32_t AlignOpType;

/* AlignOpTypes
 *  alignment operation codes, spelled as one-character symbols
 *  the number beside each entry is presumably the matching
 *  BAM CIGAR op code - confirm
 */
enum AlignOpTypes
{
    align_Match    = 'M',   /* 0 */
    align_Insert   = 'I',   /* 1 */
    align_Delete   = 'D',   /* 2 */
    align_Skip     = 'N',   /* 3 */
    align_SoftClip = 'S',   /* 4 */
    align_HardClip = 'H',   /* 5 */
    align_Padded   = 'P',   /* 6 */
    align_Equal    = '=',   /* 7 */
    align_NotEqual = 'X',   /* 8 */
    align_Overlap  = 'B'    /* Complete Genomics extension */
};

/* AlignmentDetail
 *  one alignment region, the element type of the "rslt" array
 *  of AlignmentGetAlignmentDetail
 */
typedef struct AlignmentDetail
{
    int64_t     refSeq_pos; /* start of this region on the refSeq, or -1 if NA */
    int32_t     read_pos;   /* start of this region on the read, or -1 if NA */
    uint32_t    length;     /* length of the region */
    AlignOpType type;       /* type of alignment */
} AlignmentDetail;
340 
341 rc_t CC AlignmentGetAlignmentDetail ( const Alignment *self,
342                                       AlignmentDetail *rslt,
343                                       uint32_t count,
344                                       uint32_t *actual,
345                                       int32_t *firstMatch,
346                                       int32_t *lastMatch );
347 
348 
349 /* GetCigarCount
350  *  the number of CIGAR elements
351  *  a CIGAR element consists of the pair of matching op code and op length
352  *
353  *  "n" [ OUT ] - return param for cigar count
354  */
355 rc_t CC AlignmentGetAlignOpCount ( const Alignment *self, uint32_t *n );
356 
357 
358 /* GetInsertSize
359  *  distance in bases to start of mate's alignment on ref. seq.
360  *
361  *  "size" [ OUT ] - >0 for first in pair, <0 for second
362  */
363 rc_t CC AlignmentGetInsertSize ( const Alignment *self, int64_t *size );
364 
365 /* GetBAMCigar
366  *
367  */
368 rc_t CC AlignmentGetBAMCigar(const Alignment *cself, uint32_t const **rslt, uint32_t *length);
369 
370 /* IsSecondary
371  *
372  */
373 bool CC AlignmentIsSecondary( const Alignment *self );
374 
375 
376 /* AlignmentGetCG
377  * rc_t == 0, result == 0 if no CG data
378  */
379 rc_t CC AlignmentGetCGData ( const Alignment *self, const CGData** result);
380 
381 /*--------------------------------------------------------------------------
382  * CGData
383  */
384 rc_t CC CGDataAddRef ( const CGData *self );
385 rc_t CC CGDataRelease ( const CGData *self );
386 
387 /* CGGetSeqQual
388  */
389 rc_t CC CGDataGetSeqQual ( const CGData* self,
390                            char sequence[/* 35 */],
391                            uint8_t quality[/* 35 */] );
392 
393 /* CGGetCigar
394  */
395 rc_t CC CGDataGetCigar ( const CGData* self,
396                          uint32_t *cigar,
397                          uint32_t cig_max,
398                          uint32_t *cig_act );
399 
400 /* CGGetAlignGroup
401  */
402 rc_t CC CGDataGetAlignGroup ( const CGData* self,
403                               char buffer[],
404                               size_t max_size,
405                               size_t *act_size );
406 
407 /*--------------------------------------------------------------------------
408  * Rejected
409  */
410 
411 /* AddRef
412  * Release
413  */
414 rc_t CC RejectedAddRef ( const Rejected *self );
415 rc_t CC RejectedRelease ( const Rejected *self );
416 
417 /* GetError
418  *  "text" [ OUT ] - NUL-terminated error message, held internally
419  *  "line" [ OUT ] - 1-based line # in the source (0 for binary formats)
420  *  "column" [ OUT ] - 1-based column # in the source (offset from the start of the file for binary formats)
421  *  "fatal" [ OUT ] - no further parsing should be done (likely an unsupported format)
422  */
423 rc_t CC RejectedGetError( const Rejected* self, const char** text, uint64_t* line, uint64_t* column, bool* fatal );
424 
425 /* GetData
426  *  "data" [ OUT ] - raw input representing the rejected record. held internally
427  *  "length" [ OUT ] - size of the data buffer
428  */
429 rc_t CC RejectedGetData( const Rejected* self, const void** text, size_t* length );
430 
431 /*--------------------------------------------------------------------------
432  * ReferenceInfo
433  */
/* ReferenceSequence
 *  description of one reference sequence,
 *  as filled in by ReferenceInfoGetRefSeq
 */
typedef struct ReferenceSequence ReferenceSequence;
struct ReferenceSequence
{
    uint64_t       length;
    const char    *name;     /* not null, unique */
    const uint8_t *checksum; /* NOTE(review): format and size not documented here */
};
440 
/* ReadGroup
 *  description of one read group, as filled in by
 *  ReferenceInfoGetReadGroup and ReferenceInfoGetReadGroupByName
 */
typedef struct ReadGroup ReadGroup;
struct ReadGroup
{
    const char *name;     /* not null, unique; accession, e.g. SRR001138 */
    const char *platform; /* e.g. ILLUMINA */
};
446 
447 rc_t CC ReferenceInfoAddRef ( const ReferenceInfo *self );
448 rc_t CC ReferenceInfoRelease ( const ReferenceInfo *self );
449 
450 /* GetRefSeqCount
451  *  get the number of Reference Sequences refered to in the header
452  *  this is not necessarily the number of Reference Sequences referenced
453  *  by the alignments
454  */
455 rc_t CC ReferenceInfoGetRefSeqCount ( const ReferenceInfo *self, uint32_t* count );
456 
457 /* GetRefSeq
458  *  get the n'th Ref. Seq. where n is [0..RefSeqCount)
459  *  the result is populated with pointers that are good for precisely at long as the ReferenceInfo exists.
460  */
461 rc_t CC ReferenceInfoGetRefSeq ( const ReferenceInfo *self, uint32_t n, ReferenceSequence *result );
462 
463 /* GetReadGroupCount
464  *  get the number of Read Groups (accessions, etc.) refered to in the header
465  *  this is not necessarily the number of Read Groups referenced
466  *  by the alignments
467  */
468 rc_t CC ReferenceInfoGetReadGroupCount ( const ReferenceInfo *self, uint32_t *count );
469 
470 /* GetReadGroup
471  *  get the n'th Read Group where n is [0..ReadGroupCount)
472  *  the result is populated with pointers that are good for precisely at long as the ReferenceInfo exists.
473  */
474 rc_t CC ReferenceInfoGetReadGroup ( const ReferenceInfo *self, unsigned n, ReadGroup *result );
475 
476 /* GetReadGroupByName
477  *  get a Read Group by its name
478  *  the result is populated with pointers that are good for precisely at long as the ReferenceInfo exists.
479  */
480 rc_t CC ReferenceInfoGetReadGroupByName ( const ReferenceInfo *self, const char *name, ReadGroup *result );
481 
482 #ifdef __cplusplus
483 }
484 #endif
485 
486 #endif /* _h_common_reader_ */
487