1 /*===========================================================================
2  *
3  *                            PUBLIC DOMAIN NOTICE
4  *               National Center for Biotechnology Information
5  *
6  *  This software/database is a "United States Government Work" under the
7  *  terms of the United States Copyright Act.  It was written as part of
8  *  the author's official duties as a United States Government employee and
9  *  thus cannot be copyrighted.  This software/database is freely available
10  *  to the public for use. The National Library of Medicine and the U.S.
11  *  Government have not placed any restriction on its use or reproduction.
12  *
13  *  Although all reasonable efforts have been taken to ensure the accuracy
14  *  and reliability of the software and data, the NLM and the U.S.
15  *  Government do not and cannot warrant the performance or results that
16  *  may be obtained by using this software or data. The NLM and the U.S.
17  *  Government disclaim all warranties, express or implied, including
18  *  warranties of performance, merchantability or fitness for any particular
19  *  purpose.
20  *
21  *  Please cite the author in any work or product based on this material.
22  *
23  * ===========================================================================
24  *
25  */
26 
27 #ifndef _h_fastq_parse_
28 #define _h_fastq_parse_
29 
30 #include <align/extern.h>
31 #include <klib/text.h>
32 
33 #include <loader/common-reader-priv.h>
34 
35 #ifdef __cplusplus
36 extern "C" {
37 #endif
38 
/*
 * Quality format identifiers.
 * The numeric values are spelled out here; they are the same 0..3 sequence
 * the compiler assigns to an unadorned enumerator list.
 */
enum FASTQQualityFormat
{
    FASTQunknown = 0,
    FASTQphred33 = 1,
    FASTQphred64 = 2,
    FASTQlogodds = 3
};
46 
47 struct FastqSequence
48 {
49     Sequence_vt sequence_vt;
50     KRefcount   refcount;
51 
52     /* tagline components: */
53     String spotname; /* tag line up to and including coordinates */
54     String spotgroup; /* token following '#' */
55     uint8_t readnumber; /* token following '/' 1 - IsFirst, 2 - IsSecond, 0 - dont know */
56 
57     /* not populated at this time: */
58 #if 0
59     String rungroup;
60     String fmt_name; /* x and y replaced with $X and $Y */
61     uint8_t coord_num;
62     int32_t coords[16];
63 #endif
64 
65     /* read bases */
66     String read;
67 
68     bool is_colorspace;
69 
70     String  quality;
71     uint8_t qualityFormat;
72     uint8_t qualityAsciiOffset;
73 
74     bool lowQuality;
75 };
76 
77 struct FastqRecord
78 {
79     Record  dad;
80 
81     KDataBuffer source;
82     struct FastqSequence    seq;
83     Rejected*               rej;
84 };
85 
/* A token, described by where its text sits in the record's source buffer. */
typedef struct FASTQToken FASTQToken;

struct FASTQToken
{
    size_t tokenStart;   /* offset into FASTQParseBlock.record->source */
    size_t tokenLength;  /* length of the token's text */
    size_t line_no;      /* line number recorded for the token */
    size_t column_no;    /* column number recorded for the token */
};
93 
/*
 * Yields a const char* to the first character of a token's text.
 *   pb    - pointer to the parse block; pb->record->source.base is taken
 *           as the start of the text buffer
 *   token - pointer to the token; token->tokenStart is the offset applied
 * Each argument is evaluated exactly once.
 */
#define TokenTextPtr(pb, token) \
    (&((const char*)((pb)->record->source.base))[(token)->tokenStart])
96 
97 typedef struct FASTQParseBlock
98 {
99     void* self;
100     size_t (CC *input)(struct FASTQParseBlock* sb, char* buf, size_t max_size);
101 
102     /* inputs for the parser */
103     size_t  expectedQualityLines;
104     uint8_t qualityFormat; /* see enum FASTQQualityFormat above */
105     int8_t  defaultReadNumber; /* -1: never assign read numbers */
106 
107     /*  Secondary (>1) read number observed previously (usually 2, sometimes 3).
108         Once one is seen, do not allow any other values in the same input file.
109         0 = has not seen one yet in this input.
110         Always record as 2 */
111     uint8_t secondaryReadNumber;
112 
113     bool ignoreSpotGroups;
114 
115     /* temporaries and outputs for the parser */
116     void* scanner;
117     size_t length; /* input characters consumed for the current record */
118     FASTQToken* lastToken;
119     struct FastqRecord* record;
120     size_t column;
121 
122     /* all offsets are into record->source */
123     size_t spotNameOffset;
124     size_t spotNameLength;
125     size_t spotNameOffset_saved; /* sometimes needed to revert to older values */
126     size_t spotNameLength_saved;
127     bool spotNameDone;
128 
129     size_t spotGroupOffset;
130     size_t spotGroupLength;
131 
132     size_t readOffset;
133     size_t readLength;
134 
135     size_t qualityOffset;
136     size_t qualityLength;
137     uint8_t qualityAsciiOffset;
138 
139     bool fatalError;
140 } FASTQParseBlock;
141 
142 extern rc_t FASTQScan_yylex_init(FASTQParseBlock* context, bool debug);
143 extern void FASTQScan_yylex_destroy(FASTQParseBlock* context);
144 
145 /* explicit FLEX state control for bison*/
146 extern void FASTQScan_inline_sequence(FASTQParseBlock* pb);
147 extern void FASTQScan_inline_quality(FASTQParseBlock* pb);
148 extern void FASTQScan_skip_to_eol(FASTQParseBlock* pb); /*the next token will be EOL or EOF*/
149 
150 extern void FASTQ_set_lineno (int line_number, void* scanner);
151 
152 extern int FASTQ_lex(FASTQToken* tok, FASTQParseBlock * pb);
153 extern void FASTQ_unlex(FASTQParseBlock* pb, FASTQToken* token);
154 extern void FASTQ_qualityContext(FASTQParseBlock* pb);
155 
156 extern int FASTQ_debug; /* set to 1 to print Bison trace */
157 
158 extern int FASTQ_parse(FASTQParseBlock* pb); /* 0 = end of input, 1 = success, a new record is in context->record, 2 - syntax error */
159 
160 /* call before parsing every record (FASTQ_parse does so internally; this is for testing the parser) */
161 extern void FASTQ_ParseBlockInit(FASTQParseBlock* pb);
162 
163 extern void FASTQ_error(FASTQParseBlock* pb, const char* msg);
164 
165 #ifdef __cplusplus
166 }
167 #endif
168 
169 #endif /* _h_fastq_parse_ */
170