1 /*=========================================================================== 2 * 3 * PUBLIC DOMAIN NOTICE 4 * National Center for Biotechnology Information 5 * 6 * This software/database is a "United States Government Work" under the 7 * terms of the United States Copyright Act. It was written as part of 8 * the author's official duties as a United States Government employee and 9 * thus cannot be copyrighted. This software/database is freely available 10 * to the public for use. The National Library of Medicine and the U.S. 11 * Government have not placed any restriction on its use or reproduction. 12 * 13 * Although all reasonable efforts have been taken to ensure the accuracy 14 * and reliability of the software and data, the NLM and the U.S. 15 * Government do not and cannot warrant the performance or results that 16 * may be obtained by using this software or data. The NLM and the U.S. 17 * Government disclaim all warranties, express or implied, including 18 * warranties of performance, merchantability or fitness for any particular 19 * purpose. 20 * 21 * Please cite the author in any work or product based on this material. 22 * 23 * =========================================================================== 24 * 25 */ 26 27 #ifndef _h_fastq_parse_ 28 #define _h_fastq_parse_ 29 30 #include <align/extern.h> 31 #include <klib/text.h> 32 33 #include <loader/common-reader-priv.h> 34 35 #ifdef __cplusplus 36 extern "C" { 37 #endif 38 39 enum FASTQQualityFormat 40 { 41 FASTQunknown, 42 FASTQphred33, 43 FASTQphred64, 44 FASTQlogodds 45 }; 46 47 struct FastqSequence 48 { 49 Sequence_vt sequence_vt; 50 KRefcount refcount; 51 52 /* tagline components: */ 53 String spotname; /* tag line up to and including coordinates */ 54 String spotgroup; /* token following '#' */ 55 uint8_t readnumber; /* token following '/' 1 - IsFirst, 2 - IsSecond, 0 - dont know */ 56 57 /* not populated at this time: */ 58 #if 0 59 String rungroup; 60 String fmt_name; /* x and y replaced with $X and $Y */ 61 uint8_t coord_num; 62 int32_t coords[16]; 63 #endif 64 65 /* read bases */ 66 String read; 67 68 bool is_colorspace; 69 70 String quality; 71 uint8_t qualityFormat; 72 uint8_t qualityAsciiOffset; 73 74 bool lowQuality; 75 }; 76 77 struct FastqRecord 78 { 79 Record dad; 80 81 KDataBuffer source; 82 struct FastqSequence seq; 83 Rejected* rej; 84 }; 85 86 typedef struct FASTQToken 87 { 88 size_t tokenStart; /* offset into FASTQParseBlock.record->source */ 89 size_t tokenLength; 90 size_t line_no; 91 size_t column_no; 92 } FASTQToken; 93 94 /* obtain a pointer to the token's text */ 95 #define TokenTextPtr(pb, token) ((const char*)((pb)->record->source.base) + (token)->tokenStart) 96 97 typedef struct FASTQParseBlock 98 { 99 void* self; 100 size_t (CC *input)(struct FASTQParseBlock* sb, char* buf, size_t max_size); 101 102 /* inputs for the parser */ 103 size_t expectedQualityLines; 104 uint8_t qualityFormat; /* see enum FASTQQualityFormat above */ 105 int8_t defaultReadNumber; /* -1: never assign read numbers */ 106 107 /* Secondary (>1) read number observed previously (usually 2, sometimes 3). 108 Once one is seen, do not allow any other values in the same input file. 109 0 = has not seen one yet in this input. 110 Always record as 2 */ 111 uint8_t secondaryReadNumber; 112 113 bool ignoreSpotGroups; 114 115 /* temporaries and outputs for the parser */ 116 void* scanner; 117 size_t length; /* input characters consumed for the current record */ 118 FASTQToken* lastToken; 119 struct FastqRecord* record; 120 size_t column; 121 122 /* all offsets are into record->source */ 123 size_t spotNameOffset; 124 size_t spotNameLength; 125 size_t spotNameOffset_saved; /* sometimes needed to revert to older values */ 126 size_t spotNameLength_saved; 127 bool spotNameDone; 128 129 size_t spotGroupOffset; 130 size_t spotGroupLength; 131 132 size_t readOffset; 133 size_t readLength; 134 135 size_t qualityOffset; 136 size_t qualityLength; 137 uint8_t qualityAsciiOffset; 138 139 bool fatalError; 140 } FASTQParseBlock; 141 142 extern rc_t FASTQScan_yylex_init(FASTQParseBlock* context, bool debug); 143 extern void FASTQScan_yylex_destroy(FASTQParseBlock* context); 144 145 /* explicit FLEX state control for bison*/ 146 extern void FASTQScan_inline_sequence(FASTQParseBlock* pb); 147 extern void FASTQScan_inline_quality(FASTQParseBlock* pb); 148 extern void FASTQScan_skip_to_eol(FASTQParseBlock* pb); /*the next token will be EOL or EOF*/ 149 150 extern void FASTQ_set_lineno (int line_number, void* scanner); 151 152 extern int FASTQ_lex(FASTQToken* tok, FASTQParseBlock * pb); 153 extern void FASTQ_unlex(FASTQParseBlock* pb, FASTQToken* token); 154 extern void FASTQ_qualityContext(FASTQParseBlock* pb); 155 156 extern int FASTQ_debug; /* set to 1 to print Bison trace */ 157 158 extern int FASTQ_parse(FASTQParseBlock* pb); /* 0 = end of input, 1 = success, a new record is in context->record, 2 - syntax error */ 159 160 /* call before parsing every record (FASTQ_parse does so internally; this is for testing the parser) */ 161 extern void FASTQ_ParseBlockInit(FASTQParseBlock* pb); 162 163 extern void FASTQ_error(FASTQParseBlock* pb, const char* msg); 164 165 #ifdef __cplusplus 166 } 167 #endif 168 169 #endif /* _h_fastq_parse_ */ 170