// re2c $INPUT -o $OUTPUT -fg
/*
 *  A push-model scanner example for re2c -f
 *  Written Mon Apr 11 2005 by mgix@mgix.com
 *  This file is in the public domain.
 *
 *  The program reads its input (a file named on the command line, or
 *  descriptor 0 otherwise) in fixed-size batches and pushes each batch
 *  into a PushScanner, which prints every token it recognizes.
 */

// ----------------------------------------------------------------------

#include <fcntl.h>
#include <stdio.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>

#if defined(WIN32)

    typedef signed char     int8_t;
    typedef signed short    int16_t;
    typedef signed int      int32_t;

    typedef unsigned char   uint8_t;
    typedef unsigned short  uint16_t;
    typedef unsigned int    uint32_t;

#else

    #include <stdint.h>
    #include <unistd.h>

    #ifndef O_BINARY
        #define O_BINARY 0
    #endif

#endif

// ----------------------------------------------------------------------
// X-macro listing every token kind. It is expanded twice: once to build
// the tokenNames[] string table and once to build PushScanner::Token, so
// the two always stay in the same order.
#define TOKENS              \
                            \
    TOK(kEOF)               \
    TOK(kEOL)               \
    TOK(kUnknown)           \
    TOK(kIdentifier)        \
    TOK(kDecimalConstant)   \
                            \
    TOK(kEqual)             \
    TOK(kLeftParen)         \
    TOK(kRightParen)        \
    TOK(kMinus)             \
    TOK(kPlus)              \
    TOK(kStar)              \
    TOK(kSlash)             \
                            \
    TOK(kIf)                \
    TOK(kFor)               \
    TOK(kElse)              \
    TOK(kGoto)              \
    TOK(kBreak)             \
    TOK(kWhile)             \
    TOK(kReturn)            \


// ----------------------------------------------------------------------
// Printable name of each token, indexed by PushScanner::Token value.
static const char *tokenNames[] =
{
    #define TOK(x) #x,
        TOKENS
    #undef TOK
};

// ----------------------------------------------------------------------
// Push-model scanner: the caller feeds input in batches through push();
// the scanner keeps its position and its re2c state between calls and
// reports each recognized token through send().
class PushScanner
{
public:

    enum Token
    {
        #define TOK(x) x,
            TOKENS
        #undef TOK
    };

private:

    bool        eof;        // set once a batch shorter than maxFill was pushed
    int32_t     state;      // saved re2c state (YYGETSTATE / YYSETSTATE)

    uint8_t     *limit;     // one past the last valid byte in buffer
    uint8_t     *start;     // first byte of the token being scanned
    uint8_t     *cursor;    // YYCURSOR
    uint8_t     *marker;    // YYMARKER

    uint8_t     *buffer;    // heap block holding the unconsumed input
    uint8_t     *bufferEnd; // one past the end of the allocation

    uint8_t     yych;       // kept as members so they survive across push() calls
    uint32_t    yyaccept;

    // Not copyable: the scanner owns `buffer` and frees it in the
    // destructor, so a member-wise copy would free it twice.
    // Declared but intentionally not defined.
    PushScanner(const PushScanner &);
    PushScanner &operator=(const PushScanner &);

public:

    // ----------------------------------------------------------------------
    // Creates an empty scanner: no buffer yet, state -1 (nothing saved).
    PushScanner()
    {
        limit = 0;
        start = 0;
        state = -1;
        cursor = 0;
        marker = 0;
        buffer = 0;
        eof = false;
        bufferEnd = 0;
        yych = 0;
        yyaccept = 0;
    }

    // ----------------------------------------------------------------------
    // Releases the input buffer grown by push().
    ~PushScanner()
    {
        free(buffer);
    }

    // ----------------------------------------------------------------------
    // Prints one token to stdout: its numeric type, its name and, for
    // everything except kEOF, the matched text [start, cursor).
    void send(
        Token token
    )
    {
        size_t tokenSize = cursor-start;
        const char *tokenName = tokenNames[token];
        printf(
            "scanner is pushing out a token of type %d (%s)",
            (int)token,
            tokenName
        );

        if(token==kEOF) putchar('\n');
        else
        {
            // Pad the name to 20 columns so the token texts line up.
            size_t tokenNameSize = strlen(tokenName);
            size_t padSize = 20-(20<tokenNameSize ? 20 : tokenNameSize);
            for(size_t i=0; i<padSize; ++i) putchar(' ');
            printf(" : ---->");

            fwrite(
                start,
                tokenSize,
                1,
                stdout
            );

            printf("<----\n");
        }
    }

    // ----------------------------------------------------------------------
    // Feeds one batch of input to the scanner.
    //
    //   input      bytes to append; may be null when inputSize <= 0
    //   inputSize  number of bytes at input. A size smaller than maxFill
    //              (including 0 or a negative value) signals end of input.
    //
    // Returns 0 when the scanner consumed what it could and needs another
    // batch, 1 once the kEOF token has been sent.
    // Exits the process with status 1 if the buffer cannot be grown.
    uint32_t push(
        const void  *input,
        ssize_t     inputSize
    )
    {
        printf(
            "scanner is receiving a new data batch of length %ld\n"
            "scanner continues with saved state = %d\n",
            (long)inputSize,
            (int)state
        );

        /*
         * Data source is signaling end of file when batch size
         * is less than maxFill. This is slightly annoying because
         * maxFill is a value that can only be known after re2c does
         * its thing. Practically though, maxFill is never bigger than
         * the longest keyword, so given our grammar, 32 is a safe bet.
         *
         * The final short batch may still carry real data, so it is kept
         * and followed by eofPadding zero bytes; the first zero byte
         * matches the `eof` rule and the rest satisfies any YYFILL.
         */
        enum { maxFill = 32, eofPadding = 64 };
        uint8_t tail[maxFill+eofPadding];
        if(inputSize<maxFill)
        {
            size_t kept = 0;
            memset(tail, 0, sizeof(tail));
            if(input!=0 && 0<inputSize)
            {
                kept = (size_t)inputSize;
                memcpy(tail, input, kept);
            }

            eof = true;
            input = tail;
            inputSize = (ssize_t)(kept+eofPadding);
        }

        /*
         * When we get here, we have a partially
         * consumed buffer which is in the following state:
         *                                                                last valid char        last valid buffer spot
         *                                                                v                      v
         * +-------------------+-------------+---------------+-------------+----------------------+
         * ^                   ^             ^               ^             ^                      ^
         * buffer              start         marker          cursor        limit                  bufferEnd
         *
         * We need to stretch the buffer and concatenate the new chunk of input to it
         *
         */
        size_t used = limit-buffer;
        size_t needed = used+(size_t)inputSize;
        size_t allocated = bufferEnd-buffer;
        if(allocated<needed)
        {
            // Remember every pointer as an offset: realloc may move the block.
            size_t limitOffset = limit-buffer;
            size_t startOffset = start-buffer;
            size_t markerOffset = marker-buffer;
            size_t cursorOffset = cursor-buffer;

            // Keep the old pointer until the new one is known to be valid.
            uint8_t *grown = (uint8_t*)realloc(buffer, needed);
            if(grown==0)
            {
                fprintf(
                    stderr,
                    "scanner could not allocate %lu bytes\n",
                    (unsigned long)needed
                );
                exit(1);
            }
            buffer = grown;
            bufferEnd = needed+buffer;

            marker = markerOffset + buffer;
            cursor = cursorOffset + buffer;
            start = buffer + startOffset;
            limit = limitOffset + buffer;
        }
        memcpy(limit, input, (size_t)inputSize);
        limit += inputSize;

        // The scanner starts here
        #define YYLIMIT         limit
        #define YYCURSOR        cursor
        #define YYMARKER        marker
        #define YYCTYPE         uint8_t

        #define SKIP(x)         { start = cursor; goto yy0; }
        #define SEND(x)         { send(x); SKIP();          }
        #define YYFILL(n)       { goto fill;                }

        #define YYGETSTATE()    state
        #define YYSETSTATE(x)   { state = (x);  }

    start:

        /*!re2c
        re2c:startlabel = 1;
        eol = "\n";
        eof = "\000";
        digit = [0-9];
        integer = digit+;
        alpha = [A-Za-z_];
        any = [\000-\377];
        space = [ \h\t\v\f\r];

        "if"                    { SEND(kIf);             }
        "for"                   { SEND(kFor);            }
        "else"                  { SEND(kElse);           }
        "goto"                  { SEND(kGoto);           }
        "break"                 { SEND(kBreak);          }
        "while"                 { SEND(kWhile);          }
        "return"                { SEND(kReturn);         }
        alpha (alpha|digit)*    { SEND(kIdentifier);     }
        integer                 { SEND(kDecimalConstant);}

        "="                     { SEND(kEqual);          }
        "("                     { SEND(kLeftParen);      }
        ")"                     { SEND(kRightParen);     }
        "-"                     { SEND(kMinus);          }
        "+"                     { SEND(kPlus);           }
        "*"                     { SEND(kStar);           }
        "/"                     { SEND(kSlash);          }

        eol                     { SKIP();                }
        space                   { SKIP();                }
        eof                     { send(kEOF); return 1;  }
        any                     { SEND(kUnknown);        }
        */

    fill:
        ssize_t unfinishedSize = cursor-start;
        printf(
            "scanner needs a refill. Exiting for now with:\n"
            "    saved fill state = %d\n"
            "    unfinished token size = %ld\n",
            (int)state,
            (long)unfinishedSize
        );

        if(0<unfinishedSize && start<limit)
        {
            printf("    unfinished token is :");
            fwrite(start, 1, cursor-start, stdout);
            putchar('\n');
        }
        putchar('\n');

        /*
         * Once we get here, we can get rid of
         * everything before start and after limit.
         */
        // At end of input no further batch will arrive, so resume scanning
        // on the zero padding instead of returning to the caller.
        if(eof==true) goto start;
        if(buffer<start)
        {
            size_t startOffset = start-buffer;
            memmove(buffer, start, limit-start);
            marker -= startOffset;
            cursor -= startOffset;
            limit -= startOffset;
            start -= startOffset;
        }
        return 0;
    }
};

// ----------------------------------------------------------------------
// Reads from fd until `capacity` bytes are stored in dest, end of file is
// reached, or read() fails. A single read() may legitimately return fewer
// bytes than requested (pipes, terminals), so a short count from this
// helper means end of file or an error, never just a short read.
// Returns the number of bytes stored; *failed is set when read() failed.
static size_t readBatch(
    int     fd,
    uint8_t *dest,
    size_t  capacity,
    bool    *failed
)
{
    size_t total = 0;
    *failed = false;
    while(total<capacity)
    {
        ssize_t n = read(fd, dest+total, capacity-total);
        if(n<0)
        {
            *failed = true;
            break;
        }
        if(n==0) break;
        total += (size_t)n;
    }
    return total;
}

// ----------------------------------------------------------------------
// Tokenizes the file named by argv[1], or descriptor 0 when no argument
// is given. Returns 0 on success and 1 when the input could not be read;
// exits with status 1 when the file cannot be opened.
int main(
    int     argc,
    char    **argv
)
{
    // Parse cmd line
    int input = 0;
    bool opened = false;
    if(1<argc)
    {
        input = open(argv[1], O_RDONLY | O_BINARY);
        if(input<0)
        {
            fprintf(
                stderr,
                "could not open file %s\n",
                argv[1]
            );
            exit(1);
        }
        opened = true;
    }

    /*
     * Tokenize input file by pushing batches
     * of data one by one into the scanner.
     */
    const size_t batchSize = 256;
    uint8_t buffer[batchSize];
    PushScanner scanner;
    bool finished = false;
    int status = 0;
    while(1)
    {
        bool failed = false;
        size_t n = readBatch(input, buffer, batchSize, &failed);
        if(failed)
        {
            fprintf(stderr, "could not read input\n");
            status = 1;
        }

        // Whatever was read is still pushed so that no token is lost.
        finished = (scanner.push(buffer, (ssize_t)n)!=0);
        if(finished || failed || n<batchSize) break;
    }

    // Signal end of input, unless the last batch was short enough for the
    // scanner to have already treated it as the end and sent kEOF.
    if(!finished) scanner.push(0, -1);

    // Only close what this program opened.
    if(opened) close(input);

    // Done
    return status;
}
