1// re2c $INPUT -o $OUTPUT -fg
2/*
3 *  A push-model scanner example for re2c -f
4 *  Written Mon Apr 11 2005 by mgix@mgix.com
5 *  This file is in the public domain.
6 *
7 */
8
9// ----------------------------------------------------------------------
10
11#include <fcntl.h>
12#include <stdio.h>
13#include <stddef.h>
14#include <stdlib.h>
15#include <string.h>
16
17#if defined(WIN32)
18
19    typedef signed char     int8_t;
20    typedef signed short    int16_t;
21    typedef signed int      int32_t;
22
23    typedef unsigned char   uint8_t;
24    typedef unsigned short  uint16_t;
25    typedef unsigned int    uint32_t;
26
27#else
28
29    #include <stdint.h>
30    #include <unistd.h>
31
32    #ifndef O_BINARY
33        #define O_BINARY 0
34    #endif
35
36#endif
37
38// ----------------------------------------------------------------------
/*
 * Master list of token kinds (an X-macro).
 *
 * Define TOK(x) before expanding TOKENS to generate one item per token
 * kind, then #undef it again. The list is expanded once below to build
 * the printable name table; any other expansion that uses the same list
 * yields items in the same order, so positions match tokenNames[].
 */
#define TOKENS              \
                            \
    TOK(kEOF)               \
    TOK(kEOL)               \
    TOK(kUnknown)           \
    TOK(kIdentifier)        \
    TOK(kDecimalConstant)   \
                            \
    TOK(kEqual)             \
    TOK(kLeftParen)         \
    TOK(kRightParen)        \
    TOK(kMinus)             \
    TOK(kPlus)              \
    TOK(kStar)              \
    TOK(kSlash)             \
                            \
    TOK(kIf)                \
    TOK(kFor)               \
    TOK(kElse)              \
    TOK(kGoto)              \
    TOK(kBreak)             \
    TOK(kWhile)             \
    TOK(kReturn)

// ----------------------------------------------------------------------
/*
 * Printable name of every token kind, in TOKENS order (each entry is the
 * stringified TOK argument). The table is read-only: both the strings and
 * the pointers to them are const.
 */
static const char *const tokenNames[] =
{
    #define TOK(x) #x,
        TOKENS
    #undef TOK
};
71
72// ----------------------------------------------------------------------
73class PushScanner
74{
75public:
76
77    enum Token
78    {
79        #define TOK(x) x,
80            TOKENS
81        #undef TOK
82    };
83
84private:
85
86    bool        eof;
87    int32_t     state;
88
89    uint8_t     *limit;
90    uint8_t     *start;
91    uint8_t     *cursor;
92    uint8_t     *marker;
93
94    uint8_t     *buffer;
95    uint8_t     *bufferEnd;
96
97    uint8_t     yych;
98    uint32_t    yyaccept;
99
100public:
101
102    // ----------------------------------------------------------------------
103    PushScanner()
104    {
105        limit = 0;
106        start = 0;
107        state = -1;
108        cursor = 0;
109        marker = 0;
110        buffer = 0;
111        eof = false;
112        bufferEnd = 0;
113    }
114
115    // ----------------------------------------------------------------------
116    ~PushScanner()
117    {
118    }
119
120    // ----------------------------------------------------------------------
121    void send(
122        Token token
123    )
124    {
125        size_t tokenSize = cursor-start;
126        const char *tokenName = tokenNames[token];
127        printf(
128            "scanner is pushing out a token of type %d (%s)",
129            token,
130            tokenName
131        );
132
133        if(token==kEOF) putchar('\n');
134        else
135        {
136            size_t tokenNameSize = strlen(tokenNames[token]);
137            size_t padSize = 20-(20<tokenNameSize ? 20 : tokenNameSize);
138            for(size_t i=0; i<padSize; ++i) putchar(' ');
139            printf(" : ---->");
140
141            fwrite(
142                start,
143                tokenSize,
144                1,
145                stdout
146            );
147
148            printf("<----\n");
149        }
150    }
151
152    // ----------------------------------------------------------------------
153    uint32_t push(
154        const void  *input,
155        ssize_t     inputSize
156    )
157    {
158        printf(
159            "scanner is receiving a new data batch of length %d\n"
160            "scanner continues with saved state = %d\n",
161            inputSize,
162            state
163        );
164
165        /*
166         * Data source is signaling end of file when batch size
167         * is less than maxFill. This is slightly annoying because
168         * maxFill is a value that can only be known after re2c does
169         * its thing. Practically though, maxFill is never bigger than
170         * the longest keyword, so given our grammar, 32 is a safe bet.
171         */
172        uint8_t null[64];
173        const ssize_t maxFill = 32;
174        if(inputSize<maxFill)
175        {
176            eof = true;
177            input = null;
178            inputSize = sizeof(null);
179            memset(null, 0, sizeof(null));
180        }
181
182        /*
183         * When we get here, we have a partially
184         * consumed buffer which is in the following state:
185         *                                                                last valid char        last valid buffer spot
186         *                                                                v                      v
187         * +-------------------+-------------+---------------+-------------+----------------------+
188         * ^                   ^             ^               ^             ^                      ^
189         * buffer              start         marker          cursor        limit                  bufferEnd
190         *
191         * We need to stretch the buffer and concatenate the new chunk of input to it
192         *
193         */
194        size_t used = limit-buffer;
195        size_t needed = used+inputSize;
196        size_t allocated = bufferEnd-buffer;
197        if(allocated<needed)
198        {
199            size_t limitOffset = limit-buffer;
200            size_t startOffset = start-buffer;
201            size_t markerOffset = marker-buffer;
202            size_t cursorOffset = cursor-buffer;
203
204                buffer = (uint8_t*)realloc(buffer, needed);
205                bufferEnd = needed+buffer;
206
207            marker = markerOffset + buffer;
208            cursor = cursorOffset + buffer;
209            start = buffer + startOffset;
210            limit = limitOffset + buffer;
211        }
212        memcpy(limit, input, inputSize);
213        limit += inputSize;
214
215        // The scanner starts here
216        #define YYLIMIT         limit
217        #define YYCURSOR        cursor
218        #define YYMARKER        marker
219        #define YYCTYPE         uint8_t
220
221        #define SKIP(x)         { start = cursor; goto yy0; }
222        #define SEND(x)         { send(x); SKIP();          }
223        #define YYFILL(n)       { goto fill;                }
224
225        #define YYGETSTATE()    state
226        #define YYSETSTATE(x)   { state = (x);  }
227
228    start:
229
230        /*!re2c
231        	re2c:startlabel = 1;
232            eol = "\n";
233            eof = "\000";
234            digit = [0-9];
235            integer = digit+;
236            alpha = [A-Za-z_];
237            any = [\000-\377];
238            space = [ \h\t\v\f\r];
239
240            "if"                    { SEND(kIf);             }
241            "for"                   { SEND(kFor);            }
242            "else"                  { SEND(kElse);           }
243            "goto"                  { SEND(kGoto);           }
244            "break"                 { SEND(kBreak);          }
245            "while"                 { SEND(kWhile);          }
246            "return"                { SEND(kReturn);         }
247            alpha (alpha|digit)*    { SEND(kIdentifier);     }
248            integer                 { SEND(kDecimalConstant);}
249
250            "="                     { SEND(kEqual);          }
251            "("                     { SEND(kLeftParen);      }
252            ")"                     { SEND(kRightParen);     }
253            "-"                     { SEND(kMinus);          }
254            "+"                     { SEND(kPlus);           }
255            "*"                     { SEND(kStar);           }
256            "/"                     { SEND(kSlash);          }
257
258            eol                     { SKIP();                }
259            space                   { SKIP();                }
260            eof                     { send(kEOF); return 1;  }
261            any                     { SEND(kUnknown);        }
262        */
263
264    fill:
265        ssize_t unfinishedSize = cursor-start;
266        printf(
267            "scanner needs a refill. Exiting for now with:\n"
268            "    saved fill state = %d\n"
269            "    unfinished token size = %d\n",
270            state,
271            unfinishedSize
272        );
273
274        if(0<unfinishedSize && start<limit)
275        {
276            printf("    unfinished token is :");
277            fwrite(start, 1, cursor-start, stdout);
278            putchar('\n');
279        }
280        putchar('\n');
281
282        /*
283         * Once we get here, we can get rid of
284         * everything before start and after limit.
285         */
286        if(eof==true) goto start;
287        if(buffer<start)
288        {
289            size_t startOffset = start-buffer;
290            memmove(buffer, start, limit-start);
291            marker -= startOffset;
292            cursor -= startOffset;
293            limit -= startOffset;
294            start -= startOffset;
295        }
296        return 0;
297    }
298};
299
300// ----------------------------------------------------------------------
301int main(
302    int     argc,
303    char    **argv
304)
305{
306    // Parse cmd line
307    int input = 0;
308    if(1<argc)
309    {
310        input = open(argv[1], O_RDONLY | O_BINARY);
311        if(input<0)
312        {
313            fprintf(
314                stderr,
315                "could not open file %s\n",
316                argv[1]
317            );
318            exit(1);
319        }
320    }
321
322    /*
323     * Tokenize input file by pushing batches
324     * of data one by one into the scanner.
325     */
326    const size_t batchSize = 256;
327    uint8_t buffer[batchSize];
328    PushScanner scanner;
329    while(1)
330    {
331        ssize_t n = read(input, buffer, batchSize);
332        scanner.push(buffer, n);
333        if(n<batchSize) break;
334    }
335    scanner.push(0, -1);
336    close(input);
337
338    // Done
339    return 0;
340}
341
342