1 // stb_c_lexer.h - v0.10 - public domain Sean Barrett 2013
2 // lexer for making little C-like languages with recursive-descent parsers
3 //
4 // This file provides both the interface and the implementation.
5 // To instantiate the implementation,
6 //      #define STB_C_LEXER_IMPLEMENTATION
7 // in *ONE* source file, before #including this file.
8 //
9 // The default configuration is fairly close to a C lexer, although
10 // suffixes on integer constants are not handled (you can override this).
11 //
12 // History:
13 //     0.10 fix warnings
14 //     0.09 hex floats, no-stdlib fixes
15 //     0.08 fix bad pointer comparison
16 //     0.07 fix mishandling of hexadecimal constants parsed by strtol
17 //     0.06 fix missing next character after ending quote mark (Andreas Fredriksson)
18 //     0.05 refixed get_location because github version had lost the fix
19 //     0.04 fix octal parsing bug
20 //     0.03 added STB_C_LEX_DISCARD_PREPROCESSOR option
21 //          refactor API to simplify (only one struct instead of two)
22 //          change literal enum names to have 'lit' at the end
23 //     0.02 first public release
24 //
25 // Status:
26 //     - haven't tested compiling as C++
27 //     - haven't tested the float parsing path
28 //     - haven't tested the non-default-config paths (e.g. non-stdlib)
29 //     - only tested default-config paths by eyeballing output of self-parse
30 //
31 //     - haven't implemented multiline strings
32 //     - haven't implemented octal/hex character constants
33 //     - haven't implemented support for unicode CLEX_char
34 //     - need to expand error reporting so you don't just get "CLEX_parse_error"
35 //
36 // Contributors:
37 //   Arpad Goretity (bugfix)
38 //   Alan Hickman (hex floats)
39 //
40 // LICENSE
41 //
42 //   See end of file for license information.
43 
// Default lexer configuration. Every option below is set to either Y (enabled)
// or N (disabled); the implementation section later redefines Y and N as
// function-like macros so the same option names can be used both in #if tests
// and to conditionally emit code.
44 #ifndef STB_C_LEXER_DEFINITIONS
45 // to change the default parsing rules, copy the following lines
46 // into your C/C++ file *before* including this, and then replace
47 // the Y's with N's for the ones you don't want.
48 // --BEGIN--
49 
50 #define STB_C_LEX_C_DECIMAL_INTS    Y   //  "0|[1-9][0-9]*"                        CLEX_intlit
51 #define STB_C_LEX_C_HEX_INTS        Y   //  "0x[0-9a-fA-F]+"                       CLEX_intlit
52 #define STB_C_LEX_C_OCTAL_INTS      Y   //  "[0-7]+"                               CLEX_intlit
53 #define STB_C_LEX_C_DECIMAL_FLOATS  Y   //  "[0-9]*(.[0-9]*([eE][-+]?[0-9]+)?)"    CLEX_floatlit
54 #define STB_C_LEX_C99_HEX_FLOATS    N   //  "0x{hex}+(.{hex}*)?[pP][-+]?{hex}+"    CLEX_floatlit
55 #define STB_C_LEX_C_IDENTIFIERS     Y   //  "[_a-zA-Z][_a-zA-Z0-9]*"               CLEX_id
56 #define STB_C_LEX_C_DQ_STRINGS      Y   //  double-quote-delimited strings with escapes  CLEX_dqstring
57 #define STB_C_LEX_C_SQ_STRINGS      N   //  single-quote-delimited strings with escapes  CLEX_sqstring
58 #define STB_C_LEX_C_CHARS           Y   //  single-quote-delimited character with escape CLEX_charlit
59 #define STB_C_LEX_C_COMMENTS        Y   //  "/* comment */"
60 #define STB_C_LEX_CPP_COMMENTS      Y   //  "// comment to end of line\n"
61 #define STB_C_LEX_C_COMPARISONS     Y   //  "==" CLEX_eq  "!=" CLEX_noteq   "<=" CLEX_lesseq  ">=" CLEX_greatereq
62 #define STB_C_LEX_C_LOGICAL         Y   //  "&&"  CLEX_andand   "||"  CLEX_oror
63 #define STB_C_LEX_C_SHIFTS          Y   //  "<<"  CLEX_shl      ">>"  CLEX_shr
64 #define STB_C_LEX_C_INCREMENTS      Y   //  "++"  CLEX_plusplus "--"  CLEX_minusminus
65 #define STB_C_LEX_C_ARROW           Y   //  "->"  CLEX_arrow
66 #define STB_C_LEX_EQUAL_ARROW       N   //  "=>"  CLEX_eqarrow
67 #define STB_C_LEX_C_BITWISEEQ       Y   //  "&="  CLEX_andeq    "|="  CLEX_oreq     "^="  CLEX_xoreq
68 #define STB_C_LEX_C_ARITHEQ         Y   //  "+="  CLEX_pluseq   "-="  CLEX_minuseq
69                                         //  "*="  CLEX_muleq    "/="  CLEX_diveq    "%=" CLEX_modeq
70                                         //  if both STB_C_LEX_C_SHIFTS & STB_C_LEX_C_ARITHEQ:
71                                         //                      "<<=" CLEX_shleq    ">>=" CLEX_shreq
72 
73 #define STB_C_LEX_PARSE_SUFFIXES    N   // letters after numbers are parsed as part of those numbers, and must be in suffix list below
74 #define STB_C_LEX_DECIMAL_SUFFIXES  ""  // decimal integer suffixes e.g. "uUlL" -- these are returned as-is in string storage
75 #define STB_C_LEX_HEX_SUFFIXES      ""  // e.g. "uUlL"
76 #define STB_C_LEX_OCTAL_SUFFIXES    ""  // e.g. "uUlL"
77 #define STB_C_LEX_FLOAT_SUFFIXES    ""  // suffix letters accepted after float literals, e.g. "fFlL"
78 
79 #define STB_C_LEX_0_IS_EOF             N  // if Y, ends parsing at '\0'; if N, returns '\0' as token
80 #define STB_C_LEX_INTEGERS_AS_DOUBLES  N  // parses integers as doubles so they can be larger than 'int', but only if STB_C_LEX_USE_STDLIB==N
// NOTE(review): the two MULTILINE options are not referenced by the
// implementation visible in this file (the Status notes say multiline strings
// are unimplemented) -- confirm before relying on them.
81 #define STB_C_LEX_MULTILINE_DSTRINGS   N  // allow newlines in double-quoted strings
82 #define STB_C_LEX_MULTILINE_SSTRINGS   N  // allow newlines in single-quoted strings
83 #define STB_C_LEX_USE_STDLIB           Y  // use strtod,strtol for parsing #s; otherwise inaccurate hack
84 #define STB_C_LEX_DOLLAR_IDENTIFIER    Y  // allow $ as an identifier character
85 #define STB_C_LEX_FLOAT_NO_DECIMAL     Y  // allow floats that have no decimal point if they have an exponent
86 
87 #define STB_C_LEX_DEFINE_ALL_TOKEN_NAMES  N   // if Y, all CLEX_ token names are defined, even if never returned
88                                               // leaving it as N should help you catch config bugs
89 
90 #define STB_C_LEX_DISCARD_PREPROCESSOR    Y   // discard C-preprocessor directives (e.g. after preprocessing you
91                                               // still have #line, #pragma, etc)
92 
93 //#define STB_C_LEX_ISWHITE(str)    ... // return length in bytes of whitespace characters if first char is whitespace
94 
95 #define STB_C_LEXER_DEFINITIONS         // This line prevents the header file from replacing your definitions
96 // --END--
97 
98 #endif
99 
100 #ifndef INCLUDE_STB_C_LEXER_H
101 #define INCLUDE_STB_C_LEXER_H
102 
// Complete lexer state: the input range, the caller-supplied scratch buffer,
// and the description of the most recently returned token.
103 typedef struct
104 {
105    // lexer variables
//    input_stream / eof are the values passed to stb_c_lexer_init; the lexer
//    compares its read pointer against eof before reading a character.
106    char *input_stream;
107    char *eof;
//    position where the next stb_c_lexer_get_token call starts scanning
108    char *parse_point;
//    caller-owned buffer (and its size in bytes) that receives identifier
//    and string text
109    char *string_storage;
110    int   string_storage_len;
111 
112    // lexer parse location for error messages
//    first and last character (inclusive) of the most recent token
113    char *where_firstchar;
114    char *where_lastchar;
115 
116    // lexer token variables
//    token ID: the character itself for single-character tokens, otherwise
//    one of the CLEX_* enum values
117    long token;
//    value of a float literal
118    double real_number;
//    value of an integer literal or character literal
119    long   int_number;
//    text of an identifier or string (points into string_storage) and its length
120    char *string;
121    int string_len;
122 } stb_lexer;
123 
// Line/column result filled in by stb_c_lexer_get_location.
124 typedef struct
125 {
//    line number, counting from 1
126    int line_number;
//    character offset within that line, counting from 0
127    int line_offset;
128 } stb_lex_location;
129 
130 #ifdef __cplusplus
131 extern "C" {
132 #endif
133 
134 extern void stb_c_lexer_init(stb_lexer *lexer, const char *input_stream, const char *input_stream_end, char *string_store, int store_length);
135 // this function initializes the 'lexer' structure
136 //   Input:
137 //   - input_stream points to the file to parse, loaded into memory
138 //   - input_stream_end points to the end of the file, or NULL if you use 0-for-EOF
139 //   - string_store is storage the lexer can use for storing parsed strings and identifiers
140 //   - store_length is the length of that storage
141 
142 extern int stb_c_lexer_get_token(stb_lexer *lexer);
143 // this function returns non-zero if a token is parsed, or 0 if at EOF
144 //   Output:
145 //   - lexer->token is the token ID, which is the character value for a single-char token, or a CLEX_* value (>= 256) for a multichar token, eof, or error
146 //   - lexer->real_number is a double constant value for CLEX_floatlit, or CLEX_intlit if STB_C_LEX_INTEGERS_AS_DOUBLES
147 //   - lexer->int_number is an integer constant for CLEX_intlit if !STB_C_LEX_INTEGERS_AS_DOUBLES, or character for CLEX_charlit
148 //   - lexer->string is a 0-terminated string for CLEX_dqstring or CLEX_sqstring or CLEX_id
149 //   - lexer->string_len is the byte length of lexer->string
150 
151 extern void stb_c_lexer_get_location(const stb_lexer *lexer, const char *where, stb_lex_location *loc);
152 // this inefficient function returns the line number and character offset of a
153 // given location in the file, such as the where_firstchar / where_lastchar
//  pointers set by stb_c_lexer_get_token. Because it's inefficient,
154 // you should only call it for errors, not for every token.
155 // For error messages of invalid tokens, you typically want the location of the start
156 // of the token (which caused the token to be invalid). For bugs involving legit
157 // tokens, you can report the first or the range.
158 //    Output:
159 //    - loc->line_number is the line number in the file, counting from 1, of the location
160 //    - loc->line_offset is the char-offset in the line, counting from 0, of the location
161 
162 
163 #ifdef __cplusplus
164 }
165 #endif
166 
167 #endif // INCLUDE_STB_C_LEXER_H
168 
169 #ifdef STB_C_LEXER_IMPLEMENTATION
170 
// The configuration options are spelled with bare Y / N, which this section
// turns into function-like macros; refuse to build if the user already owns
// those names.
171    #if defined(Y) || defined(N)
172    #error "Can only use stb_c_lexer in contexts where the preprocessor symbols 'Y' and 'N' are not defined"
173    #endif
174 
175 
176 // Hacky definitions so we can easily #if on them
// (an option FOO defined as Y makes FOO(x) expand to 1, as N to 0)
177 #define Y(x) 1
178 #define N(x) 0
179 
// Pick the C type used to accumulate integer literals and the stb_lexer field
// ('intfield') that is meant to receive them.
180 #if STB_C_LEX_INTEGERS_AS_DOUBLES(x)
181 typedef double     stb__clex_int;
182 #define intfield   real_number
183 #define STB__clex_int_as_double
184 #else
185 typedef long       stb__clex_int;
186 #define intfield   int_number
187 #endif
188 
189 // Convert these config options to simple conditional #defines so we can more
190 // easily test them once we've changed the meaning of Y/N
191 
192 #if STB_C_LEX_PARSE_SUFFIXES(x)
193 #define STB__clex_parse_suffixes
194 #endif
195 
196 #if STB_C_LEX_C_DECIMAL_INTS(x) || STB_C_LEX_C_HEX_INTS(x) || STB_C_LEX_DEFINE_ALL_TOKEN_NAMES(x)
197 #define STB__clex_define_int
198 #endif
199 
// "<<=" and ">>=" tokens exist only when both shifts and arithmetic-assign
// operators are enabled (or when all token names are requested).
200 #if (STB_C_LEX_C_ARITHEQ(x) && STB_C_LEX_C_SHIFTS(x)) || STB_C_LEX_DEFINE_ALL_TOKEN_NAMES(x)
201 #define STB__clex_define_shifts
202 #endif
203 
204 #if STB_C_LEX_C99_HEX_FLOATS(x)
205 #define STB__clex_hex_floats
206 #endif
207 
208 #if STB_C_LEX_C_HEX_INTS(x)
209 #define STB__clex_hex_ints
210 #endif
211 
212 #if STB_C_LEX_C_DECIMAL_INTS(x)
213 #define STB__clex_decimal_ints
214 #endif
215 
216 #if STB_C_LEX_C_OCTAL_INTS(x)
217 #define STB__clex_octal_ints
218 #endif
219 
220 #if STB_C_LEX_C_DECIMAL_FLOATS(x)
221 #define STB__clex_decimal_floats
222 #endif
223 
224 #if STB_C_LEX_DISCARD_PREPROCESSOR(x)
225 #define STB__clex_discard_preprocessor
226 #endif
227 
// Use strtod/strtol from <stdlib.h> when requested. If hex floats are enabled
// the library routines are only used when compiling as C99 or later; otherwise
// the built-in parser is used instead.
228 #if STB_C_LEX_USE_STDLIB(x) && (!defined(STB__clex_hex_floats) || __STDC_VERSION__ >= 199901L)
229 #define STB__CLEX_use_stdlib
230 #include <stdlib.h>
231 #endif
232 
233 // Now pick a definition of Y/N that's conducive to
234 // defining the enum of token names.
// With all token names requested (or in the self test) N forwards to Y so
// every name is emitted; otherwise N swallows its argument.
235 #if STB_C_LEX_DEFINE_ALL_TOKEN_NAMES(x) || defined(STB_C_LEXER_SELF_TEST)
236   #undef  N
237   #define N(a) Y(a)
238 #else
239   #undef  N
240   #define N(a)
241 #endif
242 
// Y(a) now expands to "a," so an enabled option emits one enumerator.
243 #undef  Y
244 #define Y(a) a,
245 
// Token IDs for everything that is not a single character. Numbering starts
// at 256 so these values never collide with single-byte character tokens.
// Each OPTION( name ) line emits "name," only when that option is enabled
// (see the Y/N definitions just above).
246 enum
247 {
248    CLEX_eof = 256,
249    CLEX_parse_error,
250 
251 #ifdef STB__clex_define_int
252    CLEX_intlit,
253 #endif
254 
255    STB_C_LEX_C_DECIMAL_FLOATS( CLEX_floatlit    )
256    STB_C_LEX_C_IDENTIFIERS(  CLEX_id            )
257    STB_C_LEX_C_DQ_STRINGS(   CLEX_dqstring      )
258    STB_C_LEX_C_SQ_STRINGS(   CLEX_sqstring      )
259    STB_C_LEX_C_CHARS(        CLEX_charlit       )
260    STB_C_LEX_C_COMPARISONS(  CLEX_eq            )
261    STB_C_LEX_C_COMPARISONS(  CLEX_noteq         )
262    STB_C_LEX_C_COMPARISONS(  CLEX_lesseq        )
263    STB_C_LEX_C_COMPARISONS(  CLEX_greatereq     )
264    STB_C_LEX_C_LOGICAL(      CLEX_andand        )
265    STB_C_LEX_C_LOGICAL(      CLEX_oror          )
266    STB_C_LEX_C_SHIFTS(       CLEX_shl           )
267    STB_C_LEX_C_SHIFTS(       CLEX_shr           )
268    STB_C_LEX_C_INCREMENTS(   CLEX_plusplus      )
269    STB_C_LEX_C_INCREMENTS(   CLEX_minusminus    )
270    STB_C_LEX_C_ARITHEQ(      CLEX_pluseq        )
271    STB_C_LEX_C_ARITHEQ(      CLEX_minuseq       )
272    STB_C_LEX_C_ARITHEQ(      CLEX_muleq         )
273    STB_C_LEX_C_ARITHEQ(      CLEX_diveq         )
274    STB_C_LEX_C_ARITHEQ(      CLEX_modeq         )
275    STB_C_LEX_C_BITWISEEQ(    CLEX_andeq         )
276    STB_C_LEX_C_BITWISEEQ(    CLEX_oreq          )
277    STB_C_LEX_C_BITWISEEQ(    CLEX_xoreq         )
278    STB_C_LEX_C_ARROW(        CLEX_arrow         )
279    STB_C_LEX_EQUAL_ARROW(    CLEX_eqarrow       )
280 
281 #ifdef STB__clex_define_shifts
282    CLEX_shleq, CLEX_shreq,
283 #endif
284 
//    first value free for user-defined tokens
285    CLEX_first_unused_token
286 
287 #undef Y
288 #define Y(a) a
289 };
290 
291 // Now for the rest of the file we'll use the basic definition
292 // where Y expands to its contents and N expands to nothing
293 #undef N
294 #define N(a)
295 
296 // API function
stb_c_lexer_init(stb_lexer * lexer,const char * input_stream,const char * input_stream_end,char * string_store,int store_length)297 void stb_c_lexer_init(stb_lexer *lexer, const char *input_stream, const char *input_stream_end, char *string_store, int store_length)
298 {
299    lexer->input_stream = (char *) input_stream;
300    lexer->eof = (char *) input_stream_end;
301    lexer->parse_point = (char *) input_stream;
302    lexer->string_storage = string_store;
303    lexer->string_storage_len = store_length;
304 }
305 
306 // API function
stb_c_lexer_get_location(const stb_lexer * lexer,const char * where,stb_lex_location * loc)307 void stb_c_lexer_get_location(const stb_lexer *lexer, const char *where, stb_lex_location *loc)
308 {
309    char *p = lexer->input_stream;
310    int line_number = 1;
311    int char_offset = 0;
312    while (*p && p < where) {
313       if (*p == '\n' || *p == '\r') {
314          p += (p[0]+p[1] == '\r'+'\n' ? 2 : 1); // skip newline
315          line_number += 1;
316          char_offset = 0;
317       } else {
318          ++p;
319          ++char_offset;
320       }
321    }
322    loc->line_number = line_number;
323    loc->line_offset = char_offset;
324 }
325 
326 // main helper function for returning a parsed token
stb__clex_token(stb_lexer * lexer,int token,char * start,char * end)327 static int stb__clex_token(stb_lexer *lexer, int token, char *start, char *end)
328 {
329    lexer->token = token;
330    lexer->where_firstchar = start;
331    lexer->where_lastchar = end;
332    lexer->parse_point = end+1;
333    return 1;
334 }
335 
336 // helper function for returning eof
stb__clex_eof(stb_lexer * lexer)337 static int stb__clex_eof(stb_lexer *lexer)
338 {
339    lexer->token = CLEX_eof;
340    return 0;
341 }
342 
// Returns 1 if x is one of the whitespace characters the lexer skips
// (space, tab, carriage return, newline, form feed), otherwise 0.
static int stb__clex_iswhite(int x)
{
   switch (x) {
      case ' ':
      case '\t':
      case '\r':
      case '\n':
      case '\f':
         return 1;
      default:
         return 0;
   }
}
347 
// Minimal strchr replacement: returns a pointer to the first occurrence of
// ch in the 0-terminated string str, or 0 if it does not occur. Unlike the
// library strchr, the terminator itself is never matched.
static const char *stb__strchr(const char *str, int ch)
{
   const char *cursor = str;

   while (*cursor != 0) {
      if (*cursor == ch)
         return cursor;
      ++cursor;
   }
   return 0;
}
355 
356 // parse suffixes at the end of a number
stb__clex_parse_suffixes(stb_lexer * lexer,long tokenid,char * start,char * cur,const char * suffixes)357 static int stb__clex_parse_suffixes(stb_lexer *lexer, long tokenid, char *start, char *cur, const char *suffixes)
358 {
359    #ifdef STB__clex_parse_suffixes
360    lexer->string = lexer->string_storage;
361    lexer->string_len = 0;
362 
363    while ((*cur >= 'a' && *cur <= 'z') || (*cur >= 'A' && *cur <= 'Z')) {
364       if (stb__strchr(suffixes, *cur) == 0)
365          return stb__clex_token(lexer, CLEX_parse_error, start, cur);
366       if (lexer->string_len+1 >= lexer->string_storage_len)
367          return stb__clex_token(lexer, CLEX_parse_error, start, cur);
368       lexer->string[lexer->string_len++] = *cur++;
369    }
370    #else
371    suffixes = suffixes; // attempt to suppress warnings
372    #endif
373    return stb__clex_token(lexer, tokenid, start, cur-1);
374 }
375 
#ifndef STB__CLEX_use_stdlib
// Returns base raised to a non-negative integer power, using repeated squaring.
static double stb__clex_pow(double base, unsigned int exponent)
{
   double value=1;
   for ( ; exponent; exponent >>= 1) {
      if (exponent & 1)
         value *= base;
      base *= base;
   }
   return value;
}

// Returns the numeric value of digit character c in the given base (10, or 16
// when hex floats are enabled), or -1 if c is not a digit of that base.
static int stb__clex_float_digit(char c, int base)
{
   if (c >= '0' && c <= '9')
      return c - '0';
#ifdef STB__clex_hex_floats
   if (base == 16) {
      if (c >= 'a' && c <= 'f')
         return 10 + (c - 'a');
      if (c >= 'A' && c <= 'F')
         return 10 + (c - 'A');
   }
#else
   (void) base;
#endif
   return -1;
}

// Fallback float parser used when strtod is not available ("inaccurate hack").
//   p - start of the literal
//   q - receives a pointer to the first character that is not part of the number
// Returns the parsed value. Accepts digits, an optional fraction, and an
// optional decimal exponent; with hex floats enabled also "0x" literals, which
// must carry a binary exponent ('p'). On a malformed hex literal *q is set to
// the start of the input and 0 is returned.
// As with strtod, an exponent marker that is not followed by at least one
// digit is not part of the number: for "1e" the value is 1 and *q points at 'e'.
static double stb__clex_parse_float(char *p, char **q)
{
   char *s = p;
   double value=0;
   int base=10;
   int has_exponent;
   int digit;

#ifdef STB__clex_hex_floats
   if (p[0] == '0' && (p[1] == 'x' || p[1] == 'X')) {
      base=16;
      p += 2;
   }
#endif

   // integer part
   while ((digit = stb__clex_float_digit(*p, base)) >= 0) {
      value = value*base + digit;
      ++p;
   }

   // fractional part: accumulate the digits as an integer, then scale once
   if (*p == '.') {
      double scale = 1, addend = 0;
      ++p;
      while ((digit = stb__clex_float_digit(*p, base)) >= 0) {
         addend = addend*base + digit;
         scale *= base;
         ++p;
      }
      value += addend / scale;
   }

#ifdef STB__clex_hex_floats
   if (base == 16) {
      // exponent required for hex float literal
      if (*p != 'p' && *p != 'P') {
         *q = s;
         return 0;
      }
      has_exponent = 1;
   } else
#endif
      has_exponent = (*p == 'e' || *p == 'E');

   if (has_exponent) {
      char *e = p+1;              // look ahead without committing to the exponent
      int negative = (*e == '-');
      if (*e == '-' || *e == '+')
         ++e;
      if (*e >= '0' && *e <= '9') {
         unsigned int magnitude = 0;
         double power;
         while (*e >= '0' && *e <= '9')
            magnitude = magnitude*10 + (unsigned int) (*e++ - '0');
         // hex floats scale by powers of two, decimal floats by powers of ten
         power = stb__clex_pow(base == 16 ? 2 : 10, magnitude);
         if (negative)
            value /= power;
         else
            value *= power;
         p = e;
      } else if (base == 16) {
         // 'p' without digits: not a valid hex float
         *q = s;
         return 0;
      }
      // decimal 'e' without digits: leave p at the 'e', it is not ours
   }
   *q = p;
   return value;
}
#endif
471 
// Decode one (possibly backslash-escaped) character starting at p.
//   p - first character to decode
//   q - receives a pointer to the character following the decoded one
// Returns the character value (0..255), or -1 for escape forms that are not
// implemented yet (hex and unicode escapes). All C simple escape sequences
// are recognized; a backslash followed by any other character is returned as
// a literal backslash and only the backslash is consumed.
static int stb__clex_parse_char(char *p, char **q)
{
   if (*p == '\\') {
      *q = p+2; // tentatively guess we'll parse two characters
      switch(p[1]) {
         case '\\': return '\\';
         case '\'': return '\'';
         case '"': return '"';
         case '?': return '?';
         case 'a': return '\a';
         case 'b': return '\b';
         case 't': return '\t';
         case 'f': return '\f';
         case 'n': return '\n';
         case 'r': return '\r';
         case 'v': return '\v';
         case '0': return '\0'; // @TODO octal constants
         case 'x': case 'X': return -1; // @TODO hex constants
         case 'u': return -1; // @TODO unicode constants
      }
   }
   *q = p+1;
   return (unsigned char) *p;
}
492 
// Parse a quoted string starting at its opening delimiter.
//   lexer - lexer state; the decoded text is written to lexer->string_storage
//   p     - points at the opening quote character (which is also the closing one)
//   type  - token ID to report on success (CLEX_dqstring / CLEX_sqstring)
// On success lexer->string is the 0-terminated decoded text and
// lexer->string_len its length. CLEX_parse_error is reported when the string
// is unterminated, contains an unsupported escape, or does not fit into the
// string storage together with its terminating 0.
static int stb__clex_parse_string(stb_lexer *lexer, char *p, int type)
{
   char *start = p;
   char delim = *p++; // grab the " or ' for later matching
   char *out = lexer->string_storage;
   char *outend = lexer->string_storage + lexer->string_storage_len;

   // even an empty string needs one byte for its terminator
   if (lexer->string_storage_len < 1)
      return stb__clex_token(lexer, CLEX_parse_error, start, start);

   for (;;) {
      int n;
      // Ran out of input before the closing delimiter. Without an end
      // pointer the input is 0-terminated, so a NUL also ends the input.
      // The error token ends on the last character actually read so the
      // parse point never moves beyond the end of the input.
      if (p == lexer->eof || (lexer->eof == 0 && *p == 0))
         return stb__clex_token(lexer, CLEX_parse_error, start, p-1);
      if (*p == delim)
         break;
      if (*p == '\\') {
         char *q;
         // a backslash as the very last byte has nothing to escape
         if (p+1 == lexer->eof)
            return stb__clex_token(lexer, CLEX_parse_error, start, p);
         n = stb__clex_parse_char(p, &q);
         if (n < 0)
            return stb__clex_token(lexer, CLEX_parse_error, start, q-1);
         p = q;
      } else {
         // @OPTIMIZE: could speed this up by looping-while-not-backslash
         n = (unsigned char) *p++;
      }
      // keep one byte in reserve for the terminating 0 written below
      if (out+1 >= outend)
         return stb__clex_token(lexer, CLEX_parse_error, start, p-1);
      // @TODO expand unicode escapes to UTF8
      *out++ = (char) n;
   }
   *out = 0;
   lexer->string = lexer->string_storage;
   lexer->string_len = (int) (out - lexer->string_storage);
   return stb__clex_token(lexer, type, start, p);
}
521 
// API function: lex the next token, starting at lexer->parse_point.
// Returns non-zero when a token (including CLEX_parse_error) was produced and
// 0 at end of input, in which case lexer->token is CLEX_eof. For a produced
// token, lexer->token holds its ID, where_firstchar / where_lastchar its
// inclusive source span, and int_number / real_number / string its payload.
// Which token kinds are recognized is decided by the STB_C_LEX_* options:
// code wrapped in OPTION( ... ) is only present when that option is Y.
int stb_c_lexer_get_token(stb_lexer *lexer)
{
   char *p = lexer->parse_point;

   // skip whitespace and comments
   for (;;) {
      #ifdef STB_C_LEX_ISWHITE
      while (p != lexer->eof) {
         int n;
         n = STB_C_LEX_ISWHITE(p);
         if (n == 0) break;
         // a whitespace sequence must not extend past the end of the input
         if (lexer->eof && lexer->eof - p < n)
            return stb__clex_token(lexer, CLEX_parse_error, p,lexer->eof-1);
         p += n;
      }
      #else
      while (p != lexer->eof && stb__clex_iswhite(*p))
         ++p;
      #endif

      STB_C_LEX_CPP_COMMENTS(
         if (p != lexer->eof && p[0] == '/' && p+1 != lexer->eof && p[1] == '/') {
            while (p != lexer->eof && *p != '\r' && *p != '\n')
               ++p;
            continue;
         }
      )

      STB_C_LEX_C_COMMENTS(
         if (p != lexer->eof && p[0] == '/' && p+1 != lexer->eof && p[1] == '*') {
            char *start = p;
            p += 2;
            while (p != lexer->eof && (p[0] != '*' || p+1 == lexer->eof || p[1] != '/'))
               ++p;
            if (p == lexer->eof)
               return stb__clex_token(lexer, CLEX_parse_error, start, p-1);
            p += 2;
            continue;
         }
      )

      #ifdef STB__clex_discard_preprocessor
         // @TODO this discards everything after a '#', regardless
         // of where in the line the # is, rather than requiring it
         // be at the start. (because this parser doesn't otherwise
         // check for line breaks!)
         if (p != lexer->eof && p[0] == '#') {
            while (p != lexer->eof && *p != '\r' && *p != '\n')
               ++p;
            continue;
         }
      #endif

      break;
   }

   if (p == lexer->eof)
      return stb__clex_eof(lexer);

   switch (*p) {
      default:
         if (   (*p >= 'a' && *p <= 'z')
             || (*p >= 'A' && *p <= 'Z')
             || *p == '_' || (unsigned char) *p >= 128    // >= 128 is UTF8 char
             STB_C_LEX_DOLLAR_IDENTIFIER( || *p == '$' ) )
         {
            int n = 0;
            lexer->string = lexer->string_storage;
            lexer->string_len = n;
            do {
               if (n+1 >= lexer->string_storage_len)
                  return stb__clex_token(lexer, CLEX_parse_error, p, p+n);
               lexer->string[n] = p[n];
               ++n;
            } while (
                  p+n != lexer->eof && (   // never read past the end of the input
                  (p[n] >= 'a' && p[n] <= 'z')
               || (p[n] >= 'A' && p[n] <= 'Z')
               || (p[n] >= '0' && p[n] <= '9') // allow digits in middle of identifier
               || p[n] == '_' || (unsigned char) p[n] >= 128
                STB_C_LEX_DOLLAR_IDENTIFIER( || p[n] == '$' )
                  )
            );
            lexer->string[n] = 0;
            return stb__clex_token(lexer, CLEX_id, p, p+n-1);
         }

         // check for EOF
         STB_C_LEX_0_IS_EOF(
            if (*p == 0)
               return stb__clex_eof(lexer);
         )

      single_char:
         // not an identifier, return the character as itself
         return stb__clex_token(lexer, *p, p, p);

      case '+':
         if (p+1 != lexer->eof) {
            STB_C_LEX_C_INCREMENTS(if (p[1] == '+') return stb__clex_token(lexer, CLEX_plusplus, p,p+1);)
            STB_C_LEX_C_ARITHEQ(   if (p[1] == '=') return stb__clex_token(lexer, CLEX_pluseq  , p,p+1);)
         }
         goto single_char;
      case '-':
         if (p+1 != lexer->eof) {
            STB_C_LEX_C_INCREMENTS(if (p[1] == '-') return stb__clex_token(lexer, CLEX_minusminus, p,p+1);)
            STB_C_LEX_C_ARITHEQ(   if (p[1] == '=') return stb__clex_token(lexer, CLEX_minuseq   , p,p+1);)
            STB_C_LEX_C_ARROW(     if (p[1] == '>') return stb__clex_token(lexer, CLEX_arrow     , p,p+1);)
         }
         goto single_char;
      case '&':
         if (p+1 != lexer->eof) {
            STB_C_LEX_C_LOGICAL(  if (p[1] == '&') return stb__clex_token(lexer, CLEX_andand, p,p+1);)
            STB_C_LEX_C_BITWISEEQ(if (p[1] == '=') return stb__clex_token(lexer, CLEX_andeq , p,p+1);)
         }
         goto single_char;
      case '|':
         if (p+1 != lexer->eof) {
            STB_C_LEX_C_LOGICAL(  if (p[1] == '|') return stb__clex_token(lexer, CLEX_oror, p,p+1);)
            STB_C_LEX_C_BITWISEEQ(if (p[1] == '=') return stb__clex_token(lexer, CLEX_oreq, p,p+1);)
         }
         goto single_char;
      case '=':
         if (p+1 != lexer->eof) {
            STB_C_LEX_C_COMPARISONS(if (p[1] == '=') return stb__clex_token(lexer, CLEX_eq, p,p+1);)
            STB_C_LEX_EQUAL_ARROW(  if (p[1] == '>') return stb__clex_token(lexer, CLEX_eqarrow, p,p+1);)
         }
         goto single_char;
      case '!':
         STB_C_LEX_C_COMPARISONS(if (p+1 != lexer->eof && p[1] == '=') return stb__clex_token(lexer, CLEX_noteq, p,p+1);)
         goto single_char;
      case '^':
         STB_C_LEX_C_BITWISEEQ(if (p+1 != lexer->eof && p[1] == '=') return stb__clex_token(lexer, CLEX_xoreq, p,p+1);)
         goto single_char;
      case '%':
         STB_C_LEX_C_ARITHEQ(if (p+1 != lexer->eof && p[1] == '=') return stb__clex_token(lexer, CLEX_modeq, p,p+1);)
         goto single_char;
      case '*':
         STB_C_LEX_C_ARITHEQ(if (p+1 != lexer->eof && p[1] == '=') return stb__clex_token(lexer, CLEX_muleq, p,p+1);)
         goto single_char;
      case '/':
         STB_C_LEX_C_ARITHEQ(if (p+1 != lexer->eof && p[1] == '=') return stb__clex_token(lexer, CLEX_diveq, p,p+1);)
         goto single_char;
      case '<':
         if (p+1 != lexer->eof) {
            STB_C_LEX_C_COMPARISONS(if (p[1] == '=') return stb__clex_token(lexer, CLEX_lesseq, p,p+1);)
            STB_C_LEX_C_SHIFTS(     if (p[1] == '<') {
                                       STB_C_LEX_C_ARITHEQ(if (p+2 != lexer->eof && p[2] == '=')
                                                              return stb__clex_token(lexer, CLEX_shleq, p,p+2);)
                                       return stb__clex_token(lexer, CLEX_shl, p,p+1);
                                    }
                              )
         }
         goto single_char;
      case '>':
         if (p+1 != lexer->eof) {
            STB_C_LEX_C_COMPARISONS(if (p[1] == '=') return stb__clex_token(lexer, CLEX_greatereq, p,p+1);)
            STB_C_LEX_C_SHIFTS(     if (p[1] == '>') {
                                       STB_C_LEX_C_ARITHEQ(if (p+2 != lexer->eof && p[2] == '=')
                                                              return stb__clex_token(lexer, CLEX_shreq, p,p+2);)
                                       return stb__clex_token(lexer, CLEX_shr, p,p+1);
                                    }
                              )
         }
         goto single_char;

      case '"':
         STB_C_LEX_C_DQ_STRINGS(return stb__clex_parse_string(lexer, p, CLEX_dqstring);)
         goto single_char;
      case '\'':
         STB_C_LEX_C_SQ_STRINGS(return stb__clex_parse_string(lexer, p, CLEX_sqstring);)
         STB_C_LEX_C_CHARS(
         {
            char *start = p;
            lexer->int_number = stb__clex_parse_char(p+1, &p);
            if (lexer->int_number < 0)
               return stb__clex_token(lexer, CLEX_parse_error, start,start);
            if (p == lexer->eof || *p != '\'')
               return stb__clex_token(lexer, CLEX_parse_error, start,p);
            // p is the closing quote, i.e. the last character of the token;
            // passing p+1 here would make the lexer skip the next character
            return stb__clex_token(lexer, CLEX_charlit, start, p);
         })
         goto single_char;

      case '0':
         #if defined(STB__clex_hex_ints) || defined(STB__clex_hex_floats)
            if (p+1 != lexer->eof) {
               if (p[1] == 'x' || p[1] == 'X') {
                  char *q;

                  #ifdef STB__clex_hex_floats
                  for (q=p+2;
                       q != lexer->eof && ((*q >= '0' && *q <= '9') || (*q >= 'a' && *q <= 'f') || (*q >= 'A' && *q <= 'F'));
                       ++q);
                  if (q != lexer->eof) {
                     if (*q == '.' STB_C_LEX_FLOAT_NO_DECIMAL(|| *q == 'p' || *q == 'P')) {
                        #ifdef STB__CLEX_use_stdlib
                        lexer->real_number = strtod((char *) p, (char**) &q);
                        #else
                        lexer->real_number = stb__clex_parse_float(p, &q);
                        #endif

                        if (p == q)
                           return stb__clex_token(lexer, CLEX_parse_error, p,q);
                        return stb__clex_parse_suffixes(lexer, CLEX_floatlit, p,q, STB_C_LEX_FLOAT_SUFFIXES);

                     }
                  }
                  #endif   // STB__clex_hex_floats

                  #ifdef STB__clex_hex_ints
                  #ifdef STB__CLEX_use_stdlib
                  lexer->int_number = strtol((char *) p, (char **) &q, 16);
                  #else
                  {
                     stb__clex_int n=0;
                     for (q=p+2; q != lexer->eof; ++q) {
                        if (*q >= '0' && *q <= '9')
                           n = n*16 + (*q - '0');
                        else if (*q >= 'a' && *q <= 'f')
                           n = n*16 + (*q - 'a') + 10;
                        else if (*q >= 'A' && *q <= 'F')
                           n = n*16 + (*q - 'A') + 10;
                        else
                           break;
                     }
                     // intfield is int_number, or real_number when integers
                     // are parsed as doubles
                     lexer->intfield = n;
                  }
                  #endif
                  // "0x" with no digits: report the prefix itself so that the
                  // parse point moves past it instead of re-lexing it forever
                  if (q == p+2)
                     return stb__clex_token(lexer, CLEX_parse_error, p,p+1);
                  return stb__clex_parse_suffixes(lexer, CLEX_intlit, p,q, STB_C_LEX_HEX_SUFFIXES);
                  #endif
               }
            }
         #endif // defined(STB__clex_hex_ints) || defined(STB__clex_hex_floats)
         // can't test for octal because we might parse '0.0' as float or as '0' '.' '0',
         // so have to do float first

         /* FALL THROUGH */
      case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':

         #ifdef STB__clex_decimal_floats
         {
            char *q = p;
            while (q != lexer->eof && (*q >= '0' && *q <= '9'))
               ++q;
            if (q != lexer->eof) {
               if (*q == '.' STB_C_LEX_FLOAT_NO_DECIMAL(|| *q == 'e' || *q == 'E')) {
                  #ifdef STB__CLEX_use_stdlib
                  lexer->real_number = strtod((char *) p, (char**) &q);
                  #else
                  lexer->real_number = stb__clex_parse_float(p, &q);
                  #endif

                  return stb__clex_parse_suffixes(lexer, CLEX_floatlit, p,q, STB_C_LEX_FLOAT_SUFFIXES);

               }
            }
         }
         #endif // STB__clex_decimal_floats

         #ifdef STB__clex_octal_ints
         if (p[0] == '0') {
            char *q = p;
            #ifdef STB__CLEX_use_stdlib
            lexer->int_number = strtol((char *) p, (char **) &q, 8);
            #else
            stb__clex_int n=0;
            while (q != lexer->eof) {
               if (*q >= '0' && *q <= '7')
                  n = n*8 + (*q - '0');
               else
                  break;
               ++q;
            }
            if (q != lexer->eof && (*q == '8' || *q=='9'))
               return stb__clex_token(lexer, CLEX_parse_error, p, q);
            lexer->intfield = n;
            #endif
            return stb__clex_parse_suffixes(lexer, CLEX_intlit, p,q, STB_C_LEX_OCTAL_SUFFIXES);
         }
         #endif // STB__clex_octal_ints

         #ifdef STB__clex_decimal_ints
         {
            char *q = p;
            #ifdef STB__CLEX_use_stdlib
            lexer->int_number = strtol((char *) p, (char **) &q, 10);
            #else
            stb__clex_int n=0;
            while (q != lexer->eof) {
               if (*q >= '0' && *q <= '9')
                  n = n*10 + (*q - '0');
               else
                  break;
               ++q;
            }
            lexer->intfield = n;
            #endif
            // decimal literals use the decimal suffix list, not the octal one
            return stb__clex_parse_suffixes(lexer, CLEX_intlit, p,q, STB_C_LEX_DECIMAL_SUFFIXES);
         }
         #endif // STB__clex_decimal_ints
         goto single_char;
   }
}
825 #endif // STB_C_LEXER_IMPLEMENTATION
826 
827 #ifdef STB_C_LEXER_SELF_TEST
828 #define _CRT_SECURE_NO_WARNINGS
829 #include <stdio.h>
830 #include <stdlib.h>
831 
// Self-test helper: writes a compact debug rendering of the lexer's current
// token to stdout.
//   - identifiers are prefixed with '_'
//   - integer literals are prefixed with '#'
//   - string-like tokens are wrapped in quote characters
//   - multi-character operators are printed as their spelling
//   - token values in [0,256) are printed as the character itself
//   - anything else is reported as an unknown token
static void print_token(stb_lexer *lexer)
{
   // tokens whose printed form is a fixed piece of text
   static const struct {
      long        id;
      const char *text;
   } fixed_text[] = {
      { CLEX_eq        , "=="  },
      { CLEX_noteq     , "!="  },
      { CLEX_lesseq    , "<="  },
      { CLEX_greatereq , ">="  },
      { CLEX_andand    , "&&"  },
      { CLEX_oror      , "||"  },
      { CLEX_shl       , "<<"  },
      { CLEX_shr       , ">>"  },
      { CLEX_plusplus  , "++"  },
      { CLEX_minusminus, "--"  },
      { CLEX_arrow     , "->"  },
      { CLEX_andeq     , "&="  },
      { CLEX_oreq      , "|="  },
      { CLEX_xoreq     , "^="  },
      { CLEX_pluseq    , "+="  },
      { CLEX_minuseq   , "-="  },
      { CLEX_muleq     , "*="  },
      { CLEX_diveq     , "/="  },
      { CLEX_modeq     , "%="  },   // emitted through "%s", so no "%%" escape here
      { CLEX_shleq     , "<<=" },
      { CLEX_shreq     , ">>=" },
      { CLEX_eqarrow   , "=>"  },
   };
   const int num_fixed = (int) (sizeof(fixed_text) / sizeof(fixed_text[0]));
   int i;

   // tokens that carry a payload (string or number) get their own format
   switch (lexer->token) {
      case CLEX_id:
         printf("_%s", lexer->string);
         return;
      case CLEX_dqstring:
         printf("\"%s\"", lexer->string);
         return;
      case CLEX_sqstring:
         printf("'\"%s\"'", lexer->string);
         return;
      case CLEX_charlit:
         printf("'%s'", lexer->string);
         return;
      case CLEX_intlit:
         // integers live in real_number only in the int-as-double, non-stdlib build
         #if defined(STB__clex_int_as_double) && !defined(STB__CLEX_use_stdlib)
         printf("#%g", lexer->real_number);
         #else
         printf("#%ld", lexer->int_number);
         #endif
         return;
      case CLEX_floatlit:
         printf("%g", lexer->real_number);
         return;
      default:
         break;
   }

   for (i = 0; i < num_fixed; ++i) {
      if (lexer->token == fixed_text[i].id) {
         printf("%s", fixed_text[i].text);
         return;
      }
   }

   // everything else: either a plain single-character token or something unexpected
   if (lexer->token >= 0 && lexer->token < 256) {
      printf("%c", (int) lexer->token);
      return;
   }
   printf("<<<UNKNOWN TOKEN %ld >>>\n", lexer->token);
}
876 
877 /* Force a test
878 of parsing
879 multiline comments */
880 
881 /*/ comment /*/
882 /**/ extern /**/
883 
// Sample code for the self-test: main() lexes the text of stb_c_lexer.h,
// so the source text of this function is itself lexer input. The array below
// supplies float literals in several spellings (plain decimal, signed exponent,
// hex floats, and a trailing-dot form) for the lexer to chew on.
void dummy(void)
{
   double some_floats[] = {
      1.0501, -10.4e12, 5E+10,
#if 0   // not supported in C++ or C-pre-99, so don't try to compile it, but let our parser test it
      0x1.0p+24, 0xff.FP-8, 0x1p-23,
#endif
      4.
   };
   // reference the array so the compiler does not warn that it is unused
   (void) sizeof(some_floats);

   printf("test %d",1); // https://github.com/nothings/stb/issues/13
}
897 
main(int argc,char ** argv)898 int main(int argc, char **argv)
899 {
900    FILE *f = fopen("stb_c_lexer.h","rb");
901    char *text = (char *) malloc(1 << 20);
902    int len = f ? (int) fread(text, 1, 1<<20, f) : -1;
903    stb_lexer lex;
904    if (len < 0) {
905       fprintf(stderr, "Error opening file\n");
906       free(text);
907       fclose(f);
908       return 1;
909    }
910    fclose(f);
911 
912    stb_c_lexer_init(&lex, text, text+len, (char *) malloc(0x10000), 0x10000);
913    while (stb_c_lexer_get_token(&lex)) {
914       if (lex.token == CLEX_parse_error) {
915          printf("\n<<<PARSE ERROR>>>\n");
916          break;
917       }
918       print_token(&lex);
919       printf("  ");
920    }
921    return 0;
922 }
923 #endif
924 /*
925 ------------------------------------------------------------------------------
926 This software is available under 2 licenses -- choose whichever you prefer.
927 ------------------------------------------------------------------------------
928 ALTERNATIVE A - MIT License
929 Copyright (c) 2017 Sean Barrett
930 Permission is hereby granted, free of charge, to any person obtaining a copy of
931 this software and associated documentation files (the "Software"), to deal in
932 the Software without restriction, including without limitation the rights to
933 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
934 of the Software, and to permit persons to whom the Software is furnished to do
935 so, subject to the following conditions:
936 The above copyright notice and this permission notice shall be included in all
937 copies or substantial portions of the Software.
938 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
939 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
940 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
941 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
942 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
943 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
944 SOFTWARE.
945 ------------------------------------------------------------------------------
946 ALTERNATIVE B - Public Domain (www.unlicense.org)
947 This is free and unencumbered software released into the public domain.
948 Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
949 software, either in source code form or as a compiled binary, for any purpose,
950 commercial or non-commercial, and by any means.
951 In jurisdictions that recognize copyright laws, the author or authors of this
952 software dedicate any and all copyright interest in the software to the public
953 domain. We make this dedication for the benefit of the public at large and to
954 the detriment of our heirs and successors. We intend this dedication to be an
955 overt act of relinquishment in perpetuity of all present and future rights to
956 this software under copyright law.
957 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
958 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
959 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
960 AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
961 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
962 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
963 ------------------------------------------------------------------------------
964 */
965