1 // stb_c_lexer.h - v0.10 - public domain Sean Barrett 2013
2 // lexer for making little C-like languages with recursive-descent parsers
3 //
4 // This file provides both the interface and the implementation.
5 // To instantiate the implementation,
6 // #define STB_C_LEXER_IMPLEMENTATION
7 // in *ONE* source file, before #including this file.
8 //
9 // The default configuration is fairly close to a C lexer, although
10 // suffixes on integer constants are not handled (you can override this).
11 //
12 // History:
13 // 0.10 fix warnings
14 // 0.09 hex floats, no-stdlib fixes
15 // 0.08 fix bad pointer comparison
16 // 0.07 fix mishandling of hexadecimal constants parsed by strtol
17 // 0.06 fix missing next character after ending quote mark (Andreas Fredriksson)
18 // 0.05 refixed get_location because github version had lost the fix
19 // 0.04 fix octal parsing bug
20 // 0.03 added STB_C_LEX_DISCARD_PREPROCESSOR option
21 // refactor API to simplify (only one struct instead of two)
22 // change literal enum names to have 'lit' at the end
23 // 0.02 first public release
24 //
25 // Status:
26 // - haven't tested compiling as C++
27 // - haven't tested the float parsing path
28 // - haven't tested the non-default-config paths (e.g. non-stdlib)
29 // - only tested default-config paths by eyeballing output of self-parse
30 //
31 // - haven't implemented multiline strings
32 // - haven't implemented octal/hex character constants
33 // - haven't implemented support for unicode CLEX_char
34 // - need to expand error reporting so you don't just get "CLEX_parse_error"
35 //
36 // Contributors:
37 // Arpad Goretity (bugfix)
38 // Alan Hickman (hex floats)
39 //
40 // LICENSE
41 //
42 // See end of file for license information.
43
44 #ifndef STB_C_LEXER_DEFINITIONS
45 // to change the default parsing rules, copy the following lines
46 // into your C/C++ file *before* including this, and then replace
47 // the Y's with N's for the ones you don't want.
48 // --BEGIN--
49
50 #define STB_C_LEX_C_DECIMAL_INTS Y // "0|[1-9][0-9]*" CLEX_intlit
51 #define STB_C_LEX_C_HEX_INTS Y // "0x[0-9a-fA-F]+" CLEX_intlit
52 #define STB_C_LEX_C_OCTAL_INTS Y // "[0-7]+" CLEX_intlit
53 #define STB_C_LEX_C_DECIMAL_FLOATS Y // "[0-9]*(.[0-9]*([eE][-+]?[0-9]+)?)" CLEX_floatlit
54 #define STB_C_LEX_C99_HEX_FLOATS N // "0x{hex}+(.{hex}*)?[pP][-+]?[0-9]+" CLEX_floatlit
55 #define STB_C_LEX_C_IDENTIFIERS Y // "[_a-zA-Z][_a-zA-Z0-9]*" CLEX_id
56 #define STB_C_LEX_C_DQ_STRINGS Y // double-quote-delimited strings with escapes CLEX_dqstring
57 #define STB_C_LEX_C_SQ_STRINGS N // single-quote-delimited strings with escapes CLEX_sqstring
58 #define STB_C_LEX_C_CHARS Y // single-quote-delimited character with escape CLEX_charlit
59 #define STB_C_LEX_C_COMMENTS Y // "/* comment */"
60 #define STB_C_LEX_CPP_COMMENTS Y // "// comment to end of line\n"
61 #define STB_C_LEX_C_COMPARISONS Y // "==" CLEX_eq "!=" CLEX_noteq "<=" CLEX_lesseq ">=" CLEX_greatereq
62 #define STB_C_LEX_C_LOGICAL Y // "&&" CLEX_andand "||" CLEX_oror
63 #define STB_C_LEX_C_SHIFTS Y // "<<" CLEX_shl ">>" CLEX_shr
64 #define STB_C_LEX_C_INCREMENTS Y // "++" CLEX_plusplus "--" CLEX_minusminus
65 #define STB_C_LEX_C_ARROW Y // "->" CLEX_arrow
66 #define STB_C_LEX_EQUAL_ARROW N // "=>" CLEX_eqarrow
67 #define STB_C_LEX_C_BITWISEEQ Y // "&=" CLEX_andeq "|=" CLEX_oreq "^=" CLEX_xoreq
68 #define STB_C_LEX_C_ARITHEQ Y // "+=" CLEX_pluseq "-=" CLEX_minuseq
69 // "*=" CLEX_muleq "/=" CLEX_diveq "%=" CLEX_modeq
70 // if both STB_C_LEX_C_SHIFTS & STB_C_LEX_C_ARITHEQ:
71 // "<<=" CLEX_shleq ">>=" CLEX_shreq
72
73 #define STB_C_LEX_PARSE_SUFFIXES N // letters after numbers are parsed as part of those numbers, and must be in suffix list below
74 #define STB_C_LEX_DECIMAL_SUFFIXES "" // decimal integer suffixes e.g. "uUlL" -- these are returned as-is in string storage
75 #define STB_C_LEX_HEX_SUFFIXES "" // e.g. "uUlL"
76 #define STB_C_LEX_OCTAL_SUFFIXES "" // e.g. "uUlL"
77 #define STB_C_LEX_FLOAT_SUFFIXES "" //
78
79 #define STB_C_LEX_0_IS_EOF N // if Y, ends parsing at '\0'; if N, returns '\0' as token
80 #define STB_C_LEX_INTEGERS_AS_DOUBLES N // parses integers as doubles so they can be larger than 'int', but only if STB_C_LEX_USE_STDLIB==N
81 #define STB_C_LEX_MULTILINE_DSTRINGS N // allow newlines in double-quoted strings
82 #define STB_C_LEX_MULTILINE_SSTRINGS N // allow newlines in single-quoted strings
83 #define STB_C_LEX_USE_STDLIB Y // use strtod,strtol for parsing #s; otherwise inaccurate hack
84 #define STB_C_LEX_DOLLAR_IDENTIFIER Y // allow $ as an identifier character
85 #define STB_C_LEX_FLOAT_NO_DECIMAL Y // allow floats that have no decimal point if they have an exponent
86
87 #define STB_C_LEX_DEFINE_ALL_TOKEN_NAMES N // if Y, all CLEX_ token names are defined, even if never returned
88 // leaving it as N should help you catch config bugs
89
90 #define STB_C_LEX_DISCARD_PREPROCESSOR Y // discard C-preprocessor directives (e.g. after preprocessing you may
91 // still have #line, #pragma, etc)
92
93 //#define STB_C_LEX_ISWHITE(str) ... // return length in bytes of whitespace characters if first char is whitespace
94
95 #define STB_C_LEXER_DEFINITIONS // This line prevents the header file from replacing your definitions
96 // --END--
97
98 #endif
99
100 #ifndef INCLUDE_STB_C_LEXER_H
101 #define INCLUDE_STB_C_LEXER_H
102
// Complete state of one lexing session: the input cursor, the caller-supplied
// scratch buffer for string text, and the description of the most recent token.
typedef struct
{
   // input and scratch storage, filled in by stb_c_lexer_init
   char  *input_stream;        // start of the text being lexed
   char  *eof;                 // end of the text as passed to stb_c_lexer_init
   char  *parse_point;         // position the next token scan starts from
   char  *string_storage;      // caller-owned buffer that receives string/identifier text
   int    string_storage_len;  // size of string_storage in bytes

   // extent of the most recent token, for error messages
   char  *where_firstchar;
   char  *where_lastchar;

   // identity and value of the most recent token
   long   token;
   double real_number;
   long   int_number;
   char  *string;
   int    string_len;
} stb_lexer;
123
// Human-readable position inside the input, as filled in by
// stb_c_lexer_get_location.
typedef struct
{
   int line_number;   // line of the position
   int line_offset;   // character offset of the position within that line
} stb_lex_location;
129
130 #ifdef __cplusplus
131 extern "C" {
132 #endif
133
134 extern void stb_c_lexer_init(stb_lexer *lexer, const char *input_stream, const char *input_stream_end, char *string_store, int store_length);
135 // this function initializes the 'lexer' structure
136 // Input:
137 // - input_stream points to the file to parse, loaded into memory
138 // - input_stream_end points to the end of the file, or NULL if you use 0-for-EOF
139 // - string_store is storage the lexer can use for storing parsed strings and identifiers
140 // - store_length is the length of that storage
141
142 extern int stb_c_lexer_get_token(stb_lexer *lexer);
143 // this function returns non-zero if a token is parsed, or 0 if at EOF
144 // Output:
145 // - lexer->token is the token ID, which is the character code for a single-char token, or a CLEX_ enum value (>= 256) for a multichar token, eof, or error
146 // - lexer->real_number is a double constant value for CLEX_floatlit, or CLEX_intlit if STB_C_LEX_INTEGERS_AS_DOUBLES
147 // - lexer->int_number is an integer constant for CLEX_intlit if !STB_C_LEX_INTEGERS_AS_DOUBLES, or character for CLEX_charlit
148 // - lexer->string is a 0-terminated string for CLEX_dqstring or CLEX_sqstring or CLEX_id
149 // - lexer->string_len is the byte length of lexer->string
150
151 extern void stb_c_lexer_get_location(const stb_lexer *lexer, const char *where, stb_lex_location *loc);
152 // this inefficient function returns the line number and character offset of a
153 // given location in the file as returned by stb_c_lexer_get_token. Because it's inefficient,
154 // you should only call it for errors, not for every token.
155 // For error messages of invalid tokens, you typically want the location of the start
156 // of the token (which caused the token to be invalid). For bugs involving legit
157 // tokens, you can report the first or the range.
158 // Output:
159 // - loc->line_number is the line number in the file, counting from 1, of the location
160 // - loc->line_offset is the char-offset in the line, counting from 0, of the location
161
162
163 #ifdef __cplusplus
164 }
165 #endif
166
167 #endif // INCLUDE_STB_C_LEXER_H
168
169 #ifdef STB_C_LEXER_IMPLEMENTATION
170
171 #if defined(Y) || defined(N)
172 #error "Can only use stb_c_lexer in contexts where the preprocessor symbols 'Y' and 'N' are not defined"
173 #endif
174
175
176 // Hacky definitions so we can easily #if on them
177 #define Y(x) 1
178 #define N(x) 0
179
180 #if STB_C_LEX_INTEGERS_AS_DOUBLES(x)
181 typedef double stb__clex_int;
182 #define intfield real_number
183 #define STB__clex_int_as_double
184 #else
185 typedef long stb__clex_int;
186 #define intfield int_number
187 #endif
188
189 // Convert these config options to simple conditional #defines so we can more
190 // easily test them once we've change the meaning of Y/N
191
192 #if STB_C_LEX_PARSE_SUFFIXES(x)
193 #define STB__clex_parse_suffixes
194 #endif
195
196 #if STB_C_LEX_C_DECIMAL_INTS(x) || STB_C_LEX_C_HEX_INTS(x) || STB_C_LEX_DEFINE_ALL_TOKEN_NAMES(x)
197 #define STB__clex_define_int
198 #endif
199
200 #if (STB_C_LEX_C_ARITHEQ(x) && STB_C_LEX_C_SHIFTS(x)) || STB_C_LEX_DEFINE_ALL_TOKEN_NAMES(x)
201 #define STB__clex_define_shifts
202 #endif
203
204 #if STB_C_LEX_C99_HEX_FLOATS(x)
205 #define STB__clex_hex_floats
206 #endif
207
208 #if STB_C_LEX_C_HEX_INTS(x)
209 #define STB__clex_hex_ints
210 #endif
211
212 #if STB_C_LEX_C_DECIMAL_INTS(x)
213 #define STB__clex_decimal_ints
214 #endif
215
216 #if STB_C_LEX_C_OCTAL_INTS(x)
217 #define STB__clex_octal_ints
218 #endif
219
220 #if STB_C_LEX_C_DECIMAL_FLOATS(x)
221 #define STB__clex_decimal_floats
222 #endif
223
224 #if STB_C_LEX_DISCARD_PREPROCESSOR(x)
225 #define STB__clex_discard_preprocessor
226 #endif
227
228 #if STB_C_LEX_USE_STDLIB(x) && (!defined(STB__clex_hex_floats) || __STDC_VERSION__ >= 199901L)
229 #define STB__CLEX_use_stdlib
230 #include <stdlib.h>
231 #endif
232
233 // Now pick a definition of Y/N that's conducive to
234 // defining the enum of token names.
235 #if STB_C_LEX_DEFINE_ALL_TOKEN_NAMES(x) || defined(STB_C_LEXER_SELF_TEST)
236 #undef N
237 #define N(a) Y(a)
238 #else
239 #undef N
240 #define N(a)
241 #endif
242
243 #undef Y
244 #define Y(a) a,
245
246 enum
247 {
248 CLEX_eof = 256,
249 CLEX_parse_error,
250
251 #ifdef STB__clex_define_int
252 CLEX_intlit,
253 #endif
254
255 STB_C_LEX_C_DECIMAL_FLOATS( CLEX_floatlit )
256 STB_C_LEX_C_IDENTIFIERS( CLEX_id )
257 STB_C_LEX_C_DQ_STRINGS( CLEX_dqstring )
258 STB_C_LEX_C_SQ_STRINGS( CLEX_sqstring )
259 STB_C_LEX_C_CHARS( CLEX_charlit )
260 STB_C_LEX_C_COMPARISONS( CLEX_eq )
261 STB_C_LEX_C_COMPARISONS( CLEX_noteq )
262 STB_C_LEX_C_COMPARISONS( CLEX_lesseq )
263 STB_C_LEX_C_COMPARISONS( CLEX_greatereq )
264 STB_C_LEX_C_LOGICAL( CLEX_andand )
265 STB_C_LEX_C_LOGICAL( CLEX_oror )
266 STB_C_LEX_C_SHIFTS( CLEX_shl )
267 STB_C_LEX_C_SHIFTS( CLEX_shr )
268 STB_C_LEX_C_INCREMENTS( CLEX_plusplus )
269 STB_C_LEX_C_INCREMENTS( CLEX_minusminus )
270 STB_C_LEX_C_ARITHEQ( CLEX_pluseq )
271 STB_C_LEX_C_ARITHEQ( CLEX_minuseq )
272 STB_C_LEX_C_ARITHEQ( CLEX_muleq )
273 STB_C_LEX_C_ARITHEQ( CLEX_diveq )
274 STB_C_LEX_C_ARITHEQ( CLEX_modeq )
275 STB_C_LEX_C_BITWISEEQ( CLEX_andeq )
276 STB_C_LEX_C_BITWISEEQ( CLEX_oreq )
277 STB_C_LEX_C_BITWISEEQ( CLEX_xoreq )
278 STB_C_LEX_C_ARROW( CLEX_arrow )
279 STB_C_LEX_EQUAL_ARROW( CLEX_eqarrow )
280
281 #ifdef STB__clex_define_shifts
282 CLEX_shleq, CLEX_shreq,
283 #endif
284
285 CLEX_first_unused_token
286
287 #undef Y
288 #define Y(a) a
289 };
290
291 // Now for the rest of the file we'll use the basic definition where
292 // where Y expands to its contents and N expands to nothing
293 #undef N
294 #define N(a)
295
296 // API function
stb_c_lexer_init(stb_lexer * lexer,const char * input_stream,const char * input_stream_end,char * string_store,int store_length)297 void stb_c_lexer_init(stb_lexer *lexer, const char *input_stream, const char *input_stream_end, char *string_store, int store_length)
298 {
299 lexer->input_stream = (char *) input_stream;
300 lexer->eof = (char *) input_stream_end;
301 lexer->parse_point = (char *) input_stream;
302 lexer->string_storage = string_store;
303 lexer->string_storage_len = store_length;
304 }
305
306 // API function
stb_c_lexer_get_location(const stb_lexer * lexer,const char * where,stb_lex_location * loc)307 void stb_c_lexer_get_location(const stb_lexer *lexer, const char *where, stb_lex_location *loc)
308 {
309 char *p = lexer->input_stream;
310 int line_number = 1;
311 int char_offset = 0;
312 while (*p && p < where) {
313 if (*p == '\n' || *p == '\r') {
314 p += (p[0]+p[1] == '\r'+'\n' ? 2 : 1); // skip newline
315 line_number += 1;
316 char_offset = 0;
317 } else {
318 ++p;
319 ++char_offset;
320 }
321 }
322 loc->line_number = line_number;
323 loc->line_offset = char_offset;
324 }
325
326 // main helper function for returning a parsed token
stb__clex_token(stb_lexer * lexer,int token,char * start,char * end)327 static int stb__clex_token(stb_lexer *lexer, int token, char *start, char *end)
328 {
329 lexer->token = token;
330 lexer->where_firstchar = start;
331 lexer->where_lastchar = end;
332 lexer->parse_point = end+1;
333 return 1;
334 }
335
336 // helper function for returning eof
stb__clex_eof(stb_lexer * lexer)337 static int stb__clex_eof(stb_lexer *lexer)
338 {
339 lexer->token = CLEX_eof;
340 return 0;
341 }
342
// Returns 1 if x is a space, tab, carriage return, newline or form feed,
// and 0 for every other value.
static int stb__clex_iswhite(int x)
{
   switch (x) {
      case ' ':
      case '\t':
      case '\r':
      case '\n':
      case '\f':
         return 1;
      default:
         return 0;
   }
}
347
// Minimal strchr replacement: returns a pointer to the first occurrence of
// 'ch' in the 0-terminated string 'str', or 0 if it does not occur.
// The terminating NUL itself is never matched.
static const char *stb__strchr(const char *str, int ch)
{
   const char *cursor = str;

   while (*cursor != 0) {
      if (*cursor == ch)
         return cursor;
      ++cursor;
   }
   return 0;
}
355
356 // parse suffixes at the end of a number
stb__clex_parse_suffixes(stb_lexer * lexer,long tokenid,char * start,char * cur,const char * suffixes)357 static int stb__clex_parse_suffixes(stb_lexer *lexer, long tokenid, char *start, char *cur, const char *suffixes)
358 {
359 #ifdef STB__clex_parse_suffixes
360 lexer->string = lexer->string_storage;
361 lexer->string_len = 0;
362
363 while ((*cur >= 'a' && *cur <= 'z') || (*cur >= 'A' && *cur <= 'Z')) {
364 if (stb__strchr(suffixes, *cur) == 0)
365 return stb__clex_token(lexer, CLEX_parse_error, start, cur);
366 if (lexer->string_len+1 >= lexer->string_storage_len)
367 return stb__clex_token(lexer, CLEX_parse_error, start, cur);
368 lexer->string[lexer->string_len++] = *cur++;
369 }
370 #else
371 suffixes = suffixes; // attempt to suppress warnings
372 #endif
373 return stb__clex_token(lexer, tokenid, start, cur-1);
374 }
375
#ifndef STB__CLEX_use_stdlib
// Raises 'base' to the non-negative integer power 'exponent' by repeated
// squaring.
static double stb__clex_pow(double base, unsigned int exponent)
{
   double value=1;
   for ( ; exponent; exponent >>= 1) {
      if (exponent & 1)
         value *= base;
      base *= base;
   }
   return value;
}

// Returns the numeric value of digit character 'c' in the given base
// (10, or 16 when hex floats are enabled), or -1 if 'c' is not a digit.
static int stb__clex_digit_value(int c, int base)
{
   if (c >= '0' && c <= '9')
      return c - '0';
   #ifdef STB__clex_hex_floats
   if (base == 16) {
      if (c >= 'a' && c <= 'f')
         return 10 + (c - 'a');
      if (c >= 'A' && c <= 'F')
         return 10 + (c - 'A');
   }
   #else
   (void) base;
   #endif
   return -1;
}

// Approximate replacement for strtod, used when the stdlib is not.
// Parses digits, an optional '.' fraction and an optional exponent starting
// at 'p' and returns the value; *q receives the first unconsumed character.
// With hex floats enabled a "0x"/"0X" prefix selects base 16, where a 'p'
// exponent (a power of two, written in decimal) is mandatory.
// On failure (no mantissa digits, or a malformed hex exponent) *q is set to
// the original 'p' and 0 is returned.
// An 'e'/'E' that is not followed by at least one digit (after an optional
// sign) is left unconsumed, so "1.5em" parses as 1.5 followed by "em".
// The input is not bounds-checked: it must end in a character that cannot
// continue a number.
static double stb__clex_parse_float(char *p, char **q)
{
   char *s = p;
   double value=0;
   int base=10;
   int ndigits=0;
   int has_exponent;
   int d;

   #ifdef STB__clex_hex_floats
   if (p[0] == '0' && (p[1] == 'x' || p[1] == 'X')) {
      base=16;
      p += 2;
   }
   #endif

   // integer part
   while ((d = stb__clex_digit_value(*p, base)) >= 0) {
      value = value*base + d;
      ++ndigits;
      ++p;
   }

   // fractional part: accumulate the digits as an integer and scale once
   if (*p == '.') {
      double scale = 1, addend = 0;
      ++p;
      while ((d = stb__clex_digit_value(*p, base)) >= 0) {
         addend = addend*base + d;
         scale *= base;
         ++ndigits;
         ++p;
      }
      value += addend / scale;
   }

   if (ndigits == 0) {
      *q = s;
      return 0;
   }

   #ifdef STB__clex_hex_floats
   if (base == 16) {
      // exponent required for hex float literal
      if (*p != 'p' && *p != 'P') {
         *q = s;
         return 0;
      }
      has_exponent = 1;
   } else
   #endif
      has_exponent = (*p == 'e' || *p == 'E');

   if (has_exponent) {
      char *e = p+1;
      int negative = (*e == '-');
      if (*e == '-' || *e == '+')
         ++e;
      if (*e >= '0' && *e <= '9') {
         unsigned int exp_value = 0;
         double power;
         while (*e >= '0' && *e <= '9')
            exp_value = exp_value*10 + (unsigned int) (*e++ - '0');
         power = stb__clex_pow(base == 16 ? 2 : 10, exp_value);
         if (negative)
            value /= power;
         else
            value *= power;
         p = e;
      } else if (base == 16) {
         // 'p' without exponent digits is not a valid hex float
         *q = s;
         return 0;
      }
      // decimal: no digits after 'e', so the 'e' is not part of the number
   }
   *q = p;
   return value;
}
#endif
471
// Decodes one (possibly backslash-escaped) character starting at 'p'.
// Returns the character value (0..255) and stores the position following it
// in *q. Returns -1 for escape forms that are not supported yet (hex and
// unicode escapes); *q is then p+2.
// A backslash followed by a character that is not a known escape is returned
// as a plain '\\', consuming only the backslash.
static int stb__clex_parse_char(char *p, char **q)
{
   if (*p == '\\') {
      *q = p+2; // tentatively guess we'll parse two characters
      switch(p[1]) {
         case '\\': return '\\';
         case '\'': return '\'';
         case '"': return '"';
         case '?': return '?';
         case 'a': return '\a';
         case 'b': return '\b';
         case 't': return '\t';
         case 'f': return '\f';
         case 'n': return '\n';
         case 'r': return '\r';
         case 'v': return '\v';
         case '0': return '\0'; // @TODO octal constants
         case 'x': case 'X': return -1; // @TODO hex constants
         case 'u': return -1; // @TODO unicode constants
      }
   }
   *q = p+1;
   return (unsigned char) *p;
}
492
// Scans a quoted string whose opening delimiter is at 'p', decoding escapes
// into the lexer's string storage (0-terminated; length in string_len), and
// emits a token of the given 'type' ending on the closing delimiter.
// CLEX_parse_error is emitted instead when
//   - the input ends before the closing delimiter,
//   - an escape is unsupported (stb__clex_parse_char returns < 0), or
//   - the text plus its terminator does not fit in the string storage.
static int stb__clex_parse_string(stb_lexer *lexer, char *p, int type)
{
   char *start = p;
   char delim = *p++; // grab the " or ' for later matching
   char *out = lexer->string_storage;
   char *outend = lexer->string_storage + lexer->string_storage_len;

   for (;;) {
      int n;

      // unterminated string: stop at the end of the input instead of
      // scanning past it
      if (p == lexer->eof STB_C_LEX_0_IS_EOF( || *p == 0 ))
         return stb__clex_token(lexer, CLEX_parse_error, start, p-1);
      if (*p == delim)
         break;

      if (*p == '\\') {
         char *q;
         // a trailing backslash has no escaped character to look at
         if (p+1 == lexer->eof)
            return stb__clex_token(lexer, CLEX_parse_error, start, p);
         n = stb__clex_parse_char(p, &q);
         if (n < 0)
            return stb__clex_token(lexer, CLEX_parse_error, start, q);
         p = q;
      } else {
         // @OPTIMIZE: could speed this up by looping-while-not-backslash
         n = (unsigned char) *p++;
      }
      // need room for this character AND the terminating 0
      if (outend - out < 2)
         return stb__clex_token(lexer, CLEX_parse_error, start, p);
      // @TODO expand unicode escapes to UTF8
      *out++ = (char) n;
   }
   // only reachable with out == outend when the storage is empty
   if (out >= outend)
      return stb__clex_token(lexer, CLEX_parse_error, start, p);
   *out = 0;
   lexer->string = lexer->string_storage;
   lexer->string_len = (int) (out - lexer->string_storage);
   return stb__clex_token(lexer, type, start, p);
}
521
// API function
//
// Scans the next token starting at lexer->parse_point.
// Returns 0 at end of input (lexer->token == CLEX_eof) and 1 otherwise; a
// malformed token is reported as lexer->token == CLEX_parse_error with a
// return value of 1.
// On return lexer->where_firstchar / where_lastchar delimit the token and,
// depending on its kind, int_number, real_number or string/string_len hold
// its value. Whitespace, comments and (if configured) preprocessor lines are
// skipped first.
// Note: the stdlib number parsers (strtod/strtol) do not know about
// lexer->eof, so the input must not end in the middle of a number unless it
// is followed by a terminating byte.
int stb_c_lexer_get_token(stb_lexer *lexer)
{
   char *p = lexer->parse_point;

   // skip whitespace and comments
   for (;;) {
      #ifdef STB_C_LEX_ISWHITE
      while (p != lexer->eof) {
         int n;
         n = STB_C_LEX_ISWHITE(p);
         if (n == 0) break;
         if (lexer->eof && lexer->eof - p < n)
            return stb__clex_token(lexer, CLEX_parse_error, p,lexer->eof-1);
         p += n;
      }
      #else
      while (p != lexer->eof && stb__clex_iswhite(*p))
         ++p;
      #endif

      STB_C_LEX_CPP_COMMENTS(
         if (p != lexer->eof && p+1 != lexer->eof && p[0] == '/' && p[1] == '/') {
            while (p != lexer->eof STB_C_LEX_0_IS_EOF(&& *p != 0) && *p != '\r' && *p != '\n')
               ++p;
            continue;
         }
      )

      STB_C_LEX_C_COMMENTS(
         if (p != lexer->eof && p+1 != lexer->eof && p[0] == '/' && p[1] == '*') {
            char *start = p;
            p += 2;
            while (p != lexer->eof STB_C_LEX_0_IS_EOF(&& *p != 0) && (p[0] != '*' || p+1 == lexer->eof || p[1] != '/'))
               ++p;
            if (p == lexer->eof STB_C_LEX_0_IS_EOF(|| *p == 0))
               return stb__clex_token(lexer, CLEX_parse_error, start, p-1);
            p += 2;
            continue;
         }
      )

      #ifdef STB__clex_discard_preprocessor
      // @TODO this discards everything after a '#', regardless
      // of where in the line the # is, rather than requiring it
      // be at the start. (because this parser doesn't otherwise
      // check for line breaks!)
      if (p != lexer->eof && p[0] == '#') {
         while (p != lexer->eof STB_C_LEX_0_IS_EOF(&& *p != 0) && *p != '\r' && *p != '\n')
            ++p;
         continue;
      }
      #endif

      break;
   }

   if (p == lexer->eof)
      return stb__clex_eof(lexer);

   switch (*p) {
      default:
         if (   (*p >= 'a' && *p <= 'z')
             || (*p >= 'A' && *p <= 'Z')
             || *p == '_' || (unsigned char) *p >= 128    // >= 128 is UTF8 char
             STB_C_LEX_DOLLAR_IDENTIFIER( || *p == '$' ) )
         {
            int n = 0;
            lexer->string = lexer->string_storage;
            lexer->string_len = 0;
            do {
               if (n+1 >= lexer->string_storage_len)
                  return stb__clex_token(lexer, CLEX_parse_error, p, p+n);
               lexer->string[n] = p[n];
               ++n;
            } while (
                  p+n != lexer->eof    // never look at the byte at eof
               && (
                     (p[n] >= 'a' && p[n] <= 'z')
                  || (p[n] >= 'A' && p[n] <= 'Z')
                  || (p[n] >= '0' && p[n] <= '9') // allow digits in middle of identifier
                  || p[n] == '_' || (unsigned char) p[n] >= 128
                  STB_C_LEX_DOLLAR_IDENTIFIER( || p[n] == '$' )
               )
            );
            lexer->string[n] = 0;
            lexer->string_len = n;  // byte length of the identifier
            return stb__clex_token(lexer, CLEX_id, p, p+n-1);
         }

         // check for EOF
         STB_C_LEX_0_IS_EOF(
            if (*p == 0)
               return stb__clex_eof(lexer);
         )

      single_char:
         // not an identifier, return the character as itself
         return stb__clex_token(lexer, *p, p, p);

      case '+':
         if (p+1 != lexer->eof) {
            STB_C_LEX_C_INCREMENTS(if (p[1] == '+') return stb__clex_token(lexer, CLEX_plusplus, p,p+1);)
            STB_C_LEX_C_ARITHEQ(   if (p[1] == '=') return stb__clex_token(lexer, CLEX_pluseq  , p,p+1);)
         }
         goto single_char;
      case '-':
         if (p+1 != lexer->eof) {
            STB_C_LEX_C_INCREMENTS(if (p[1] == '-') return stb__clex_token(lexer, CLEX_minusminus, p,p+1);)
            STB_C_LEX_C_ARITHEQ(   if (p[1] == '=') return stb__clex_token(lexer, CLEX_minuseq   , p,p+1);)
            STB_C_LEX_C_ARROW(     if (p[1] == '>') return stb__clex_token(lexer, CLEX_arrow     , p,p+1);)
         }
         goto single_char;
      case '&':
         if (p+1 != lexer->eof) {
            STB_C_LEX_C_LOGICAL(  if (p[1] == '&') return stb__clex_token(lexer, CLEX_andand, p,p+1);)
            STB_C_LEX_C_BITWISEEQ(if (p[1] == '=') return stb__clex_token(lexer, CLEX_andeq , p,p+1);)
         }
         goto single_char;
      case '|':
         if (p+1 != lexer->eof) {
            STB_C_LEX_C_LOGICAL(  if (p[1] == '|') return stb__clex_token(lexer, CLEX_oror, p,p+1);)
            STB_C_LEX_C_BITWISEEQ(if (p[1] == '=') return stb__clex_token(lexer, CLEX_oreq, p,p+1);)
         }
         goto single_char;
      case '=':
         if (p+1 != lexer->eof) {
            STB_C_LEX_C_COMPARISONS(if (p[1] == '=') return stb__clex_token(lexer, CLEX_eq, p,p+1);)
            STB_C_LEX_EQUAL_ARROW(  if (p[1] == '>') return stb__clex_token(lexer, CLEX_eqarrow, p,p+1);)
         }
         goto single_char;
      case '!':
         STB_C_LEX_C_COMPARISONS(if (p+1 != lexer->eof && p[1] == '=') return stb__clex_token(lexer, CLEX_noteq, p,p+1);)
         goto single_char;
      case '^':
         STB_C_LEX_C_BITWISEEQ(if (p+1 != lexer->eof && p[1] == '=') return stb__clex_token(lexer, CLEX_xoreq, p,p+1);)
         goto single_char;
      case '%':
         STB_C_LEX_C_ARITHEQ(if (p+1 != lexer->eof && p[1] == '=') return stb__clex_token(lexer, CLEX_modeq, p,p+1);)
         goto single_char;
      case '*':
         STB_C_LEX_C_ARITHEQ(if (p+1 != lexer->eof && p[1] == '=') return stb__clex_token(lexer, CLEX_muleq, p,p+1);)
         goto single_char;
      case '/':
         STB_C_LEX_C_ARITHEQ(if (p+1 != lexer->eof && p[1] == '=') return stb__clex_token(lexer, CLEX_diveq, p,p+1);)
         goto single_char;
      case '<':
         if (p+1 != lexer->eof) {
            STB_C_LEX_C_COMPARISONS(if (p[1] == '=') return stb__clex_token(lexer, CLEX_lesseq, p,p+1);)
            STB_C_LEX_C_SHIFTS(     if (p[1] == '<') {
                                       STB_C_LEX_C_ARITHEQ(if (p+2 != lexer->eof && p[2] == '=')
                                                              return stb__clex_token(lexer, CLEX_shleq, p,p+2);)
                                       return stb__clex_token(lexer, CLEX_shl, p,p+1);
                                    }
                              )
         }
         goto single_char;
      case '>':
         if (p+1 != lexer->eof) {
            STB_C_LEX_C_COMPARISONS(if (p[1] == '=') return stb__clex_token(lexer, CLEX_greatereq, p,p+1);)
            STB_C_LEX_C_SHIFTS(     if (p[1] == '>') {
                                       STB_C_LEX_C_ARITHEQ(if (p+2 != lexer->eof && p[2] == '=')
                                                              return stb__clex_token(lexer, CLEX_shreq, p,p+2);)
                                       return stb__clex_token(lexer, CLEX_shr, p,p+1);
                                    }
                              )
         }
         goto single_char;

      case '"':
         STB_C_LEX_C_DQ_STRINGS(return stb__clex_parse_string(lexer, p, CLEX_dqstring);)
         goto single_char;
      case '\'':
         STB_C_LEX_C_SQ_STRINGS(return stb__clex_parse_string(lexer, p, CLEX_sqstring);)
         STB_C_LEX_C_CHARS(
         {
            char *start = p;
            if (p+1 == lexer->eof || (p[1] == '\\' && p+2 == lexer->eof))
               return stb__clex_token(lexer, CLEX_parse_error, start,start);
            lexer->int_number = stb__clex_parse_char(p+1, &p);
            if (lexer->int_number < 0)
               return stb__clex_token(lexer, CLEX_parse_error, start,start);
            if (p == lexer->eof)
               return stb__clex_token(lexer, CLEX_parse_error, start,p-1);
            if (*p != '\'')
               return stb__clex_token(lexer, CLEX_parse_error, start,p);
            return stb__clex_token(lexer, CLEX_charlit, start, p);
         })
         goto single_char;

      case '0':
         #if defined(STB__clex_hex_ints) || defined(STB__clex_hex_floats)
         if (p+1 != lexer->eof) {
            if (p[1] == 'x' || p[1] == 'X') {
               char *q;

               #ifdef STB__clex_hex_floats
               for (q=p+2;
                    q != lexer->eof && ((*q >= '0' && *q <= '9') || (*q >= 'a' && *q <= 'f') || (*q >= 'A' && *q <= 'F'));
                    ++q);
               if (q != lexer->eof) {
                  if (*q == '.' STB_C_LEX_FLOAT_NO_DECIMAL(|| *q == 'p' || *q == 'P')) {
                     #ifdef STB__CLEX_use_stdlib
                     lexer->real_number = strtod((char *) p, (char**) &q);
                     #else
                     lexer->real_number = stb__clex_parse_float(p, &q);
                     #endif

                     if (p == q)
                        return stb__clex_token(lexer, CLEX_parse_error, p,q);
                     return stb__clex_parse_suffixes(lexer, CLEX_floatlit, p,q, STB_C_LEX_FLOAT_SUFFIXES);

                  }
               }
               #endif   // STB__clex_hex_floats

               #ifdef STB__clex_hex_ints
               #ifdef STB__CLEX_use_stdlib
               lexer->int_number = strtol((char *) p, (char **) &q, 16);
               #else
               {
                  stb__clex_int n=0;
                  for (q=p+2; q != lexer->eof; ++q) {
                     if (*q >= '0' && *q <= '9')
                        n = n*16 + (*q - '0');
                     else if (*q >= 'a' && *q <= 'f')
                        n = n*16 + (*q - 'a') + 10;
                     else if (*q >= 'A' && *q <= 'F')
                        n = n*16 + (*q - 'A') + 10;
                     else
                        break;
                  }
                  // intfield is real_number when integers are parsed as doubles
                  lexer->intfield = n;
               }
               #endif
               // "0x" without any hex digit: the hand-written loop stops at
               // p+2, strtol stops after the "0" at p+1. Report the "0x"
               // itself as the bad token so lexing resumes after it.
               if (q - p <= 2)
                  return stb__clex_token(lexer, CLEX_parse_error, p,p+1);
               return stb__clex_parse_suffixes(lexer, CLEX_intlit, p,q, STB_C_LEX_HEX_SUFFIXES);
               #endif
            }
         }
         #endif // defined(STB__clex_hex_ints) || defined(STB__clex_hex_floats)
         // can't test for octal because we might parse '0.0' as float or as '0' '.' '0',
         // so have to do float first

         /* FALL THROUGH */
      case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':

         #ifdef STB__clex_decimal_floats
         {
            char *q = p;
            while (q != lexer->eof && (*q >= '0' && *q <= '9'))
               ++q;
            if (q != lexer->eof) {
               if (*q == '.' STB_C_LEX_FLOAT_NO_DECIMAL(|| *q == 'e' || *q == 'E')) {
                  #ifdef STB__CLEX_use_stdlib
                  lexer->real_number = strtod((char *) p, (char**) &q);
                  #else
                  lexer->real_number = stb__clex_parse_float(p, &q);
                  #endif

                  return stb__clex_parse_suffixes(lexer, CLEX_floatlit, p,q, STB_C_LEX_FLOAT_SUFFIXES);

               }
            }
         }
         #endif // STB__clex_decimal_floats

         #ifdef STB__clex_octal_ints
         if (p[0] == '0') {
            char *q = p;
            #ifdef STB__CLEX_use_stdlib
            lexer->int_number = strtol((char *) p, (char **) &q, 8);
            #else
            stb__clex_int n=0;
            while (q != lexer->eof) {
               if (*q >= '0' && *q <= '7')
                  n = n*8 + (*q - '0');
               else
                  break;
               ++q;
            }
            if (q != lexer->eof && (*q == '8' || *q=='9'))
               return stb__clex_token(lexer, CLEX_parse_error, p, q);
            lexer->intfield = n;
            #endif
            return stb__clex_parse_suffixes(lexer, CLEX_intlit, p,q, STB_C_LEX_OCTAL_SUFFIXES);
         }
         #endif // STB__clex_octal_ints

         #ifdef STB__clex_decimal_ints
         {
            char *q = p;
            #ifdef STB__CLEX_use_stdlib
            lexer->int_number = strtol((char *) p, (char **) &q, 10);
            #else
            stb__clex_int n=0;
            while (q != lexer->eof) {
               if (*q >= '0' && *q <= '9')
                  n = n*10 + (*q - '0');
               else
                  break;
               ++q;
            }
            lexer->intfield = n;
            #endif
            return stb__clex_parse_suffixes(lexer, CLEX_intlit, p,q, STB_C_LEX_DECIMAL_SUFFIXES);
         }
         #endif // STB__clex_decimal_ints
         goto single_char;
   }
}
825 #endif // STB_C_LEXER_IMPLEMENTATION
826
827 #ifdef STB_C_LEXER_SELF_TEST
828 #define _CRT_SECURE_NO_WARNINGS
829 #include <stdio.h>
830 #include <stdlib.h>
831
// Self-test helper: writes a textual rendering of the lexer's current token
// to stdout. Identifiers are prefixed with '_' and integer literals with '#'
// so they can be told apart when eyeballing the output.
static void print_token(stb_lexer *lexer)
{
   switch (lexer->token) {
      case CLEX_id        : printf("_%s", lexer->string); break;
      case CLEX_eq        : printf("=="); break;
      case CLEX_noteq     : printf("!="); break;
      case CLEX_lesseq    : printf("<="); break;
      case CLEX_greatereq : printf(">="); break;
      case CLEX_andand    : printf("&&"); break;
      case CLEX_oror      : printf("||"); break;
      case CLEX_shl       : printf("<<"); break;
      case CLEX_shr       : printf(">>"); break;
      case CLEX_plusplus  : printf("++"); break;
      case CLEX_minusminus: printf("--"); break;
      case CLEX_arrow     : printf("->"); break;
      case CLEX_andeq     : printf("&="); break;
      case CLEX_oreq      : printf("|="); break;
      case CLEX_xoreq     : printf("^="); break;
      case CLEX_pluseq    : printf("+="); break;
      case CLEX_minuseq   : printf("-="); break;
      case CLEX_muleq     : printf("*="); break;
      case CLEX_diveq     : printf("/="); break;
      case CLEX_modeq     : printf("%%="); break;
      case CLEX_shleq     : printf("<<="); break;
      case CLEX_shreq     : printf(">>="); break;
      case CLEX_eqarrow   : printf("=>"); break;
      case CLEX_dqstring  : printf("\"%s\"", lexer->string); break;
      case CLEX_sqstring  : printf("'\"%s\"'", lexer->string); break;
      case CLEX_charlit   : {
         // the lexer stores a character literal's value in int_number, not
         // in the string storage
         long c = lexer->int_number;
         if (c >= 32 && c < 127 && c != '\'' && c != '\\')
            printf("'%c'", (int) c);
         else
            printf("'\\x%02lx'", (unsigned long) c);
         break;
      }
      #if defined(STB__clex_int_as_double) && !defined(STB__CLEX_use_stdlib)
      case CLEX_intlit    : printf("#%g", lexer->real_number); break;
      #else
      case CLEX_intlit    : printf("#%ld", lexer->int_number); break;
      #endif
      case CLEX_floatlit  : printf("%g", lexer->real_number); break;
      default:
         if (lexer->token >= 0 && lexer->token < 256)
            printf("%c", (int) lexer->token);
         else {
            printf("<<<UNKNOWN TOKEN %ld >>>\n", lexer->token);
         }
         break;
   }
}
876
877 /* Force a test
878 of parsing
879 multiline comments */
880
881 /*/ comment /*/
882 /**/ extern /**/
883
// Never meant to do useful work: this function exists so that the self-test,
// which lexes this very file, encounters float literals in several notations
// and a string literal followed by more tokens.
void dummy(void)
{
   double some_floats[] = {
      1.0501,
      -10.4e12,
      5E+10,
#if 0 // not supported in C++ or C-pre-99, so don't try to compile it, but let our parser test it
      0x1.0p+24, 0xff.FP-8, 0x1p-23,
#endif
      4.
   };
   (void) sizeof(some_floats);

   printf("test %d",1); // https://github.com/nothings/stb/issues/13
}
897
main(int argc,char ** argv)898 int main(int argc, char **argv)
899 {
900 FILE *f = fopen("stb_c_lexer.h","rb");
901 char *text = (char *) malloc(1 << 20);
902 int len = f ? (int) fread(text, 1, 1<<20, f) : -1;
903 stb_lexer lex;
904 if (len < 0) {
905 fprintf(stderr, "Error opening file\n");
906 free(text);
907 fclose(f);
908 return 1;
909 }
910 fclose(f);
911
912 stb_c_lexer_init(&lex, text, text+len, (char *) malloc(0x10000), 0x10000);
913 while (stb_c_lexer_get_token(&lex)) {
914 if (lex.token == CLEX_parse_error) {
915 printf("\n<<<PARSE ERROR>>>\n");
916 break;
917 }
918 print_token(&lex);
919 printf(" ");
920 }
921 return 0;
922 }
923 #endif
924 /*
925 ------------------------------------------------------------------------------
926 This software is available under 2 licenses -- choose whichever you prefer.
927 ------------------------------------------------------------------------------
928 ALTERNATIVE A - MIT License
929 Copyright (c) 2017 Sean Barrett
930 Permission is hereby granted, free of charge, to any person obtaining a copy of
931 this software and associated documentation files (the "Software"), to deal in
932 the Software without restriction, including without limitation the rights to
933 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
934 of the Software, and to permit persons to whom the Software is furnished to do
935 so, subject to the following conditions:
936 The above copyright notice and this permission notice shall be included in all
937 copies or substantial portions of the Software.
938 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
939 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
940 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
941 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
942 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
943 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
944 SOFTWARE.
945 ------------------------------------------------------------------------------
946 ALTERNATIVE B - Public Domain (www.unlicense.org)
947 This is free and unencumbered software released into the public domain.
948 Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
949 software, either in source code form or as a compiled binary, for any purpose,
950 commercial or non-commercial, and by any means.
951 In jurisdictions that recognize copyright laws, the author or authors of this
952 software dedicate any and all copyright interest in the software to the public
953 domain. We make this dedication for the benefit of the public at large and to
954 the detriment of our heirs and successors. We intend this dedication to be an
955 overt act of relinquishment in perpetuity of all present and future rights to
956 this software under copyright law.
957 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
958 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
959 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
960 AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
961 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
962 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
963 ------------------------------------------------------------------------------
964 */
965