1 // Python tokenizer
2 // Conforms to the Python 3.x language reference lexical structure:
3 // https://docs.python.org/3.6/reference/lexical_analysis.html
4 // Uses RE/flex matcher for lazy quants and for indent/dedent matching.
5 
6 // inject code into the Lexer class for the scanner state and auxiliary methods
%class{
  // Nesting depth of open (, [ and { brackets. The rules set it to 1 when
  // implicit line joining starts and leave the JOINING state once it drops
  // back to 0.
  int join;

  // Write one token line to the Lexer's current output stream out():
  // the token label, then the token text if one is given, then a line end.
  void emit(const char *token, const char *what = NULL)
  {
    out() << token;
    if (what)
      out() << what;
    else
      out() << "";
    out() << std::endl;
  }
}
16 
17 // import Python 3.x token definitions
18 %include "pdefs.l"
19 
20 // implicit line joining is done with the JOINING start condition state
21 %x JOINING
22 
23 // use full/fast option for speed and to eliminate tokenizer startup time
24 %option full
25 
26 // set indent tabs=8
27 %option tabs=8
28 
29 // dot matches \n
30 %option dotall
31 // warning: in unicode mode dot is a "catch all" and also matches invalid UTF-8
32 // so we use {any} to match anything that is valid Unicode/UTF-8, where {any}
33 // is defined in pdefs.l as all Unicode planes without the surrogate halves:
34 // {any} stands for \p{Unicode}
35 
36 %%
37 
^\f?\h*                 // discard the margin space of a nodent line

\r?\n                   |
#{any}*?\r?\n           { emit("NEWLINE"); }

^\f?\h+\i               { emit("INDENT"); }

^\f?\h*\j               |
\j                      { emit("DEDENT"); }

(?^\h+)                 // discard spaces and tabs

(?^^(\f?\h*(#{any}*?)?\r?\n)+)
                        // discard blank lines and comment lines, anchor ^ in (?^X)

(?^\\\r?\n\f?\h*)       // explicit line joining: discard \ \n and the margin after it

[[({]                   {
                          // an opening bracket starts implicit line joining
                          matcher().push_stops(); // save the indent stops
                          join = 1;
                          start(JOINING);
                          emit("DELIMITER  ", text());
                        }
60 
<JOINING>{

\s+                     // discard all white space, line breaks included

#{any}*?\r?\n           // discard a comment through the end of its line

[[({]                   {
                          // one more bracket level to close before leaving this state
                          join += 1;
                          emit("DELIMITER  ", text());
                        }

[])}]                   {
                          join -= 1;
                          if (join == 0)
                          {
                            // last open bracket closed: back to normal scanning
                            matcher().pop_stops(); // restore the indent stops
                            start(INITIAL);
                          }
                          emit("DELIMITER  ", text());
                        }

}
78 
<*>{

{stringliteral}         { emit("STRING     ", text()); }

{bytesliteral}          { emit("BYTES      ", text()); }

{integer}               { emit("INTEGER    ", text()); }

{floatnumber}           { emit("FLOAT      ", text()); }

{imagnumber}            { emit("IMAG       ", text()); }

False                   |
None                    |
True                    |
and                     |
as                      |
assert                  |
break                   |
class                   |
continue                |
def                     |
del                     |
elif                    |
else                    |
except                  |
finally                 |
for                     |
from                    |
global                  |
if                      |
import                  |
in                      |
is                      |
lambda                  |
nonlocal                |
not                     |
or                      |
pass                    |
raise                   |
return                  |
try                     |
while                   |
with                    |
yield                   {
                          // shared action for every reserved word listed above
                          emit("KEYWORD    ", text());
                        }

{identifier}            { emit("IDENTIFIER ", text()); }

"+"                     |
"-"                     |
"*"                     |
"/"                     |
"%"                     |
"&"                     |
"|"                     |
"^"                     |
"~"                     |
"<"                     |
">"                     |
"**"                    |
"//"                    |
"<<"                    |
">>"                    |
"<="                    |
">="                    |
"=="                    |
"!="                    {
                          // shared action for every operator listed above
                          emit("OPERATOR   ", text());
                        }

"@"                     |
"->"                    |
"+="                    |
"-="                    |
"*="                    |
"/="                    |
"%="                    |
"@="                    |
"&="                    |
"|="                    |
"^="                    |
">>="                   |
"<<="                   |
"**="                   |
"//="                   |
"="                     |
","                     |
":"                     |
"."                     |
";"                     {
                          // shared action for every delimiter listed above
                          emit("DELIMITER  ", text());
                        }

.                       {
                          // fallback for input no other rule accepts:
                          // report the position on stderr and return from the scanner
                          std::cerr << "Error: invalid input at line " << lineno()
                                    << " column " << columno() << std::endl;
                          return 0;
                        }

}
172 
173 %%
174 
175 int main(int argc, char **argv)
176 {
177   // in this example we'll use the Input class with a FILE or stdin to scan UTF-8/16/32 input
178   reflex::Input input;
179 
180   if (argc > 1)
181   {
182     input = fopen(argv[1], "r");
183     if (input.file() == NULL)
184     {
185       perror("Cannot open file for reading");
186       exit(EXIT_FAILURE);
187     }
188   }
189   else
190   {
191     input = stdin;
192   }
193 
194   Lexer(input).lex();
195 
196   if (input.file() != stdin)
197     fclose(input.file());
198 
199   return 0;
200 }
201