// Python tokenizer
// Follows the lexical structure of the Python 3.x language reference:
// https://docs.python.org/3.6/reference/lexical_analysis.html
// Relies on the RE/flex matcher for lazy quantifiers and for indent/dedent
// matching.

// code injected into the Lexer class: scanner state and a helper method
%class{
  // write one token to the Lexer's current output stream out(): the token
  // label followed by the optional text (nothing extra when what is NULL)
  void emit(const char *token, const char *what = NULL)
  {
    const char *lexeme = what ? what : "";
    out() << token << lexeme << std::endl;
  }
  // nesting depth of open brackets, tracked for implicit line joins
  int join;
}

// pull in the Python 3.x token definitions
%include "pdefs.l"

// the JOINING start condition state performs implicit line joining
%x JOINING

// full/fast option: chosen for speed and to eliminate tokenizer startup time
%option full

// indent tabs are set to 8
%option tabs=8

// let dot match \n too
%option dotall
// warning: in unicode mode dot is a "catch all" that also matches invalid
// UTF-8, so {any} is used to match anything that is valid Unicode/UTF-8;
// {any} is defined in pdefs.l as all Unicode planes without the surrogate
// halves, i.e. {any} stands for \p{Unicode}

%%

^\f?\h*         // consume nodent margin space

\r?\n           |
#{any}*?\r?\n   emit("NEWLINE");

^\f?\h+\i       emit("INDENT");

^\f?\h*\j       |
\j              emit("DEDENT");

(?^\h+)         // consume spaces and tabs

(?^^(\f?\h*(#{any}*?)?\r?\n)+)
                // consume blank lines and comments, anchor ^ in (?^X)

(?^\\\r?\n\f?\h*) // explicit line joining: consume the backslash, the line end and leading space
[[({]           { // an opening bracket starts implicit line joining
                  join = 1;
                  matcher().push_stops(); // save the indent stops
                  start(JOINING);
                  emit("DELIMITER ", text());
                }

<JOINING>{

\s+             // consume all white space, line ends included

#{any}*?\r?\n   // consume comments

[[({]           { // one more nesting level
                  ++join;
                  emit("DELIMITER ", text());
                }

[])}]           { // leave JOINING once the outermost bracket is closed
                  --join;
                  if (join == 0)
                  {
                    matcher().pop_stops(); // restore the indent stops
                    start(INITIAL);
                  }
                  emit("DELIMITER ", text());
                }

}

<*>{

{stringliteral} emit("STRING ", text());

{bytesliteral}  emit("BYTES ", text());

{integer}       emit("INTEGER ", text());

{floatnumber}   emit("FLOAT ", text());

{imagnumber}    emit("IMAG ", text());

False           |
None            |
True            |
and             |
as              |
assert          |
break           |
class           |
continue        |
def             |
del             |
elif            |
else            |
except          |
finally         |
for             |
from            |
global          |
if              |
import          |
in              |
is              |
lambda          |
nonlocal        |
not             |
or              |
pass            |
raise           |
return          |
try             |
while           |
with            |
yield           emit("KEYWORD ", text());

{identifier}    emit("IDENTIFIER ", text());

"+"             |
"-"             |
"*"             |
"/"             |
"%"             |
"&"             |
"|"             |
"^"             |
"~"             |
"<"             |
">"             |
"**"            |
"//"            |
"<<"            |
">>"            |
"<="            |
">="            |
"=="            |
"!="            emit("OPERATOR ", text());

"@"             |
"->"            |
"+="            |
"-="            |
"*="            |
"/="            |
"%="            |
"@="            |
"&="            |
"|="            |
"^="            |
">>="           |
"<<="           |
"**="           |
"//="           |
"="             |
","             |
":"             |
"."             |
";"             emit("DELIMITER ", text());

.               { // catch-all: anything not matched above is invalid input
                  std::cerr << "Error: invalid input at line " << lineno() << " column " << columno() << std::endl;
                  // stop scanning with a nonzero result so that main() can
                  // tell this apart from reaching the end of the input
                  return 1;
                }

}

%%

// Tokenize the file named by the first command-line argument, or standard
// input when no argument is given.  The exit status is EXIT_FAILURE when the
// file cannot be opened or when scanning stopped on invalid input, and
// EXIT_SUCCESS otherwise.
int main(int argc, char **argv)
{
  // in this example we'll use the Input class with a FILE or stdin to scan UTF-8/16/32 input
  reflex::Input input;

  if (argc > 1)
  {
    input = fopen(argv[1], "r");
    if (input.file() == NULL)
    {
      perror("Cannot open file for reading");
      exit(EXIT_FAILURE);
    }
  }
  else
  {
    input = stdin;
  }

  // no rule other than the catch-all error rule returns a value, so a nonzero
  // result means invalid input was found
  // (lex() is expected to yield 0 at end of input, see the RE/flex user guide)
  const int status = Lexer(input).lex();

  // only close a file that was opened here, never stdin
  if (input.file() != stdin)
    fclose(input.file());

  return status == 0 ? EXIT_SUCCESS : EXIT_FAILURE;
}