1// vim:set ft=cpp: 2 3#include "adlib/lib.h" 4#include "adlib/set.h" 5// "fixstr.h" must be included before "map.h" 6#include "fixstr.h" 7#include "adlib/map.h" 8 9#include "pplex.h" 10 11const char *SymbolNames[] = { 12 "SymNone", 13 "SymClass", 14 "SymNamespace", 15 "SymExtern", 16 "SymVAR", 17 "SymEXTERN_VAR", 18 "SymSTATIC_VAR", 19 "SymINST_VAR", 20 "SymEXTERN_INST_VAR", 21 "SymSTATIC_INST_VAR", 22 "SymIdent", 23 "SymLiteral", 24 "SymOp", 25 "SymSemicolon", 26 "SymComma", 27 "SymEqual", 28 "SymAst", 29 "SymAnd", 30 "SymAndAnd", 31 "SymColonColon", 32 "SymLPar", 33 "SymRPar", 34 "SymLBrkt", 35 "SymRBrkt", 36 "SymLBrace", 37 "SymRBrace", 38 "SymWS", 39 "SymEOL", 40 "SymComment", 41 "SymBAD", 42 "SymLineDir", 43 "SymEOF", 44 "SymGen", 45}; 46 47typedef Map<FixStr, Str *> InternMap; 48 49Str *Intern(const char *ptr, Int len) { 50 static InternMap *map = NULL; 51 if (!map) { 52 GCVar(map, new InternMap()); 53 } 54 FixStr fs; 55 fs.str = ptr; 56 fs.len = len; 57 Str *result = map->get(fs, NULL); 58 if (!result) { 59 result = new Str(ptr, len); 60 // `ptr` above is an interior pointer whose contents may disappear 61 // due to GC once the `SourceFile` object containing it is no 62 // longer reachable. 63 // 64 // Therefore, we replace it with `result->c_str()`. This is not only 65 // not an interior pointer, but is kept alive by the value that the 66 // key is in use for. 67 fs.str = result->c_str(); 68 map->add(fs, result); 69 } 70 return result; 71} 72 73#define PUSH_TOKEN(s) \ 74 token.sym = s; \ 75 goto pushtoken; 76 77bool Tokenize(SourceFile *source) { 78 Str *input = source->filedata; 79 const char *cursor = input->c_str(); 80 const char *marker = NULL; 81 const char *ctxmarker = NULL; 82 bool done = false; 83 bool error = false; 84 TokenList *result = new TokenList(); 85 Token token; 86 while (!done) { 87 const char *last = cursor; 88 /*!re2c 89 re2c:define:YYCTYPE = "unsigned char"; 90 re2c:yyfill:enable = 0; 91 re2c:define:YYCURSOR = cursor; 92 re2c:define:YYMARKER = marker; 93 re2c:define:YYCTXMARKER = ctxmarker; 94 95 alpha = [a-zA-Z_]; 96 digit = [0-9]; 97 oct = [0-7]; 98 hex = [0-9a-fA-F]; 99 floatsuffix = [fFlL]?; 100 intsuffix = [uUlL]*; 101 exp = 'e' [-+]? digit+; 102 squote = [']; 103 quote = ["]; 104 any = [^\000\r\n]; 105 anyunescaped = [^\000\r\n\\]; 106 sp = [ \t\f]; 107 eol = [\000\r\n]; 108 nl = "\r" | "\n" | "\r\n"; 109 postpparg = [^a-zA-Z0-9_\r\n\000]; 110 ppany = anyunescaped | ("\\" sp* nl); 111 pparg = (postpparg ppany *)?; 112 anystr = any \ ["\\]; 113 anych = any \ ['\\]; 114 longops = "..." | ">>=" | "<<=" | "+=" | "-=" | "*=" | "/=" | "%=" 115 | "&=" | "^=" | "|=" | ">>" | "<<" | "++" | "--" | "->" 116 | "&&" | "||" | "<=" | ">=" | "==" | "!="; 117 esc = "\\"; 118 119 "class" { PUSH_TOKEN(SymClass); } 120 "namespace" { PUSH_TOKEN(SymNamespace); } 121 "extern" { PUSH_TOKEN(SymExtern); } 122 "VAR" { PUSH_TOKEN(SymVAR); } 123 "EXTERN_VAR" { PUSH_TOKEN(SymEXTERN_VAR); } 124 "STATIC_VAR" { PUSH_TOKEN(SymSTATIC_VAR); } 125 "INST_VAR" { PUSH_TOKEN(SymINST_VAR); } 126 "EXTERN_INST_VAR" { PUSH_TOKEN(SymEXTERN_INST_VAR); } 127 "STATIC_INST_VAR" { PUSH_TOKEN(SymSTATIC_INST_VAR); } 128 alpha (alpha | digit)* { PUSH_TOKEN(SymIdent); } 129 '0x' hex+ intsuffix { PUSH_TOKEN(SymLiteral); } 130 '0' oct+ intsuffix { PUSH_TOKEN(SymLiteral); } 131 digit+ intsuffix { PUSH_TOKEN(SymLiteral); } 132 "L"? squote (esc any anych* | anych) squote { PUSH_TOKEN(SymLiteral); } 133 "L"? quote (esc any | anystr)* quote { PUSH_TOKEN(SymLiteral); } 134 digit+ exp floatsuffix { PUSH_TOKEN(SymLiteral); } 135 digit* "." digit+ exp? floatsuffix { PUSH_TOKEN(SymLiteral); } 136 digit+ "." digit* exp? floatsuffix { PUSH_TOKEN(SymLiteral); } 137 "(" { PUSH_TOKEN(SymLPar); } 138 ")" { PUSH_TOKEN(SymRPar); } 139 "[" { PUSH_TOKEN(SymLBrkt); } 140 "]" { PUSH_TOKEN(SymRBrkt); } 141 "{" { PUSH_TOKEN(SymLBrace); } 142 "}" { PUSH_TOKEN(SymRBrace); } 143 "=" { PUSH_TOKEN(SymEqual); } 144 "," { PUSH_TOKEN(SymComma); } 145 ";" { PUSH_TOKEN(SymSemicolon); } 146 "&" { PUSH_TOKEN(SymAnd); } 147 "&&" { PUSH_TOKEN(SymAndAnd); } 148 "::" { PUSH_TOKEN(SymColonColon); } 149 "*" { PUSH_TOKEN(SymAst); } 150 [-.&!~+*%/<>^|?:=,] { PUSH_TOKEN(SymOp); } 151 longops { PUSH_TOKEN(SymOp); } 152 ";" { PUSH_TOKEN(SymSemicolon); } 153 "//" any+ { PUSH_TOKEN(SymComment); } 154 "/" "*" { goto comment; } 155 nl { PUSH_TOKEN(SymEOL); } 156 "\\" sp* / nl { PUSH_TOKEN(SymWS); } 157 sp+ { PUSH_TOKEN(SymWS); } 158 "#" sp* digit+ "\"" anystr* "\"" (sp | digit)* { 159 PUSH_TOKEN(SymLineDir); 160 } 161 "\000" { done = true; continue; } 162 any { error = true; PUSH_TOKEN(SymBAD); } 163 * { done = true; continue; } 164 */ 165 comment: 166 /*!re2c 167 "*" "/" { PUSH_TOKEN(SymComment); } 168 [^\000] { goto comment; } 169 "\000" { done = true; PUSH_TOKEN(SymComment); } 170 */ 171 pushtoken: 172 token.str = Intern(last, cursor - last); 173 result->add(token); 174 } 175 token.sym = SymEOF; 176 token.str = S(""); 177 result->add(token); 178 source->tokens = result; 179 return !error; 180} 181 182SourceFile *ReadSource(Str *filename, Str *filedata) { 183 SourceFile *result = new SourceFile(); 184 result->filename = filename; 185 Str *modulename = filename->clone(); 186 for (Int i = 0; i < modulename->len(); i++) { 187 char ch = modulename->at(i); 188 if (ch >= 'a' && ch <= 'z') continue; 189 if (ch >= 'A' && ch <= 'Z') continue; 190 if (ch >= '0' && ch <= '9') continue; 191 if (ch == '_') continue; 192 modulename->at(i) = '_'; 193 } 194 result->modulename = modulename; 195 if (!filedata) 196 filedata = ReadFile(filename); 197 result->filedata = filedata; 198 if (!result->filedata) 199 return NULL; 200 Tokenize(result); 201 return result; 202} 203