1// vim:set ft=cpp:
2
3#include "adlib/lib.h"
4#include "adlib/set.h"
5// "fixstr.h" must be included before "map.h"
6#include "fixstr.h"
7#include "adlib/map.h"
8
9#include "pplex.h"
10
// Printable names of the token symbols, one string per `Sym*` constant.
//
// NOTE(review): presumably indexed by the symbol enumeration declared in
// "pplex.h", in which case the order of the entries below must match the
// order of that enumeration exactly -- confirm before adding or
// reordering entries.
const char *SymbolNames[] = {
  "SymNone",
  "SymClass",
  "SymNamespace",
  "SymExtern",
  "SymVAR",
  "SymEXTERN_VAR",
  "SymSTATIC_VAR",
  "SymINST_VAR",
  "SymEXTERN_INST_VAR",
  "SymSTATIC_INST_VAR",
  "SymIdent",
  "SymLiteral",
  "SymOp",
  "SymSemicolon",
  "SymComma",
  "SymEqual",
  "SymAst",
  "SymAnd",
  "SymAndAnd",
  "SymColonColon",
  "SymLPar",
  "SymRPar",
  "SymLBrkt",
  "SymRBrkt",
  "SymLBrace",
  "SymRBrace",
  "SymWS",
  "SymEOL",
  "SymComment",
  "SymBAD",
  "SymLineDir",
  "SymEOF",
  "SymGen",
};
46
47typedef Map<FixStr, Str *> InternMap;
48
49Str *Intern(const char *ptr, Int len) {
50  static InternMap *map = NULL;
51  if (!map) {
52    GCVar(map, new InternMap());
53  }
54  FixStr fs;
55  fs.str = ptr;
56  fs.len = len;
57  Str *result = map->get(fs, NULL);
58  if (!result) {
59    result = new Str(ptr, len);
60    // `ptr` above is an interior pointer whose contents may disappear
61    // due to GC once the `SourceFile` object containing it is no
62    // longer reachable.
63    //
64    // Therefore, we replace it with `result->c_str()`. This is not only
65    // not an interior pointer, but is kept alive by the value that the
66    // key is in use for.
67    fs.str = result->c_str();
68    map->add(fs, result);
69  }
70  return result;
71}
72
// Sets the symbol of the in-progress `token` to `s` and jumps to the
// `pushtoken` label of the enclosing function, which is expected to
// finish the token and store it.
//
// Both `token` and the `pushtoken` label must be visible at the point of
// use. The body is wrapped in `do { ... } while (0)` so that
// `PUSH_TOKEN(x);` expands to exactly one statement and therefore stays
// correct inside an unbraced `if`/`else`.
#define PUSH_TOKEN(s) \
  do { \
    token.sym = (s); \
    goto pushtoken; \
  } while (0)
76
77bool Tokenize(SourceFile *source) {
78  Str *input = source->filedata;
79  const char *cursor = input->c_str();
80  const char *marker = NULL;
81  const char *ctxmarker = NULL;
82  bool done = false;
83  bool error = false;
84  TokenList *result = new TokenList();
85  Token token;
86  while (!done) {
87    const char *last = cursor;
88    /*!re2c
89    re2c:define:YYCTYPE = "unsigned char";
90    re2c:yyfill:enable = 0;
91    re2c:define:YYCURSOR = cursor;
92    re2c:define:YYMARKER = marker;
93    re2c:define:YYCTXMARKER = ctxmarker;
94
95    alpha = [a-zA-Z_];
96    digit = [0-9];
97    oct = [0-7];
98    hex = [0-9a-fA-F];
99    floatsuffix = [fFlL]?;
100    intsuffix = [uUlL]*;
101    exp = 'e' [-+]? digit+;
102    squote = ['];
103    quote = ["];
104    any = [^\000\r\n];
105    anyunescaped = [^\000\r\n\\];
106    sp = [ \t\f];
107    eol = [\000\r\n];
108    nl = "\r" | "\n" | "\r\n";
109    postpparg = [^a-zA-Z0-9_\r\n\000];
110    ppany = anyunescaped | ("\\" sp* nl);
111    pparg = (postpparg ppany *)?;
112    anystr = any \ ["\\];
113    anych = any \ ['\\];
114    longops = "..." | ">>=" | "<<=" | "+=" | "-=" | "*=" | "/=" | "%="
115            | "&=" | "^=" | "|=" | ">>" | "<<" | "++" | "--" | "->"
116            | "&&" | "||" | "<=" | ">=" | "==" | "!=";
117    esc = "\\";
118
119    "class"         { PUSH_TOKEN(SymClass); }
120    "namespace"     { PUSH_TOKEN(SymNamespace); }
121    "extern"        { PUSH_TOKEN(SymExtern); }
122    "VAR"           { PUSH_TOKEN(SymVAR); }
123    "EXTERN_VAR"    { PUSH_TOKEN(SymEXTERN_VAR); }
124    "STATIC_VAR"    { PUSH_TOKEN(SymSTATIC_VAR); }
125    "INST_VAR"      { PUSH_TOKEN(SymINST_VAR); }
126    "EXTERN_INST_VAR" { PUSH_TOKEN(SymEXTERN_INST_VAR); }
127    "STATIC_INST_VAR" { PUSH_TOKEN(SymSTATIC_INST_VAR); }
128    alpha (alpha | digit)* { PUSH_TOKEN(SymIdent); }
129    '0x' hex+ intsuffix { PUSH_TOKEN(SymLiteral); }
130    '0' oct+ intsuffix { PUSH_TOKEN(SymLiteral); }
131    digit+ intsuffix { PUSH_TOKEN(SymLiteral); }
132    "L"? squote (esc any anych* | anych) squote { PUSH_TOKEN(SymLiteral); }
133    "L"? quote (esc any | anystr)* quote { PUSH_TOKEN(SymLiteral); }
134    digit+ exp floatsuffix { PUSH_TOKEN(SymLiteral); }
135    digit* "." digit+ exp? floatsuffix { PUSH_TOKEN(SymLiteral); }
136    digit+ "." digit* exp? floatsuffix { PUSH_TOKEN(SymLiteral); }
137    "(" { PUSH_TOKEN(SymLPar); }
138    ")" { PUSH_TOKEN(SymRPar); }
139    "[" { PUSH_TOKEN(SymLBrkt); }
140    "]" { PUSH_TOKEN(SymRBrkt); }
141    "{" { PUSH_TOKEN(SymLBrace); }
142    "}" { PUSH_TOKEN(SymRBrace); }
143    "=" { PUSH_TOKEN(SymEqual); }
144    "," { PUSH_TOKEN(SymComma); }
145    ";" { PUSH_TOKEN(SymSemicolon); }
146    "&" { PUSH_TOKEN(SymAnd); }
147    "&&" { PUSH_TOKEN(SymAndAnd); }
148    "::" { PUSH_TOKEN(SymColonColon); }
149    "*" { PUSH_TOKEN(SymAst); }
150    [-.&!~+*%/<>^|?:=,] { PUSH_TOKEN(SymOp); }
151    longops { PUSH_TOKEN(SymOp); }
152    ";" { PUSH_TOKEN(SymSemicolon); }
153    "//" any+ { PUSH_TOKEN(SymComment); }
154    "/" "*" { goto comment; }
155    nl { PUSH_TOKEN(SymEOL); }
156    "\\" sp* / nl { PUSH_TOKEN(SymWS); }
157    sp+ { PUSH_TOKEN(SymWS); }
158    "#" sp* digit+ "\"" anystr* "\"" (sp | digit)* {
159      PUSH_TOKEN(SymLineDir);
160    }
161    "\000" { done = true; continue; }
162    any { error = true; PUSH_TOKEN(SymBAD); }
163    * { done = true; continue; }
164    */
165    comment:
166    /*!re2c
167    "*" "/" { PUSH_TOKEN(SymComment); }
168    [^\000] { goto comment; }
169    "\000" { done = true; PUSH_TOKEN(SymComment); }
170    */
171    pushtoken:
172      token.str = Intern(last, cursor - last);
173      result->add(token);
174  }
175  token.sym = SymEOF;
176  token.str = S("");
177  result->add(token);
178  source->tokens = result;
179  return !error;
180}
181
182SourceFile *ReadSource(Str *filename, Str *filedata) {
183  SourceFile *result = new SourceFile();
184  result->filename = filename;
185  Str *modulename = filename->clone();
186  for (Int i = 0; i < modulename->len(); i++) {
187    char ch = modulename->at(i);
188    if (ch >= 'a' && ch <= 'z') continue;
189    if (ch >= 'A' && ch <= 'Z') continue;
190    if (ch >= '0' && ch <= '9') continue;
191    if (ch == '_') continue;
192    modulename->at(i) = '_';
193  }
194  result->modulename = modulename;
195  if (!filedata)
196    filedata = ReadFile(filename);
197  result->filedata = filedata;
198  if (!result->filedata)
199    return NULL;
200  Tokenize(result);
201  return result;
202}
203