1 // Copyright (c) 1994 James Clark
2 // See the file COPYING for copying permission.
3
4 #ifdef __GNUG__
5 #pragma implementation
6 #endif
7 #include "splib.h"
8 #include <limits.h>
9 #include <stdlib.h>
10 #include "macros.h"
11 #include "types.h"
12 #include "Syntax.h"
13 #include "token.h"
14 #include "Sd.h"
15 #include "Mode.h"
16 #include "ModeInfo.h"
17
18 #ifdef SP_NAMESPACE
19 namespace SP_NAMESPACE {
20 #endif
21
// Requirement bits stored in PackedTokenInfo::flags.  ModeInfo's
// constructor starts with every bit "missing" and clears the ones the
// SGML declaration satisfies; ModeInfo::nextToken skips any table row
// that still has a missing requirement bit set.
const unsigned REQUIRE_EMPTY_STARTTAG = 1u << 0; // cleared if sd.startTagEmpty()
const unsigned REQUIRE_EMPTY_ENDTAG = 1u << 1;   // cleared if sd.endTagEmpty()
const unsigned REQUIRE_CONCUR = 1u << 2;         // cleared if sd.concur()
const unsigned REQUIRE_LINK_OR_CONCUR = 1u << 3; // cleared if sd.link() or sd.concur()
const unsigned REQUIRE_NOT_KEEPRSRE = 1u << 4;   // cleared if !sd.keeprsre()
// All of the requirement bits together.
const unsigned REQUIRE_FLAGS = (REQUIRE_EMPTY_STARTTAG
                                | REQUIRE_EMPTY_ENDTAG
                                | REQUIRE_CONCUR
                                | REQUIRE_LINK_OR_CONCUR
                                | REQUIRE_NOT_KEEPRSRE);

// Number of bits in an unsigned long.
#define ULONG_BIT (CHAR_BIT * sizeof(unsigned long))
30
31 struct PackedTokenInfo {
32 Token token; // token to be returned
33 unsigned flags;
34 unsigned char contents[2]; // components of the delimiter or d-i-c
35 unsigned char modes[25]; // list of modes in which it is recognized,
36 // terminated by EOM
37 // a bit vector computed from modes (lo to hi)
38 unsigned long modeBits[(nModes + ULONG_BIT - 1)/ULONG_BIT];
39 void computeModeBits();
40 Boolean inMode(Mode mode) const;
41 };
42
43 const unsigned char SET = Syntax::nDelimGeneral;
44 const unsigned char FUNCTION = SET + Syntax::nSet;
45 const unsigned char NOTHING = UCHAR_MAX;
46
47 const unsigned char EOM = 255; // end of modes
48
49 static PackedTokenInfo tokenTable[] = {
50 // Delimiters and delimiters in context
51 { tokenAnd, 0, { Syntax::dAND, NOTHING }, { grpMode, EOM }},
52 { tokenCom, 0, { Syntax::dCOM, NOTHING },
53 { mdMode, mdMinusMode, mdPeroMode, sdMode, comMode, sdcomMode, piPasMode, EOM }},
54 { tokenCroDigit, 0, { Syntax::dCRO, SET + Syntax::digit },
55 { econMode, mconMode, rcconMode, econnetMode, mconnetMode, rcconnetMode,
56 rcconeMode, plitMode, plitaMode, pliteMode, sdplitMode, sdplitaMode,
57 alitMode, alitaMode, aliteMode,
58 talitMode, talitaMode, taliteMode, rcmsMode, EOM }},
59 { tokenCroNameStart, 0, { Syntax::dCRO, SET + Syntax::nameStart },
60 { econMode, mconMode, rcconMode, econnetMode, mconnetMode, rcconnetMode,
61 rcconeMode, plitMode, plitaMode, pliteMode, sdplitMode, sdplitaMode,
62 alitMode, alitaMode, aliteMode,
63 talitMode, talitaMode, taliteMode, rcmsMode, EOM }},
64 { tokenDsc, 0, { Syntax::dDSC, NOTHING },
65 { /* mdMode, */ asMode, dsMode, EOM }},
66 { tokenDso, 0, { Syntax::dDSO, NOTHING }, { mdMode, EOM }},
67 { tokenDtgc, 0, { Syntax::dDTGC, NOTHING }, { grpMode, EOM }},
68 { tokenDtgo, 0, { Syntax::dDTGO, NOTHING }, { grpMode, EOM }},
69 { tokenEroNameStart, 0, { Syntax::dERO, SET + Syntax::nameStart },
70 { econMode, mconMode, rcconMode, econnetMode, mconnetMode, rcconnetMode,
71 rcconeMode, alitMode, alitaMode, aliteMode, talitMode, talitaMode,
72 taliteMode, rcmsMode, EOM }},
73 { tokenEroGrpo, REQUIRE_LINK_OR_CONCUR, { Syntax::dERO, Syntax::dGRPO },
74 { econMode, mconMode, rcconMode, econnetMode, mconnetMode, rcconnetMode,
75 rcconeMode, alitMode, alitaMode, aliteMode, talitMode, talitaMode,
76 taliteMode, rcmsMode, EOM }},
77 { tokenEtago, 0, { Syntax::dETAGO, NOTHING }, { tagMode, EOM }},
78 { tokenEtagoNameStart, 0, { Syntax::dETAGO, SET + Syntax::nameStart },
79 { econMode, mconMode, cconMode, rcconMode,
80 econnetMode, mconnetMode, cconnetMode, rcconnetMode, EOM }},
81 { tokenEtagoTagc, REQUIRE_EMPTY_ENDTAG, { Syntax::dETAGO, Syntax::dTAGC },
82 { econMode, mconMode, cconMode, rcconMode,
83 econnetMode, mconnetMode, cconnetMode, rcconnetMode, EOM }},
84 { tokenEtagoGrpo, REQUIRE_CONCUR, { Syntax::dETAGO, Syntax::dGRPO },
85 { econMode, mconMode, cconMode, rcconMode,
86 econnetMode, mconnetMode, cconnetMode, rcconnetMode, EOM }},
87 { tokenGrpc, 0, { Syntax::dGRPC, NOTHING }, { grpMode, EOM }},
88 { tokenGrpo, 0, { Syntax::dGRPO, NOTHING },
89 { mdMode, mdMinusMode, grpMode, EOM }},
90 { tokenHcroHexDigit, 0, { Syntax::dHCRO, SET + Syntax::hexDigit },
91 { econMode, mconMode, rcconMode, econnetMode, mconnetMode, rcconnetMode,
92 rcconeMode, plitMode, plitaMode, pliteMode,
93 alitMode, alitaMode, aliteMode,
94 talitMode, talitaMode, taliteMode, rcmsMode, EOM }},
95 { tokenLit, 0, { Syntax::dLIT, NOTHING },
96 { alitMode, talitMode, plitMode, sdplitMode, mlitMode, slitMode, sdslitMode,
97 asMode, piPasMode, tagMode, mdMode, sdMode, grpMode, EOM }},
98 { tokenLita, 0, { Syntax::dLITA, NOTHING },
99 { alitaMode, talitaMode, plitaMode, sdplitaMode, mlitaMode, slitaMode, sdslitaMode,
100 asMode, piPasMode, tagMode, mdMode, sdMode, grpMode, EOM }},
101 { tokenMdc, 0, { Syntax::dMDC, NOTHING }, { mdMode, sdMode, EOM }},
102 { tokenMdoNameStart, 0, { Syntax::dMDO, SET + Syntax::nameStart },
103 { econMode, mconMode, econnetMode, mconnetMode,
104 proMode, dsMode, dsiMode, EOM }},
105 { tokenMdoMdc, 0, { Syntax::dMDO, Syntax::dMDC },
106 { econMode, mconMode, econnetMode, mconnetMode,
107 proMode, dsMode, dsiMode, EOM }},
108 { tokenMdoCom, 0, { Syntax::dMDO, Syntax::dCOM },
109 { econMode, mconMode, econnetMode, mconnetMode,
110 proMode, dsMode, dsiMode, EOM }},
111 { tokenMdoDso, 0, { Syntax::dMDO, Syntax::dDSO },
112 { econMode, mconMode, econnetMode, mconnetMode,
113 dsMode, dsiMode, imsMode, EOM }},
114 { tokenMinus, 0, { Syntax::dMINUS, NOTHING }, { mdMinusMode, sdMode, EOM }},
115 { tokenMinusGrpo, 0, { Syntax::dMINUS, Syntax::dGRPO }, { mdMode, EOM }},
116 { tokenMscMdc, 0, { Syntax::dMSC, Syntax::dMDC},
117 { imsMode, cmsMode, rcmsMode,
118 econMode, mconMode, econnetMode, mconnetMode, dsMode, dsiMode, EOM }},
119 { tokenNestc, 0, { Syntax::dNESTC, NOTHING }, { tagMode, EOM }},
120 { tokenNet, 0, { Syntax::dNET, NOTHING },
121 { econnetMode, mconnetMode, cconnetMode, rcconnetMode, EOM }},
122 { tokenOpt, 0, { Syntax::dOPT, NOTHING }, { grpMode, grpsufMode, EOM }},
123 { tokenOr, 0, { Syntax::dOR, NOTHING }, { grpMode, EOM }},
124 { tokenPero, 0, { Syntax::dPERO, NOTHING }, { mdPeroMode, EOM }},
125 { tokenPeroNameStart, 0, { Syntax::dPERO, SET + Syntax::nameStart }, {
126 mdMode, mdMinusMode, mdPeroMode, dsMode, dsiMode, grpMode,
127 plitMode, plitaMode, pliteMode, sdplitMode, sdplitaMode, EOM }},
128 { tokenPeroGrpo, REQUIRE_LINK_OR_CONCUR, { Syntax::dPERO, Syntax::dGRPO },
129 { mdMode, mdMinusMode, mdPeroMode, dsMode, dsiMode, grpMode,
130 plitMode, plitaMode, pliteMode, sdplitMode, sdplitaMode, EOM }},
131 { tokenPic, 0, { Syntax::dPIC, NOTHING }, { piMode, EOM }},
132 { tokenPio, 0, { Syntax::dPIO, NOTHING },
133 { econMode, mconMode, econnetMode, mconnetMode, proMode,
134 dsMode, dsiMode, EOM }},
135 { tokenPlus, 0, { Syntax::dPLUS, NOTHING }, { grpMode, grpsufMode, EOM }},
136 { tokenPlusGrpo, 0, { Syntax::dPLUS, Syntax::dGRPO }, { mdMode, EOM }},
137 { tokenRefc, 0, { Syntax::dREFC, NOTHING }, { refMode, EOM }},
138 { tokenRep, 0, { Syntax::dREP, NOTHING }, { grpMode, grpsufMode, EOM }},
139 { tokenRni, 0, { Syntax::dRNI, NOTHING },
140 { grpMode, mdMode, mdPeroMode, EOM }},
141 { tokenSeq, 0, { Syntax::dSEQ, NOTHING }, { grpMode, EOM }},
142 { tokenStago, 0, { Syntax::dSTAGO, NOTHING }, { tagMode, EOM }},
143 { tokenStagoNameStart, 0, { Syntax::dSTAGO, SET + Syntax::nameStart },
144 { econMode, mconMode, econnetMode, mconnetMode, EOM }},
145 { tokenStagoTagc, REQUIRE_EMPTY_STARTTAG, { Syntax::dSTAGO, Syntax::dTAGC },
146 { econMode, mconMode, econnetMode, mconnetMode, EOM }},
147 { tokenStagoGrpo, REQUIRE_CONCUR, { Syntax::dSTAGO, Syntax::dGRPO },
148 { econMode, mconMode, econnetMode, mconnetMode, EOM }},
149 { tokenTagc, 0, { Syntax::dTAGC, NOTHING }, { tagMode, EOM }},
150 { tokenVi, 0, { Syntax::dVI, NOTHING }, { tagMode, asMode, piPasMode, EOM }},
151 // Other tokens
152 { tokenRe, REQUIRE_NOT_KEEPRSRE, { FUNCTION + Syntax::fRE, NOTHING },
153 { mconMode, cconMode, rcconMode,
154 mconnetMode, cconnetMode, rcconnetMode,
155 rcconeMode, cmsMode, rcmsMode, EOM }},
156 { tokenRe, 0, { FUNCTION + Syntax::fRE, NOTHING },
157 { refMode,
158 mlitMode, mlitaMode, alitMode, alitaMode, aliteMode,
159 talitMode, talitaMode, taliteMode,
160 EOM }},
161 { tokenRs, REQUIRE_NOT_KEEPRSRE, { FUNCTION + Syntax::fRS, NOTHING },
162 { mconMode, cconMode, rcconMode,
163 mconnetMode, cconnetMode, rcconnetMode,
164 rcconeMode, cmsMode, rcmsMode, EOM }},
165 { tokenRs, 0, { FUNCTION + Syntax::fRS, NOTHING },
166 { mlitMode, mlitaMode, alitMode, alitaMode, aliteMode,
167 talitMode, talitaMode, taliteMode,
168 EOM }},
169 { tokenSpace, 0, { FUNCTION + Syntax::fSPACE, NOTHING },
170 { mlitMode, mlitaMode, talitMode, talitaMode, taliteMode, EOM }},
171 { tokenSepchar, 0, { SET + Syntax::sepchar, NOTHING },
172 { alitMode, alitaMode, aliteMode,
173 talitMode, talitaMode, taliteMode, EOM }},
174 { tokenS, 0, { SET + Syntax::s, NOTHING },
175 { econMode, econnetMode, grpMode, mdMode, mdMinusMode, mdPeroMode, sdMode,
176 proMode, dsMode, dsiMode, asMode, piPasMode, tagMode, EOM }},
177 { tokenNameStart, 0, { SET + Syntax::nameStart, NOTHING },
178 { grpMode, mdMode, mdMinusMode, mdPeroMode, sdMode,
179 asMode, piPasMode, tagMode, EOM }},
180 { tokenDigit, 0, { SET + Syntax::digit, NOTHING },
181 { grpMode, mdMode, mdMinusMode, sdMode, asMode, piPasMode, tagMode, EOM }},
182 { tokenLcUcNmchar, 0, { SET + Syntax::nmchar, NOTHING },
183 { grpMode, mdMode, asMode, piPasMode, tagMode, EOM }},
184 { tokenIgnoredChar, 0, { SET + Syntax::sgmlChar, NOTHING },
185 { imsMode, EOM }},
186 { tokenChar, 0, { SET + Syntax::sgmlChar, NOTHING },
187 // Note that character data is recognized in element content,
188 // and will cause #PCDATA to begin.
189 { alitMode, alitaMode, aliteMode,
190 talitMode, talitaMode, taliteMode,
191 comMode, piMode,
192 cmsMode, rcmsMode,
193 plitMode, plitaMode, pliteMode,
194 slitMode, slitaMode,
195 econMode, mconMode, cconMode, rcconMode,
196 econnetMode, mconnetMode, cconnetMode, rcconnetMode, rcconeMode, EOM }},
197 { tokenChar, 0, { SET + Syntax::minimumData, NOTHING },
198 { mlitMode, mlitaMode, EOM }},
199 { tokenChar, 0, { SET + Syntax::significant, NOTHING },
200 { sdplitMode, sdplitaMode, sdslitMode, sdslitaMode, sdcomMode, EOM }},
201 };
202
inMode(Mode mode) const203 inline Boolean PackedTokenInfo::inMode(Mode mode) const
204 {
205 return ((modeBits[unsigned(mode) / ULONG_BIT]
206 & ((unsigned long)1 << (unsigned(mode) % ULONG_BIT)))
207 != 0);
208 }
209
computeModeBits()210 void PackedTokenInfo::computeModeBits()
211 {
212 for (unsigned char *p = modes; *p != EOM; p++)
213 modeBits[*p / ULONG_BIT] |= (unsigned long)1 << (*p % ULONG_BIT);
214 }
215
216 struct TokenTableIniter {
217 TokenTableIniter();
218 };
219
220 static TokenTableIniter tokenTableIniter;
221
TokenTableIniter()222 TokenTableIniter::TokenTableIniter()
223 {
224 for (size_t i = 0; i < SIZEOF(tokenTable); i++)
225 tokenTable[i].computeModeBits();
226 }
227
ModeInfo(Mode mode,const Sd & sd)228 ModeInfo::ModeInfo(Mode mode, const Sd &sd)
229 : mode_(mode), p_(tokenTable), count_(SIZEOF(tokenTable)),
230 missingRequirements_(REQUIRE_FLAGS)
231 {
232 if (sd.startTagEmpty())
233 missingRequirements_ &= ~REQUIRE_EMPTY_STARTTAG;
234 if (sd.endTagEmpty())
235 missingRequirements_ &= ~REQUIRE_EMPTY_ENDTAG;
236 if (sd.concur())
237 missingRequirements_ &= ~(REQUIRE_CONCUR|REQUIRE_LINK_OR_CONCUR);
238 if (sd.link())
239 missingRequirements_ &= ~REQUIRE_LINK_OR_CONCUR;
240 if (!sd.keeprsre())
241 missingRequirements_ &= ~REQUIRE_NOT_KEEPRSRE;
242 }
243
nextToken(TokenInfo * t)244 Boolean ModeInfo::nextToken(TokenInfo *t)
245 {
246 for (; count_ > 0; --count_, ++p_)
247 if (p_->inMode(mode_) && (p_->flags & missingRequirements_) == 0) {
248 t->token = p_->token;
249 t->priority = Priority::delim;
250 const unsigned char *contents = p_->contents;
251 --count_;
252 ++p_;
253 unsigned char c = contents[0];
254 if (c < SET)
255 t->delim1 = Syntax::DelimGeneral(c);
256 else if (c < SET + Syntax::nSet) {
257 t->set = Syntax::Set(c - SET);
258 t->type = TokenInfo::setType;
259 switch (t->set) {
260 case Syntax::sepchar:
261 case Syntax::s:
262 case Syntax::blank:
263 t->priority = Priority::function;
264 break;
265 default:
266 t->priority = Priority::data;
267 break;
268 }
269 return 1;
270 }
271 else {
272 t->function = Syntax::StandardFunction(c - FUNCTION);
273 t->priority = Priority::function;
274 t->type = TokenInfo::functionType;
275 return 1;
276 }
277 c = contents[1];
278 if (c == NOTHING) {
279 t->type = TokenInfo::delimType;
280 return 1;
281 }
282 if (c < SET) {
283 t->delim2 = Syntax::DelimGeneral(c);
284 t->type = TokenInfo::delimDelimType;
285 return 1;
286 }
287 if (c < SET + Syntax::nSet) {
288 t->set = Syntax::Set(c - SET);
289 t->type = TokenInfo::delimSetType;
290 return 1;
291 }
292 abort();
293 }
294 return 0;
295 }
296
297 #ifdef SP_NAMESPACE
298 }
299 #endif
300