1 // Copyright (c) 1994 James Clark
2 // See the file COPYING for copying permission.
3 
4 #ifdef __GNUG__
5 #pragma implementation
6 #endif
7 #include "splib.h"
8 #include <limits.h>
9 #include <stdlib.h>
10 #include "macros.h"
11 #include "types.h"
12 #include "Syntax.h"
13 #include "token.h"
14 #include "Sd.h"
15 #include "Mode.h"
16 #include "ModeInfo.h"
17 
18 #ifdef SP_NAMESPACE
19 namespace SP_NAMESPACE {
20 #endif
21 
// Requirement bits stored in PackedTokenInfo::flags.  A table entry is
// only offered by ModeInfo::nextToken() when none of its requirement bits
// remain in ModeInfo::missingRequirements_.
const unsigned REQUIRE_EMPTY_STARTTAG = 0x01;  // cleared when sd.startTagEmpty()
const unsigned REQUIRE_EMPTY_ENDTAG = 0x02;    // cleared when sd.endTagEmpty()
const unsigned REQUIRE_CONCUR = 0x04;          // cleared when sd.concur()
const unsigned REQUIRE_LINK_OR_CONCUR = 0x08;  // cleared when sd.link() or sd.concur()
const unsigned REQUIRE_NOT_KEEPRSRE = 0x10;    // cleared when !sd.keeprsre()
// Union of every requirement bit above; the initial "everything missing" mask.
const unsigned REQUIRE_FLAGS = (REQUIRE_EMPTY_STARTTAG
                                | REQUIRE_EMPTY_ENDTAG
                                | REQUIRE_CONCUR
                                | REQUIRE_LINK_OR_CONCUR
                                | REQUIRE_NOT_KEEPRSRE);

// Number of bits in an unsigned long.
#define ULONG_BIT (sizeof(unsigned long) * CHAR_BIT)
30 
31 struct PackedTokenInfo {
32   Token token;			// token to be returned
33   unsigned flags;
34   unsigned char contents[2];	// components of the delimiter or d-i-c
35   unsigned char modes[25];	// list of modes in which it is recognized,
36 				// terminated by EOM
37   // a bit vector computed from modes (lo to hi)
38   unsigned long modeBits[(nModes + ULONG_BIT - 1)/ULONG_BIT];
39   void computeModeBits();
40   Boolean inMode(Mode mode) const;
41 };
42 
43 const unsigned char SET = Syntax::nDelimGeneral;
44 const unsigned char FUNCTION = SET + Syntax::nSet;
45 const unsigned char NOTHING = UCHAR_MAX;
46 
47 const unsigned char EOM = 255;	// end of modes
48 
49 static PackedTokenInfo tokenTable[] = {
50   // Delimiters and delimiters in context
51   { tokenAnd, 0, { Syntax::dAND, NOTHING }, { grpMode, EOM }},
52   { tokenCom, 0, { Syntax::dCOM, NOTHING },
53     { mdMode, mdMinusMode, mdPeroMode, sdMode, comMode, sdcomMode, piPasMode, EOM }},
54   { tokenCroDigit, 0, { Syntax::dCRO, SET + Syntax::digit },
55     { econMode, mconMode, rcconMode, econnetMode, mconnetMode, rcconnetMode,
56       rcconeMode, plitMode, plitaMode, pliteMode, sdplitMode, sdplitaMode,
57       alitMode, alitaMode, aliteMode,
58       talitMode, talitaMode, taliteMode, rcmsMode, EOM }},
59   { tokenCroNameStart, 0, { Syntax::dCRO, SET + Syntax::nameStart },
60     { econMode, mconMode, rcconMode, econnetMode, mconnetMode, rcconnetMode,
61       rcconeMode, plitMode, plitaMode, pliteMode, sdplitMode, sdplitaMode,
62       alitMode, alitaMode, aliteMode,
63       talitMode, talitaMode, taliteMode, rcmsMode, EOM }},
64   { tokenDsc, 0, { Syntax::dDSC, NOTHING },
65     { /* mdMode, */ asMode, dsMode, EOM }},
66   { tokenDso, 0, { Syntax::dDSO, NOTHING }, { mdMode, EOM }},
67   { tokenDtgc, 0, { Syntax::dDTGC, NOTHING }, { grpMode, EOM }},
68   { tokenDtgo, 0, { Syntax::dDTGO, NOTHING }, { grpMode, EOM }},
69   { tokenEroNameStart, 0, { Syntax::dERO, SET + Syntax::nameStart },
70     { econMode, mconMode, rcconMode, econnetMode, mconnetMode, rcconnetMode,
71       rcconeMode, alitMode, alitaMode, aliteMode, talitMode, talitaMode,
72       taliteMode, rcmsMode, EOM }},
73   { tokenEroGrpo, REQUIRE_LINK_OR_CONCUR, { Syntax::dERO, Syntax::dGRPO },
74     { econMode, mconMode, rcconMode, econnetMode, mconnetMode, rcconnetMode,
75       rcconeMode, alitMode, alitaMode, aliteMode, talitMode, talitaMode,
76       taliteMode, rcmsMode, EOM }},
77   { tokenEtago, 0, { Syntax::dETAGO, NOTHING }, { tagMode, EOM }},
78   { tokenEtagoNameStart, 0, { Syntax::dETAGO, SET + Syntax::nameStart },
79     { econMode, mconMode, cconMode, rcconMode,
80       econnetMode, mconnetMode, cconnetMode, rcconnetMode, EOM }},
81   { tokenEtagoTagc, REQUIRE_EMPTY_ENDTAG, { Syntax::dETAGO, Syntax::dTAGC },
82     { econMode, mconMode, cconMode, rcconMode,
83       econnetMode, mconnetMode, cconnetMode, rcconnetMode, EOM }},
84   { tokenEtagoGrpo, REQUIRE_CONCUR, { Syntax::dETAGO, Syntax::dGRPO },
85     { econMode, mconMode, cconMode, rcconMode,
86       econnetMode, mconnetMode, cconnetMode, rcconnetMode, EOM }},
87   { tokenGrpc, 0, { Syntax::dGRPC, NOTHING }, { grpMode, EOM }},
88   { tokenGrpo, 0, { Syntax::dGRPO, NOTHING },
89     { mdMode, mdMinusMode, grpMode, EOM }},
90   { tokenHcroHexDigit, 0, { Syntax::dHCRO, SET + Syntax::hexDigit },
91     { econMode, mconMode, rcconMode, econnetMode, mconnetMode, rcconnetMode,
92       rcconeMode, plitMode, plitaMode, pliteMode,
93       alitMode, alitaMode, aliteMode,
94       talitMode, talitaMode, taliteMode, rcmsMode, EOM }},
95   { tokenLit, 0, { Syntax::dLIT, NOTHING },
96     { alitMode, talitMode, plitMode, sdplitMode, mlitMode, slitMode, sdslitMode,
97       asMode, piPasMode, tagMode, mdMode, sdMode, grpMode, EOM }},
98   { tokenLita, 0, { Syntax::dLITA, NOTHING },
99     { alitaMode, talitaMode, plitaMode, sdplitaMode, mlitaMode, slitaMode, sdslitaMode,
100       asMode, piPasMode, tagMode, mdMode, sdMode, grpMode, EOM }},
101   { tokenMdc, 0, { Syntax::dMDC, NOTHING }, { mdMode, sdMode, EOM }},
102   { tokenMdoNameStart, 0, { Syntax::dMDO, SET + Syntax::nameStart },
103     { econMode, mconMode, econnetMode, mconnetMode,
104       proMode, dsMode, dsiMode, EOM }},
105   { tokenMdoMdc, 0, { Syntax::dMDO, Syntax::dMDC },
106     { econMode, mconMode, econnetMode, mconnetMode,
107       proMode, dsMode, dsiMode, EOM }},
108   { tokenMdoCom, 0, { Syntax::dMDO, Syntax::dCOM },
109     { econMode, mconMode, econnetMode, mconnetMode,
110       proMode, dsMode, dsiMode, EOM }},
111   { tokenMdoDso, 0, { Syntax::dMDO, Syntax::dDSO },
112     { econMode, mconMode, econnetMode, mconnetMode,
113       dsMode, dsiMode, imsMode, EOM }},
114   { tokenMinus, 0, { Syntax::dMINUS, NOTHING }, { mdMinusMode, sdMode, EOM }},
115   { tokenMinusGrpo, 0, { Syntax::dMINUS, Syntax::dGRPO }, { mdMode, EOM }},
116   { tokenMscMdc, 0, { Syntax::dMSC, Syntax::dMDC},
117     { imsMode, cmsMode, rcmsMode,
118       econMode, mconMode, econnetMode, mconnetMode, dsMode, dsiMode, EOM }},
119   { tokenNestc, 0, { Syntax::dNESTC, NOTHING }, { tagMode, EOM }},
120   { tokenNet, 0, { Syntax::dNET, NOTHING },
121     { econnetMode, mconnetMode, cconnetMode, rcconnetMode, EOM }},
122   { tokenOpt, 0, { Syntax::dOPT, NOTHING }, { grpMode, grpsufMode, EOM }},
123   { tokenOr, 0, { Syntax::dOR, NOTHING }, { grpMode, EOM }},
124   { tokenPero, 0, { Syntax::dPERO, NOTHING }, { mdPeroMode, EOM }},
125   { tokenPeroNameStart, 0, { Syntax::dPERO, SET + Syntax::nameStart }, {
126     mdMode, mdMinusMode, mdPeroMode, dsMode, dsiMode, grpMode,
127     plitMode, plitaMode, pliteMode, sdplitMode, sdplitaMode, EOM }},
128   { tokenPeroGrpo, REQUIRE_LINK_OR_CONCUR, { Syntax::dPERO, Syntax::dGRPO },
129     { mdMode, mdMinusMode, mdPeroMode, dsMode, dsiMode, grpMode,
130       plitMode, plitaMode, pliteMode, sdplitMode, sdplitaMode, EOM }},
131   { tokenPic, 0, { Syntax::dPIC, NOTHING }, { piMode, EOM }},
132   { tokenPio, 0, { Syntax::dPIO, NOTHING },
133     { econMode, mconMode, econnetMode, mconnetMode, proMode,
134       dsMode, dsiMode, EOM }},
135   { tokenPlus, 0, { Syntax::dPLUS, NOTHING }, { grpMode, grpsufMode, EOM }},
136   { tokenPlusGrpo, 0, { Syntax::dPLUS, Syntax::dGRPO }, { mdMode, EOM }},
137   { tokenRefc, 0, { Syntax::dREFC, NOTHING }, { refMode, EOM }},
138   { tokenRep, 0, { Syntax::dREP, NOTHING }, { grpMode, grpsufMode, EOM }},
139   { tokenRni, 0, { Syntax::dRNI, NOTHING },
140     { grpMode, mdMode, mdPeroMode, EOM }},
141   { tokenSeq, 0, { Syntax::dSEQ, NOTHING }, { grpMode, EOM }},
142   { tokenStago, 0, { Syntax::dSTAGO, NOTHING }, { tagMode, EOM }},
143   { tokenStagoNameStart, 0, { Syntax::dSTAGO, SET + Syntax::nameStart },
144     { econMode, mconMode, econnetMode, mconnetMode, EOM }},
145   { tokenStagoTagc, REQUIRE_EMPTY_STARTTAG, { Syntax::dSTAGO, Syntax::dTAGC },
146     { econMode, mconMode, econnetMode, mconnetMode, EOM }},
147   { tokenStagoGrpo, REQUIRE_CONCUR, { Syntax::dSTAGO, Syntax::dGRPO },
148     { econMode, mconMode, econnetMode, mconnetMode, EOM }},
149   { tokenTagc, 0, { Syntax::dTAGC, NOTHING }, { tagMode, EOM }},
150   { tokenVi, 0, { Syntax::dVI, NOTHING }, { tagMode, asMode, piPasMode, EOM }},
151   // Other tokens
152   { tokenRe, REQUIRE_NOT_KEEPRSRE, { FUNCTION + Syntax::fRE, NOTHING },
153     { mconMode, cconMode, rcconMode,
154       mconnetMode, cconnetMode, rcconnetMode,
155       rcconeMode, cmsMode, rcmsMode, EOM }},
156   { tokenRe, 0, { FUNCTION + Syntax::fRE, NOTHING },
157     { refMode,
158       mlitMode, mlitaMode, alitMode, alitaMode, aliteMode,
159       talitMode, talitaMode, taliteMode,
160       EOM }},
161   { tokenRs, REQUIRE_NOT_KEEPRSRE, { FUNCTION + Syntax::fRS, NOTHING },
162     { mconMode, cconMode, rcconMode,
163       mconnetMode, cconnetMode, rcconnetMode,
164       rcconeMode, cmsMode, rcmsMode, EOM }},
165   { tokenRs, 0, { FUNCTION + Syntax::fRS, NOTHING },
166     { mlitMode, mlitaMode, alitMode, alitaMode, aliteMode,
167       talitMode, talitaMode, taliteMode,
168       EOM }},
169   { tokenSpace, 0, { FUNCTION + Syntax::fSPACE, NOTHING },
170     { mlitMode, mlitaMode, talitMode, talitaMode, taliteMode, EOM }},
171   { tokenSepchar, 0, { SET + Syntax::sepchar, NOTHING },
172     { alitMode, alitaMode, aliteMode,
173       talitMode, talitaMode, taliteMode, EOM }},
174   { tokenS, 0, { SET + Syntax::s, NOTHING },
175     { econMode, econnetMode, grpMode, mdMode, mdMinusMode, mdPeroMode, sdMode,
176       proMode, dsMode, dsiMode, asMode, piPasMode, tagMode, EOM }},
177   { tokenNameStart, 0, { SET + Syntax::nameStart, NOTHING },
178     { grpMode, mdMode, mdMinusMode, mdPeroMode, sdMode,
179       asMode, piPasMode, tagMode, EOM }},
180   { tokenDigit, 0, { SET + Syntax::digit, NOTHING },
181     { grpMode, mdMode, mdMinusMode, sdMode, asMode, piPasMode, tagMode, EOM }},
182   { tokenLcUcNmchar, 0, { SET + Syntax::nmchar, NOTHING },
183     { grpMode, mdMode, asMode, piPasMode, tagMode, EOM }},
184   { tokenIgnoredChar, 0, { SET + Syntax::sgmlChar, NOTHING },
185     { imsMode, EOM }},
186   { tokenChar, 0, { SET + Syntax::sgmlChar, NOTHING },
187     // Note that character data is recognized in element content,
188     // and will cause #PCDATA to begin.
189     { alitMode, alitaMode, aliteMode,
190       talitMode, talitaMode, taliteMode,
191       comMode, piMode,
192       cmsMode, rcmsMode,
193       plitMode, plitaMode, pliteMode,
194       slitMode, slitaMode,
195       econMode, mconMode, cconMode, rcconMode,
196       econnetMode, mconnetMode, cconnetMode, rcconnetMode, rcconeMode, EOM }},
197   { tokenChar, 0, { SET + Syntax::minimumData, NOTHING },
198     { mlitMode, mlitaMode, EOM }},
199   { tokenChar, 0, { SET + Syntax::significant, NOTHING },
200     { sdplitMode, sdplitaMode, sdslitMode, sdslitaMode, sdcomMode, EOM }},
201 };
202 
inMode(Mode mode) const203 inline Boolean PackedTokenInfo::inMode(Mode mode) const
204 {
205   return ((modeBits[unsigned(mode) / ULONG_BIT]
206 	   & ((unsigned long)1 << (unsigned(mode) % ULONG_BIT)))
207 	  != 0);
208 }
209 
computeModeBits()210 void PackedTokenInfo::computeModeBits()
211 {
212   for (unsigned char *p = modes; *p != EOM; p++)
213     modeBits[*p / ULONG_BIT] |= (unsigned long)1 << (*p % ULONG_BIT);
214 }
215 
216 struct TokenTableIniter {
217   TokenTableIniter();
218 };
219 
220 static TokenTableIniter tokenTableIniter;
221 
TokenTableIniter()222 TokenTableIniter::TokenTableIniter()
223 {
224   for (size_t i = 0; i < SIZEOF(tokenTable); i++)
225     tokenTable[i].computeModeBits();
226 }
227 
ModeInfo(Mode mode,const Sd & sd)228 ModeInfo::ModeInfo(Mode mode, const Sd &sd)
229 : mode_(mode), p_(tokenTable), count_(SIZEOF(tokenTable)),
230   missingRequirements_(REQUIRE_FLAGS)
231 {
232   if (sd.startTagEmpty())
233     missingRequirements_ &= ~REQUIRE_EMPTY_STARTTAG;
234   if (sd.endTagEmpty())
235     missingRequirements_ &= ~REQUIRE_EMPTY_ENDTAG;
236   if (sd.concur())
237     missingRequirements_ &= ~(REQUIRE_CONCUR|REQUIRE_LINK_OR_CONCUR);
238   if (sd.link())
239     missingRequirements_ &= ~REQUIRE_LINK_OR_CONCUR;
240   if (!sd.keeprsre())
241     missingRequirements_ &= ~REQUIRE_NOT_KEEPRSRE;
242 }
243 
nextToken(TokenInfo * t)244 Boolean ModeInfo::nextToken(TokenInfo *t)
245 {
246   for (; count_ > 0; --count_, ++p_)
247     if (p_->inMode(mode_) && (p_->flags & missingRequirements_) == 0) {
248       t->token = p_->token;
249       t->priority = Priority::delim;
250       const unsigned char *contents = p_->contents;
251       --count_;
252       ++p_;
253       unsigned char c = contents[0];
254       if (c < SET)
255 	t->delim1 = Syntax::DelimGeneral(c);
256       else if (c < SET + Syntax::nSet) {
257 	t->set = Syntax::Set(c - SET);
258 	t->type = TokenInfo::setType;
259 	switch (t->set) {
260 	case Syntax::sepchar:
261 	case Syntax::s:
262 	case Syntax::blank:
263 	  t->priority = Priority::function;
264 	  break;
265 	default:
266 	  t->priority = Priority::data;
267 	  break;
268 	}
269 	return 1;
270       }
271       else {
272 	t->function = Syntax::StandardFunction(c - FUNCTION);
273 	t->priority = Priority::function;
274 	t->type = TokenInfo::functionType;
275 	return 1;
276       }
277       c = contents[1];
278       if (c == NOTHING) {
279 	t->type = TokenInfo::delimType;
280 	return 1;
281       }
282       if (c < SET) {
283 	t->delim2 = Syntax::DelimGeneral(c);
284 	t->type = TokenInfo::delimDelimType;
285 	return 1;
286       }
287       if (c < SET + Syntax::nSet) {
288 	t->set = Syntax::Set(c - SET);
289 	t->type = TokenInfo::delimSetType;
290 	return 1;
291       }
292       abort();
293     }
294   return 0;
295 }
296 
297 #ifdef SP_NAMESPACE
298 }
299 #endif
300