1 /*------------------------------------------------------------------------
2 Copyright (C) 2002-2016 SIL International. All rights reserved.
3 
4 Distributable under the terms of either the Common Public License or the
5 GNU Lesser General Public License, as specified in the LICENSING.txt file.
6 
7 File: Compiler.cpp
8 Responsibility: Jonathan Kew
9 Last reviewed: Not yet.
10 
11 Description:
12     Implements the TECkit mapping compiler.
13 -------------------------------------------------------------------------*/
14 
15 /*
16 	2008-11-17	jk			include <cstdio> (Debian bug 505693)
17 	2006-06-19	jk			added new APIs to look up Unicode names
18 	2006-01-12	jk			removed multi-char constants, use FOUR_CHAR_CODE to define UInt32 values instead
19 							(no functional change, just to avoid compiler warnings)
20     2005-07-07  jk  2.1.5   changed to use WORDS_BIGENDIAN rather than TARGET_RT_BIG_ENDIAN
21     2005-06-20  jk  2.1.4   added lhsDefault/rhsDefault attributes to <pass> elem in xml output
22 	23-May-2005		changes for 64-bit compilation, from Ulrik P
23 	21-May-2005		changes based on Ulrik Petersen's patch for MS VC++ 6
24     2004-11-11  jk  2.1.3   added support for XML export
25 	2004-07-21	jk	2.1.2	removed trailing spaces from 2 names in UnicodeNames.cpp
26 	2004-06-16	jk	2.1.1	fixed bug of ignoring char after '_'
27 	2004-03-12	jk	2.1		updated for version 2.1 with ...Opt APIs
28 							modified compiler to accept Unicode source text
29 */
30 
31 #include "Compiler.h"
32 
33 #include <cstdio>
34 #include <iostream>
35 #include <iomanip>
36 #include <algorithm>
37 #include <cstring>
38 #include <cstdio>
39 
40 #include "zlib.h"
41 
42 const UInt32	kInvalidChar	= 0xfffffffdUL;
43 
44 static UInt32
45 offsetsFromUTF8[6] =	{
46 	0x00000000UL,
47 	0x00003080UL,
48 	0x000E2080UL,
49 	0x03C82080UL,
50 	0xFA082080UL,
51 	0x82082080UL
52 };
53 
54 static UInt8
55 bytesFromUTF8[256] = {
56 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
57 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
58 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
59 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
60 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
61 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
62 	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
63 	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
64 };
65 
66 static UInt8
67 firstByteMark[7] = {
68 	0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
69 };
70 
71 const int halfShift					= 10;
72 const UInt32 halfBase				= 0x0010000UL;
73 const UInt32 kSurrogateHighStart	= 0xD800UL;
74 const UInt32 kSurrogateHighEnd		= 0xDBFFUL;
75 const UInt32 kSurrogateLowStart		= 0xDC00UL;
76 const UInt32 byteMask				= 0x000000BFUL;
77 const UInt32 byteMark				= 0x00000080UL;
78 
79 #define FOUR_CHAR_CODE(a,b,c,d)	(UInt32)((a << 24) + (b << 16) + (c << 8) + d)
80 
81 const UInt32 kCode_Byte	= FOUR_CHAR_CODE('B','y','t','e');
82 const UInt32 kCode_BU	= FOUR_CHAR_CODE('B','-','>','U');
83 const UInt32 kCode_UB	= FOUR_CHAR_CODE('U','-','>','B');
84 const UInt32 kCode_Unic	= FOUR_CHAR_CODE('U','n','i','c');
85 const UInt32 kCode_NFCf	= FOUR_CHAR_CODE('N','F','C','f');
86 const UInt32 kCode_NFCr	= FOUR_CHAR_CODE('N','F','C','r');
87 const UInt32 kCode_NFC	= FOUR_CHAR_CODE('N','F','C',' ');
88 const UInt32 kCode_NFDf	= FOUR_CHAR_CODE('N','F','D','f');
89 const UInt32 kCode_NFDr	= FOUR_CHAR_CODE('N','F','D','r');
90 const UInt32 kCode_NFD	= FOUR_CHAR_CODE('N','F','D',' ');
91 
92 Compiler::Keyword
93 Compiler::keywords[] = {
94 	{ "Pass",				tok_Pass,		0						},
95 	{ "Byte",				tok_PassType,	kCode_Byte				},
96 	{ "Byte_Unicode",		tok_PassType,	kCode_BU					},
97 	{ "Unicode_Byte",		tok_PassType,	kCode_UB					},
98 	{ "Unicode",			tok_PassType,	kCode_Unic				},
99 	{ "NFC_fwd",			tok_PassType,	kCode_NFCf				},
100 	{ "NFC_rev",			tok_PassType,	kCode_NFCr				},
101 	{ "NFC",				tok_PassType,	kCode_NFC				},
102 	{ "NFD_fwd",			tok_PassType,	kCode_NFDf				},
103 	{ "NFD_rev",			tok_PassType,	kCode_NFDr				},
104 	{ "NFD",				tok_PassType,	kCode_NFD				},
105 	{ "Class",				tok_Class,		0						},
106 	{ "ByteClass",			tok_Class,		'B'						},
107 	{ "UniClass",			tok_Class,		'U'						},
108 	{ "ByteDefault",		tok_Default,	'B'						},
109 	{ "UniDefault",			tok_Default,	'U'						},
110 	{ "EncodingName",		tok_Name,		kNameID_LHS_Name		},
111 	{ "DescriptiveName",	tok_Name,		kNameID_LHS_Description	},
112 	{ "Name",				tok_Name,		0xffffffff				},
113 	{ "LHSName",			tok_Name,		kNameID_LHS_Name		},
114 	{ "LHSDescription",		tok_Name,		kNameID_LHS_Description	},
115 	{ "RHSName",			tok_Name,		kNameID_RHS_Name		},
116 	{ "RHSDescription",		tok_Name,		kNameID_RHS_Description	},
117 	{ "Version",			tok_Name,		kNameID_Version			},
118 	{ "Contact",			tok_Name,		kNameID_Contact			},
119 	{ "RegistrationAuthority",	tok_Name,	kNameID_RegAuthority	},
120 	{ "RegistrationName",	tok_Name,		kNameID_RegName			},
121 	{ "Copyright",			tok_Name,		kNameID_Copyright		},
122 	{ "LHSFlags",			tok_Flags,		'S'						},
123 	{ "RHSFlags",			tok_Flags,		'T'						},
124 	{ "ExpectsNFC",			tok_FlagValue,	kFlags_ExpectsNFC		},
125 	{ "ExpectsNFD",			tok_FlagValue,	kFlags_ExpectsNFD		},
126 	{ "GeneratesNFC",		tok_FlagValue,	kFlags_GeneratesNFC		},
127 	{ "GeneratesNFD",		tok_FlagValue,	kFlags_GeneratesNFD		},
128 	{ "VisualOrder",		tok_FlagValue,	kFlags_VisualOrder		},
129 	{ "Define",				tok_Define,		0						},
130 	{ 0,					tok_Identifier,	0						}
131 };
132 
133 UInt32
134 WINAPI
TECkit_GetCompilerVersion()135 TECkit_GetCompilerVersion()
136 {
137 	return kCurrentTECkitVersion;
138 }
139 
140 TECkit_Status
141 WINAPI
TECkit_Compile(char * txt,UInt32 len,Byte doCompression,TECkit_ErrorFn errFunc,void * userData,Byte ** outTable,UInt32 * outLen)142 TECkit_Compile(char* txt, UInt32 len, Byte doCompression, TECkit_ErrorFn errFunc, void* userData, Byte** outTable, UInt32* outLen)
143 {
144 	TECkit_Status	result = kStatus_CompilationFailed;
145 	try {
146 		Compiler*	cmp = new Compiler(txt, len, kForm_Unspecified, (bool)doCompression, false, errFunc, userData);
147 		cmp->GetCompiledTable(*outTable, *outLen);
148 		if (*outTable == 0)
149 			result = kStatus_CompilationFailed;
150 		else {
151 			cmp->DetachCompiledTable();
152 			result = kStatus_NoError;
153 		}
154 		delete cmp;
155 	}
156 	catch (...) {
157 		result = kStatus_Exception;
158 	}
159 	return result;
160 }
161 
162 TECkit_Status
163 WINAPI
TECkit_CompileOpt(char * txt,UInt32 len,TECkit_ErrorFn errFunc,void * userData,Byte ** outTable,UInt32 * outLen,UInt32 opts)164 TECkit_CompileOpt(char* txt, UInt32 len, TECkit_ErrorFn errFunc, void* userData, Byte** outTable, UInt32* outLen, UInt32 opts)
165 {
166 	TECkit_Status	result = kStatus_CompilationFailed;
167 	try {
168 		Compiler*	cmp = new Compiler(txt, len, (opts & kCompilerOpts_FormMask),
169 			(opts & kCompilerOpts_Compress) != 0, (opts & kCompilerOpts_XML) != 0, errFunc, userData);
170 		cmp->GetCompiledTable(*outTable, *outLen);
171 		if (*outTable == 0)
172 			result = kStatus_CompilationFailed;
173 		else {
174 			cmp->DetachCompiledTable();
175 			result = kStatus_NoError;
176 		}
177 		delete cmp;
178 	}
179 	catch (...) {
180 		result = kStatus_Exception;
181 	}
182 	return result;
183 }
184 
185 void
186 WINAPI
TECkit_DisposeCompiled(Byte * table)187 TECkit_DisposeCompiled(Byte* table)
188 {
189 	if (table != 0)
190 		free(table);
191 }
192 
193 char*
194 WINAPI
TECkit_GetUnicodeName(UInt32 usv)195 TECkit_GetUnicodeName(UInt32 usv)
196 {
197 	const CharName	*c = &gUnicodeNames[0];
198 	while (c->name != 0)
199 		if (c->usv == usv)
200 			return (char*)c->name;
201 		else
202 			++c;
203 	return NULL;
204 }
205 
206 char*
207 WINAPI
TECkit_GetTECkitName(UInt32 usv)208 TECkit_GetTECkitName(UInt32 usv)
209 {
210 	static char	buffer[256];
211 	const char*	name = TECkit_GetUnicodeName(usv);
212 	if (name == NULL)
213 		sprintf(buffer, "U+%04X", usv);
214 	else {
215 		char* cp = &buffer[0];
216 		while (*name && (cp - buffer < 255)) {
217 			if ((*name < '0') || (*name > '9' && *name < 'A') || (*name > 'Z'))
218 				*cp++ = '_';
219 			else
220 				*cp++ = *name | 0x20;
221 			++name;
222 		}
223 		*cp = 0;
224 	}
225 	return buffer;
226 }
227 
228 static int
unicodeNameCompare(const char * uniName,const char * idStr,UInt32 len)229 unicodeNameCompare(const char* uniName, const char* idStr, UInt32 len)
230 { // idStr could be either a "real" unicode name or a teckit identifier
231   // when this is used by the TECkit_GetUnicodeValue API
232 	while (*uniName || len != 0) {
233 		if (len == 0)
234 			return 1;
235 		char	u = *uniName++;
236 		char	i = *idStr++;
237 		--len;
238 		if ((i >= 'a') && (i <= 'z'))
239 			i &= ~0x20;
240 		if (u == i)
241 			continue;
242 		if ((u < '0') || (u > '9' && u < 'A') || (u > 'Z'))
243 			u = '_';
244 		if (u == i)
245 			continue;
246 		return u < i ? -1 : 1;
247 	}
248 	return 0;
249 }
250 
251 int
252 WINAPI
TECkit_GetUnicodeValue(char * name)253 TECkit_GetUnicodeValue(char* name)
254 {
255 	const CharName	*c = &gUnicodeNames[0];
256 	int	len = strlen(name);
257 	while (c->name != 0)
258 		if (unicodeNameCompare(c->name, name, len) == 0)
259 			return c->usv;
260 		else
261 			++c;
262 	return -1;
263 }
264 
265 inline UInt8
READ(const UInt8 p)266 READ(const UInt8 p)
267 {
268 	return p;
269 }
270 
271 inline UInt16
READ(const UInt16 p)272 READ(const UInt16 p)
273 {
274 #ifdef WORDS_BIGENDIAN
275 	return p;
276 #else
277 	return (p >> 8) + (p << 8);
278 #endif
279 }
280 
281 inline UInt32
READ(const UInt32 p)282 READ(const UInt32 p)
283 {
284 #ifdef WORDS_BIGENDIAN
285 	return p;
286 #else
287 	return (p >> 24) + ((p >> 8) & 0x0000ff00) + ((p << 8) & 0x00ff0000) + (p << 24);
288 #endif
289 }
290 
291 template<class T>
292 inline void
WRITE(T & t,UInt32 v)293 WRITE(T& t, UInt32 v)
294 {
295 	t = READ(T(v));
296 }
297 
298 void
appendToTable(string & s,const char * ptr,UInt32 len)299 Compiler::appendToTable(string& s, const char* ptr, UInt32 len)
300 {
301 #ifdef WORDS_BIGENDIAN
302 	s.append(ptr, len);
303 #else
304 	ptr += len;
305 	while (len-- > 0)
306 		s.append(1, *--ptr);
307 #endif
308 }
309 
310 static inline bool
isIDstart(char c)311 isIDstart(char c)
312 {
313 	return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_';
314 }
315 
316 static inline bool
isIDcont(char c)317 isIDcont(char c)
318 {
319 	return isIDstart(c) || (c >= '0' && c <= '9');
320 }
321 
322 static bool
strmatch(const char * str,const char * txt,UInt32 len)323 strmatch(const char* str, const char* txt, UInt32 len)
324 {
325 	while (*str || len != 0) {
326 		if (len == 0)
327 			return false;
328 		if ((*str++ | 0x20) != (*txt++ | 0x20))
329 			return false;
330 		--len;
331 	}
332 	return true;
333 }
334 
335 static const char*
getClassName(const map<string,UInt32> & nameMap,UInt32 index)336 getClassName(const map<string,UInt32>& nameMap, UInt32 index)
337 {
338 	for (map<string,UInt32>::const_iterator i = nameMap.begin(); i != nameMap.end(); ++i) {
339 		if (i->second == index) {
340 			return i->first.c_str();
341 		}
342 	}
343 	return "[UNKNOWN]";
344 }
345 
346 static const char*
asHex(UInt32 val,short digits)347 asHex(UInt32 val, short digits)
348 {
349 	static char	str[16];
350 	sprintf(str, "%0*X", digits, val);
351 	return str;
352 }
353 
354 static const char*
asDec(UInt32 val)355 asDec(UInt32 val)
356 {
357 	static char	str[16];
358 	sprintf(str, "%d", val);
359 	return str;
360 }
361 
362 void
xmlOut(const char * s)363 Compiler::xmlOut(const char* s)
364 {
365 	xmlRepresentation += s;
366 }
367 
368 void
xmlOut(const string & s)369 Compiler::xmlOut(const string& s)
370 {
371 	xmlRepresentation += s;
372 }
373 
374 void
xmlOut(char c)375 Compiler::xmlOut(char c)
376 {
377 	xmlRepresentation += c;
378 }
379 
380 string
getContextID(const vector<Item> & ctx,bool isUnicode)381 Compiler::getContextID(const vector<Item>& ctx, bool isUnicode)
382 {
383 	string	contextString = xmlString(ctx.begin(), ctx.end(), isUnicode);
384 	string	contextID = currentPass.xmlContexts[contextString];
385 	if (contextID.length() == 0) {
386 		contextID = isUnicode ? "uctx_" : "bctx_";
387 		contextID += asDec(currentPass.xmlContexts.size());
388 		currentPass.xmlContexts[contextString] = contextID;
389 	}
390 	return contextID;
391 }
392 
393 string
xmlString(vector<Item>::const_iterator b,vector<Item>::const_iterator e,bool isUnicode)394 Compiler::xmlString(vector<Item>::const_iterator b, vector<Item>::const_iterator e, bool isUnicode)
395 {
396 	string	rval;
397 	if (b == e)
398 		return rval;
399 	for (vector<Item>::const_iterator i = b; i != e; ++i) {
400 		switch (i->type) {
401 			case 0:
402 				rval += "<ch n=\"";
403 				rval += asHex(i->val, isUnicode ? 4 : 2);
404 				rval += "\"";
405 				break;
406 			case kMatchElem_Type_EOS:
407 				rval += "<eot";
408 				break;
409 			case kMatchElem_Type_ANY:
410 				rval += "<any";
411 				break;
412 			case kMatchElem_Type_BGroup:
413 				{
414 					vector<Item>::const_iterator j = i;
415 					int nesting = 0;
416 					bool	alt = false;
417 					string	groupStr;
418 					++i;
419 					while (++j != e) {
420 						if (j->type == kMatchElem_Type_BGroup)
421 							++nesting;
422 						else if (j->type == kMatchElem_Type_EGroup) {
423 							if (nesting == 0) {
424 								if (alt && i < j - 1)
425 									groupStr += "<group>\n";
426 								groupStr += xmlString(i, j, isUnicode);
427 								if (alt && i < j - 1)
428 									groupStr += "</group>\n";
429 								break;
430 							}
431 							else
432 								--nesting;
433 						}
434 						else if (j->type == kMatchElem_Type_OR && nesting == 0) {
435 							if (i < j - 1)
436 								groupStr += "<group>\n";
437 							groupStr += xmlString(i, j, isUnicode);
438 							if (i < j - 1)
439 								groupStr += "</group>\n";
440 							i = j + 1;
441 							alt = true;
442 						}
443 					}
444 					i = j;
445 					rval += "<group";
446 					if (alt)
447 						rval += " alt=\"1\"";
448 					if ((i->repeatMin != 1) && (i->repeatMin != 255)) {
449 						rval += " min=\"";
450 						rval += asDec(i->repeatMin);
451 						rval += "\"";
452 					}
453 					if ((i->repeatMax != 1) && (i->repeatMax != 255)) {
454 						rval += " max=\"";
455 						rval += asDec(i->repeatMax);
456 						rval += "\"";
457 					}
458 					if (i->tag.length() > 0) {
459 						if (i->type != kMatchElem_Type_Copy) {
460 							rval += " id=\"";
461 							rval += i->tag;
462 							rval += "\"";
463 						}
464 					}
465 					rval += ">\n";
466 					rval += groupStr;
467 					rval += "</group>\n";
468 					continue;
469 				}
470 				break;
471 			case kMatchElem_Type_OR:
472 				rval += "<OR/>\n";
473 				continue;
474 				break;
475 			case kMatchElem_Type_EGroup:
476 				rval += "<END-GROUP/>\n";
477 				continue;
478 				break;
479 			case kMatchElem_Type_Class:
480 				{
481 					rval += "<class-ref name=\"";
482 					const map<string,UInt32>&	classes = isUnicode ? currentPass.uniClassNames : currentPass.byteClassNames;
483 					rval += isUnicode ? "u_" : "b_";
484 					rval += getClassName(classes, i->val);
485 					rval += "\"";
486 				}
487 				break;
488 			case kMatchElem_Type_Copy:
489 				rval += "<copy-ref id=\"";
490 				rval += i->tag;
491 				rval += "\"";
492 				break;
493 			default:
494 				rval += "<UNKNOWN type=\"";
495 				rval += asHex(i->type, 1);
496 				break;
497 		}
498 		if (i->negate)
499 			rval += " neg=\"1\"";
500 		if ((i->repeatMin != 1) && (i->repeatMin != 255)) {
501 			rval += " min=\"";
502 			rval += asDec(i->repeatMin);
503 			rval += "\"";
504 		}
505 		if ((i->repeatMax != 1) && (i->repeatMax != 255)) {
506 			rval += " max=\"";
507 			rval += asDec(i->repeatMax);
508 			rval += "\"";
509 		}
510 		if (i->tag.length() > 0) {
511 			if (i->type != kMatchElem_Type_Copy) {
512 				rval += " id=\"";
513 				rval += i->tag;
514 				rval += "\"";
515 			}
516 		}
517 		rval += "/>";
518 	}
519 	return rval;
520 }
521 
Compiler(const char * txt,UInt32 len,char inForm,bool cmp,bool genXML,TECkit_ErrorFn errFunc,void * userData)522 Compiler::Compiler(const char* txt, UInt32 len, char inForm, bool cmp, bool genXML, TECkit_ErrorFn errFunc, void* userData)
523 {
524 	compiledTable = 0;
525 	compiledSize = 0;
526 	usedExtStringRules = false;
527 
528 	textPtr = (const unsigned char*)txt;
529 	textEnd = textPtr + len;
530 
531 	ungotten = kInvalidChar;
532 	inputForm = inForm;
533 
534 	generateXML = genXML;
535 
536 	lineNumber = 1;
537 	errorState = false;
538 	errorCount = 0;
539 
540 	errorFunction = errFunc;
541 	errFuncUserData = userData;
542 
543 	names.clear();
544 	fwdTables.clear();
545 	revTables.clear();
546 	currentPass.clear();
547 	buildVars.clear();
548 	currentRule.clear();
549 
550 	lhsFlags = 0;
551 	rhsFlags = 0;
552 
553 	char	classType;
554 
555 	ruleState = notInRule;
556 	int	nestingLevel = 0;
557 
558 	defIter = defEnd;
559 
560 	while (inputForm == kForm_Unspecified) {
561 		// attempt to determine input encoding
562 		if (len >= 4) {
563 			// check for UTF32 BOM or 3 nulls in first 4 bytes
564 			if (strncmp(txt, "\0\0\376\377", 4) == 0) {
565 				inputForm = kForm_UTF32BE;
566 				break;
567 			}
568 			if (strncmp(txt, "\377\376\0\0", 4) == 0) {
569 				inputForm = kForm_UTF32LE;
570 				break;
571 			}
572 			if (strncmp(txt, "\0\0\0", 3) == 0) {
573 				inputForm = kForm_UTF32BE;
574 				break;
575 			}
576 			if (strncmp(txt+1, "\0\0\0", 3) == 0) {
577 				inputForm = kForm_UTF32LE;
578 				break;
579 			}
580 		}
581 		if (len >= 3) {
582 			// check for UTF8 signature
583 			if (strncmp(txt, "\357\273\277", 3) == 0) {
584 				inputForm = kForm_UTF8;
585 				break;
586 			}
587 		}
588 		if (len >= 2) {
589 			// check for UTF16 BOM or null byte
590 			if (strncmp(txt, "\376\377", 2) == 0) {
591 				inputForm = kForm_UTF16BE;
592 				break;
593 			}
594 			if (strncmp(txt, "\377\376", 2) == 0) {
595 				inputForm = kForm_UTF16LE;
596 				break;
597 			}
598 			if (txt[0] == '\0') {
599 				inputForm = kForm_UTF16BE;
600 				break;
601 			}
602 			if (txt[1] == '\0') {
603 				inputForm = kForm_UTF16LE;
604 				break;
605 			}
606 		}
607 		inputForm = kForm_Bytes;
608 	};
609 
610 	if (inputForm != kForm_Bytes) {
611 		// discard initial BOM if present
612 		UInt32 currCh = getChar();
613 		if (currCh != 0xfeff)
614 			ungetChar(currCh);
615 	}
616 
617 	while (GetNextToken()) {
618 		// on error, skip to next newline
619 	GOT_TOKEN:
620 		if (errorState) {
621 			if (ruleState != notInRule) {
622 				ruleState = notInRule;
623 				nestingLevel = 0;
624 				currentRule.clear();
625 			}
626 			if (tok.type != tok_Newline)
627 				continue;
628 		}
629 		errorState = false;
630 
631 		string32::const_iterator i;
632 		switch ((int)tok.type) {
633 			default:
634 				Error("this can't happen!");
635 				break;
636 
637 			case tok_Unknown:
638 				Error("unexpected character", string(1, tok.val).c_str());
639 				break;
640 
641 			case tok_Identifier:
642 				Error("unexpected identifier", asUTF8(tok.strval).c_str());
643 				break;
644 
645 			case tok_Define:
646 				if (!ExpectToken(tok_Identifier, "expected identifier after Define"))
647 					break;
648 				{
649 					string		defName(asUTF8(tok.strval));
650 					tokListT	defToks;
651 					while (GetNextToken()) {
652 						if (tok.type == tok_Newline)
653 							break;
654 						if (tok.type == tok_Unknown)
655 							Error("unexpected character in Define text", string(1, tok.val).c_str());
656 						else
657 							defToks.push_back(tok);
658 					}
659 					defines[defName] = defToks;
660 				}
661 				break;
662 
663 			case tok_Newline:
664 				switch (ruleState) {
665 					default:
666 						break;
667 
668 					case inLHSString:
669 					case inLHSPreContext:
670 					case inLHSPostContext:
671 						Error("no mapping operator found");
672 						goto GOT_TOKEN;
673 
674 					case inRHSString:
675 					case inRHSPreContext:
676 					case inRHSPostContext:
677 						if (nestingLevel > 0) {
678 							Error("unmatched opening parenthesis");
679 							break;
680 						}
681 						if (ruleType == 0 || ruleType == '>') {
682 							currentPass.fwdRules.push_back(Rule(currentRule.lhsString,
683 								reverseContext(currentRule.lhsPreContext), currentRule.lhsPostContext,
684 								currentRule.rhsString, currentRule.startingLine));
685 						}
686 						if (ruleType == 0 || ruleType == '<') {
687 							currentPass.revRules.push_back(Rule(currentRule.rhsString,
688 								reverseContext(currentRule.rhsPreContext), currentRule.rhsPostContext,
689 								currentRule.lhsString, currentRule.startingLine));
690 						}
691 						if (generateXML) {
692 							// create an XML representation of the rule and append to currentPass.xmlRules/xmlContexts
693 							bool	sourceUni = (currentPass.passType == kCode_UB) || (currentPass.passType == kCode_Unic);
694 							bool	targetUni = (currentPass.passType == kCode_BU) || (currentPass.passType == kCode_Unic);
695 
696 							string	xmlRule;
697 							xmlRule += "<a";
698 							xmlRule += " line=\"";
699 							xmlRule += asDec(currentRule.startingLine);
700 							xmlRule += "\"";
701 							if (ruleType == '>')
702 								xmlRule += " dir=\"fwd\"";
703 							else if (ruleType == '<')
704 								xmlRule += " dir=\"rev\"";
705 							xmlRule += ">\n";
706 
707 							string	contextID;
708 							xmlRule += "<l";
709 							if (currentRule.lhsPreContext.size() != 0) {
710 								contextID = getContextID(currentRule.lhsPreContext, sourceUni);
711 								xmlRule += " preCtx=\"";
712 								xmlRule + contextID;
713 								xmlRule += "\"";
714 							}
715 							if (currentRule.lhsPostContext.size() != 0) {
716 								contextID = getContextID(currentRule.lhsPostContext, sourceUni);
717 								xmlRule += " postCtx=\"";
718 								xmlRule += contextID;
719 								xmlRule += "\"";
720 							}
721 							xmlRule += ">";
722 							xmlRule += xmlString(currentRule.lhsString.begin(), currentRule.lhsString.end(), sourceUni);
723 							xmlRule += "</l>\n";
724 
725 							xmlRule += "<r";
726 							if (currentRule.rhsPreContext.size() != 0) {
727 								contextID = getContextID(currentRule.rhsPreContext, targetUni);
728 								xmlRule += " preCtx=\"";
729 								xmlRule += contextID;
730 								xmlRule += "\"";
731 							}
732 							if (currentRule.rhsPostContext.size() != 0) {
733 								contextID = getContextID(currentRule.rhsPostContext, targetUni);
734 								xmlRule += " postCtx=\"";
735 								xmlRule += contextID;
736 								xmlRule += "\"";
737 							}
738 							xmlRule += ">";
739 							xmlRule += xmlString(currentRule.rhsString.begin(), currentRule.rhsString.end(), targetUni);
740 							xmlRule += "</r>\n";
741 
742 							xmlRule += "</a>\n";
743 							currentPass.xmlRules.push_back(xmlRule);
744 						}
745 						currentRule.clear();
746 						ruleState = notInRule;
747 						break;
748 				}
749 				break;
750 
751 			case tok_Number:
752 				AppendLiteral(tok.val);
753 				break;
754 
755 			case tok_USV:
756 				AppendUSV(tok.val);
757 				break;
758 
759 			case tok_String:
760 				if (inputForm == kForm_Bytes && charLimit() != 0xff) {
761 					Error("can't use quoted string for Unicodes in 8-bit source text");
762 					break;
763 				}
764 				if (inputForm != kForm_Bytes && charLimit() == 0xff) {
765 					Error("can't use quoted string for Bytes in Unicode source text");
766 					break;
767 				}
768 				for (i = tok.strval.begin(); i != tok.strval.end(); ++i)
769 					AppendLiteral(*i);
770 				break;
771 
772 			case '^':
773 				// negation can only apply to a few things:
774 				GetNextToken();
775 				switch ((int)tok.type) {
776 					case tok_Number:
777 						AppendLiteral(tok.val, true);
778 						break;
779 					case tok_USV:
780 						AppendUSV(tok.val, true);
781 						break;
782 					case '[':
783 						if (!ExpectToken(tok_Identifier, "expected CLASS-NAME after opening bracket"))
784 							break;
785 						AppendClass(asUTF8(tok.strval), true);
786 						if (!ExpectToken(']', "expected closing bracket after CLASS-NAME"))
787 							break;
788 						break;
789 					case '#':
790 						AppendSpecial(kMatchElem_Type_EOS, true);
791 						break;
792 					case '.':
793 						AppendSpecial(kMatchElem_Type_ANY, true);
794 						break;
795 					default:
796 						Error("invalid use of negation");
797 						break;
798 				}
799 				break;
800 
801 			case '#':
802 				AppendSpecial(kMatchElem_Type_EOS);
803 				break;
804 
805 			case '.':
806 				AppendSpecial(kMatchElem_Type_ANY);
807 				break;
808 
809 			case '(':
810 				AppendSpecial(kMatchElem_Type_BGroup);
811 				++nestingLevel;
812 				break;
813 
814 			case ')':
815 				if (nestingLevel == 0) {
816 					Error("unmatched closing parenthesis");
817 					break;
818 				}
819 				--nestingLevel;
820 				AppendSpecial(kMatchElem_Type_EGroup);
821 				break;
822 
823 			case '|':
824 				if (nestingLevel == 0) {
825 					Error("alternation only permitted within parentheses");
826 					break;
827 				}
828 				AppendSpecial(kMatchElem_Type_OR);
829 				break;
830 
831 			case tok_Map:
832 			case '>':
833 			case '<':
834 				if (nestingLevel > 0) {
835 					Error("unmatched opening parenthesis");
836 					break;
837 				}
838 				switch (ruleState) {
839 					default:
840 						Error("not within a mapping rule");
841 						break;
842 					case inLHSString:
843 					case inLHSPostContext:
844 						ruleState = inRHSString;
845 						ruleType = (tok.type == tok_Map ? 0 : (tok.type == '>' ? '>' : '<'));
846 						break;
847 					case inLHSPreContext:
848 						Error("no underscore found in context");
849 						break;
850 					case inRHSString:
851 					case inRHSPreContext:
852 					case inRHSPostContext:
853 						Error("extra mapping operator in rule");
854 						break;
855 				}
856 				break;
857 
858 			case '/':
859 				if (nestingLevel > 0) {
860 					Error("unmatched opening parenthesis");
861 					break;
862 				}
863 				switch (ruleState) {
864 					default:
865 						Error("not within a mapping rule");
866 						break;
867 					case inLHSString:
868 						ruleState = inLHSPreContext;
869 						break;
870 					case inRHSString:
871 						ruleState = inRHSPreContext;
872 						break;
873 					case inLHSPreContext:
874 					case inLHSPostContext:
875 					case inRHSPreContext:
876 					case inRHSPostContext:
877 						Error("extra slash in rule");
878 						break;
879 				}
880 				break;
881 
882 			case '_':
883 				if (nestingLevel > 0) {
884 					Error("unmatched opening parenthesis");
885 					break;
886 				}
887 				switch (ruleState) {
888 					default:
889 						Error("not within a mapping rule");
890 						break;
891 					case inLHSPreContext:
892 						ruleState = inLHSPostContext;
893 						break;
894 					case inRHSPreContext:
895 						ruleState = inRHSPostContext;
896 						break;
897 					case inLHSString:
898 					case inRHSString:
899 						Error("underscore only allowed in context");
900 						break;
901 					case inLHSPostContext:
902 					case inRHSPostContext:
903 						Error("extra underscore in context");
904 						break;
905 				}
906 				break;
907 
908 			case '[':
909 				if (!ExpectToken(tok_Identifier, "expected CLASS-NAME after opening bracket"))
910 					break;
911 				AppendClass(asUTF8(tok.strval));
912 				if (!ExpectToken(']', "expected closing bracket after CLASS-NAME"))
913 					break;
914 				break;
915 
916 			case ']':
917 				Error("unmatched closing bracket");
918 				break;
919 
920 			case '=':
921 				if (!ExpectToken(tok_Identifier, "expected tag name after '='"))
922 					break;
923 				AssignTag(asUTF8(tok.strval));
924 				break;
925 
926 			case '@':
927 				if (!ExpectToken(tok_Identifier, "expected tag name after '@'"))
928 					break;
929 				AppendSpecial(kMatchElem_Type_Copy);
930 				AssignTag(asUTF8(tok.strval));
931 				break;
932 
933 			case '?':
934 				SetMinMax(0, 1);
935 				break;
936 
937 			case '*':
938 				SetMinMax(0, 15);
939 				break;
940 
941 			case '+':
942 				SetMinMax(1, 15);
943 				break;
944 
945 			case '{':
946 				{
947 					int	repeatMin = 0;
948 					int repeatMax = 15;
949 					GetNextToken();
950 					if (tok.type == tok_Number) {
951 						repeatMin = repeatMax = tok.val;
952 						GetNextToken();
953 						if (tok.type == ',') {
954 							GetNextToken();
955 							if (tok.type == tok_Number) {
956 								repeatMax = tok.val;
957 								if (!ExpectToken('}', "expected closing brace after repeat counts"))
958 									break;
959 							}
960 							else if (tok.type == '}')
961 								repeatMax = 15;
962 							else {
963 								Error("expected repeat count or closing brace after comma");
964 								break;
965 							}
966 						}
967 						else if (tok.type != '}') {
968 							Error("expected comma or closing brace after repeat count");
969 							break;
970 						}
971 					}
972 					else if (tok.type == ',') {
973 						GetNextToken();
974 						if (tok.type == tok_Number)
975 							repeatMax = tok.val;
976 						else {
977 							Error("expected repeat count");
978 							break;
979 						}
980 						if (!ExpectToken('}', "expected closing brace after repeat count"))
981 							break;
982 					}
983 					else {
984 						Error("expected repeat counts within braces");
985 						break;
986 					}
987 					SetMinMax(repeatMin, repeatMax);
988 				}
989 				break;
990 
991 			case '}':
992 				Error("unmatched closing brace");
993 				break;
994 
995 			case tok_Name:
996 				if (tok.val == 0xffffffff) {
997 					if (!ExpectToken('(', "expected (NUMBER) STRING after Name"))
998 						break;
999 					if (!ExpectToken(tok_Number,  "expected (NUMBER) STRING after Name"))
1000 						break;
1001 					int nameID = tok.val;
1002 					if (!ExpectToken(')', "expected (NUMBER) STRING after Name"))
1003 						break;
1004 					ReadNameString(nameID);
1005 				}
1006 				else
1007 					ReadNameString(tok.val);
1008 				goto GOT_TOKEN;	// ReadNameString has already read the newline
1009 
1010 			case tok_Flags:
1011 				{
1012 					if (!ExpectToken('(', "expected (FLAG-LIST) after SourceFlags/TargetFlags"))
1013 						break;
1014 					UInt32	flagValue = 0;
1015 					char	whichFlags = tok.val;
1016 					while (1) {
1017 						GetNextToken();
1018 						if (tok.type == tok_FlagValue)
1019 							flagValue |= tok.val;
1020 						else
1021 							break;
1022 					}
1023 					if (tok.type != ')') {
1024 						Error("expected (FLAG-LIST) after SourceFlags/TargetFlags");
1025 						break;
1026 					}
1027 					if (whichFlags == 'S')
1028 						lhsFlags = flagValue;
1029 					else
1030 						rhsFlags = flagValue;
1031 				}
1032 				ExpectToken(tok_Newline, "junk at end of line");
1033 				break;
1034 
1035 			case tok_Pass:
1036 				FinishPass();
1037 				currentPass.setLineNo(lineNumber);
1038 				if (!ExpectToken('(', "expected (PASS-TYPE) after Pass"))
1039 					break;
1040 				GetNextToken();
1041 				if (tok.type == tok_PassType)
1042 					currentPass.passType = tok.val;
1043 				else
1044 					Error("unrecognized pass type");
1045 				if (!ExpectToken(')', "expected (PASS-TYPE) after Pass"))
1046 					break;
1047 				ExpectToken(tok_Newline, "junk at end of line");
1048 				goto  GOT_TOKEN;
1049 
1050 			case tok_Default:
1051 				StartDefaultPass();
1052 				if (currentPass.passType != kCode_BU && currentPass.passType != kCode_UB) {
1053 					Error("defaults are only used in Byte_Unicode and Unicode_Byte passes");
1054 					break;
1055 				}
1056 				{
1057 					char	whichDefault = tok.val;
1058 					GetNextToken();
1059 					switch (tok.type) {
1060 						case tok_String:
1061 							if (tok.strval.length() != 1)
1062 								Error("default can only be a single character, not a multi-character string");
1063 							else if (whichDefault == 'U') {
1064 								if (inputForm == kForm_Bytes)
1065 									Error("UniDefault cannot use quoted character in 8-bit source text");
1066 								else
1067 									currentPass.uniDefault = tok.strval[0];
1068 							}
1069 							else {
1070 								if (inputForm != kForm_Bytes)
1071 									Error("ByteDefault cannot use quoted character in Unicode source text");
1072 								else
1073 									currentPass.byteDefault = tok.strval[0];
1074 							}
1075 							break;
1076 						case tok_Number:
1077 							if (whichDefault == 'U')
1078 								currentPass.uniDefault = tok.val;
1079 							else
1080 								currentPass.byteDefault = tok.val;
1081 							break;
1082 						case tok_USV:
1083 							if (whichDefault == 'U')
1084 								currentPass.uniDefault = tok.val;
1085 							else
1086 								Error("can't use Unicode value in byte encoding");
1087 							break;
1088 						default:
1089 							Error("expected character code after ByteDefault/UniDefault");
1090 							break;
1091 					}
1092 				}
1093 				break;
1094 
1095 			case tok_Class:
1096 				StartDefaultPass();
1097 				classLine = lineNumber;
1098 				if (tok.val == 0) {
1099 					if (currentPass.passType == kCode_Byte)
1100 						classType = 'B';
1101 					else if (currentPass.passType == kCode_Unic)
1102 						classType = 'U';
1103 					else {
1104 						Error("must use ByteClass or UniClass to define classes in this pass");
1105 						break;
1106 					}
1107 				}
1108 				else {
1109 					classType = tok.val;
1110 					if (classType == 'B' && currentPass.passType == kCode_Unic)
1111 						Error("can't use ByteClass in this pass");
1112 					else if (classType == 'U' && currentPass.passType == kCode_Byte)
1113 						Error("can't use UniClass in this pass");
1114 				}
1115 				{
1116 					UInt32	classLimit = (classType == 'U' ? 0x10ffff : 0xff);
1117 					if (!ExpectToken('[', "expected [CLASS-NAME] after Class/ByteClass/UniClass"))
1118 						break;
1119 					if (!ExpectToken(tok_Identifier, "expected [CLASS-NAME] after Class/ByteClass/UniClass"))
1120 						break;
1121 					string	className(asUTF8(tok.strval));
1122 					if (!ExpectToken(']', "expected [CLASS-NAME] after Class/ByteClass/UniClass"))
1123 						break;
1124 					if (!ExpectToken('=', "expected =(CHARACTER-CODE-LIST) after Class/ByteClass/UniClass[CLASS-NAME]"))
1125 						break;
1126 					if (!ExpectToken('(', "expected =(CHARACTER-CODE-LIST) after Class/ByteClass/UniClass[CLASS-NAME]"))
1127 						break;
1128 					vector<UInt32>	classMembers;
1129 					bool	ellipsis = false;
1130 					bool	ellipsisOK = false;
1131 					while (tok.type != ')' && tok.type != tok_Newline) {
1132 						GetNextToken();
1133 						switch ((int)tok.type) {
1134 							case tok_USV:
1135 								if (classType == 'B') {
1136 									Error("can't use Unicode value in byte encoding");
1137 									break;
1138 								}
1139 								// fall through
1140 							case tok_Number:
1141 								if (tok.val > classLimit) {
1142 									Error("class element outside valid range");
1143 									break;
1144 								}
1145 								if (ellipsis) {
1146 									ellipsis = false;
1147 									ellipsisOK = false;
1148 									UInt32	lastVal = classMembers.back();
1149 									if (tok.val < lastVal) {
1150 										Error("range out of order");
1151 										break;
1152 									}
1153 									while (++lastVal <= tok.val)
1154 										classMembers.push_back(lastVal);
1155 								}
1156 								else {
1157 									classMembers.push_back(tok.val);
1158 									ellipsisOK = true;
1159 								}
1160 								if (classMembers.back() > 0x0000ffff)
1161 									currentPass.supplementaryChars = true;
1162 								break;
1163 
1164 							case tok_String:
1165 								if (classType == 'U' && inputForm == kForm_Bytes) {
1166 									Error("can't use quoted string for Unicode class in 8-bit source text");
1167 									break;
1168 								}
1169 								if (classType == 'B' && inputForm != kForm_Bytes) {
1170 									Error("can't use quoted string for Byte class in Unicode source text");
1171 									break;
1172 								}
1173 								if (ellipsis) {
1174 									ellipsis = false;
1175 									ellipsisOK = false;
1176 									if (tok.strval.length() != 1) {
1177 										Error("can only use single-character string with ..");
1178 										break;
1179 									}
1180 									UInt32	lastVal = classMembers.back();
1181 									if (tok.strval[0] < lastVal) {
1182 										Error("range out of order");
1183 										break;
1184 									}
1185 									while (++lastVal <= tok.strval[0])
1186 										classMembers.push_back(lastVal);
1187 									break;
1188 								}
1189 								ellipsisOK = (tok.strval.length() == 1);
1190 								for (i = tok.strval.begin(); i < tok.strval.end(); ++i)
1191 									classMembers.push_back(*i);
1192 								break;
1193 
1194 							case tok_Ellipsis:
1195 								if (ellipsisOK) {
1196 									ellipsisOK = false;
1197 									ellipsis = true;
1198 								}
1199 								else
1200 									Error("illegal .. in class");
1201 								break;
1202 
1203 							case '[':
1204 								{
1205 									if (ellipsis) {
1206 										Error("can't use [CLASS-NAME] after ..");
1207 										break;
1208 									}
1209 									ellipsis = false;
1210 									ellipsisOK = false;
1211 									// get the referenced class and copy in its members
1212 									if (ExpectToken(tok_Identifier, "expected [CLASS-NAME]")) {
1213 										string	refName(asUTF8(tok.strval));
1214 										if (classType == 'U') {
1215 											map<string,UInt32>::const_iterator	c = currentPass.uniClassNames.find(refName);
1216 											if (c == currentPass.uniClassNames.end()) {
1217 												Error("undefined class used", refName.c_str());
1218 												break;
1219 											}
1220 											Class	uc = currentPass.uniClassMembers[c->second];
1221 											for (Class::const_iterator i = uc.begin(); i != uc.end(); ++i)
1222 												classMembers.push_back(*i);
1223 										}
1224 										else {
1225 											map<string,UInt32>::const_iterator	c = currentPass.byteClassNames.find(refName);
1226 											if (c == currentPass.byteClassNames.end()) {
1227 												Error("undefined class used", refName.c_str());
1228 												break;
1229 											}
1230 											Class	bc = currentPass.byteClassMembers[c->second];
1231 											for (Class::const_iterator i = bc.begin(); i != bc.end(); ++i)
1232 												classMembers.push_back(*i);
1233 										}
1234 										if (!ExpectToken(']', "expected closing bracket after CLASS-NAME"))
1235 											break;
1236 									}
1237 								}
1238 								break;
1239 
1240 							case ')':
1241 								if (ellipsis)
1242 									Error("trailing .. in class");
1243 								break;
1244 
1245 							case tok_Newline:
1246 								Error("unexpected end of line within class");
1247 								break;
1248 
1249 							case tok_Identifier:
1250 								Error("unexpected identifier within class", asUTF8(tok.strval).c_str());
1251 								break;
1252 
1253 							default:
1254 								Error("unexpected token within class", string((const char*)tokStart, (const char*)textPtr - (const char*)tokStart).c_str());
1255 								break;
1256 						}
1257 					}
1258 					if (tok.type != tok_Newline)
1259 						if (!ExpectToken(tok_Newline, "junk at end of line"))
1260 							break;
1261 					// ok, we've got the class name and members; save it
1262 					if (classType == 'U') {
1263 						if (currentPass.uniClassNames.find(className) != currentPass.uniClassNames.end()) {
1264 							Error("class already defined", className.c_str());
1265 							break;
1266 						}
1267 						currentPass.uniClassNames[className] = currentPass.uniClassMembers.size();
1268 						currentPass.uniClassMembers.push_back(classMembers);
1269 						currentPass.uniClassLines.push_back(classLine);
1270 					}
1271 					else {
1272 						if (currentPass.byteClassNames.find(className) != currentPass.byteClassNames.end()) {
1273 							Error("class already defined", className.c_str());
1274 							break;
1275 						}
1276 						currentPass.byteClassNames[className] = currentPass.byteClassMembers.size();
1277 						currentPass.byteClassMembers.push_back(classMembers);
1278 						currentPass.byteClassLines.push_back(classLine);
1279 					}
1280 					goto GOT_TOKEN;
1281 				}
1282 				break;
1283 		}
1284 	}
1285 	FinishPass();
1286 
1287 	// Do we have names for both LHS and RHS? If not, is LHS legacy and RHS Unicode?
1288 	if (names.find(kNameID_LHS_Name) == names.end()) {
1289 		Error("EncodingName or LHSName must be specified");
1290 	}
1291 	const string&   lhs = names[kNameID_LHS_Name];
1292 	if (lhs.find("(REG_ID)") != lhs.npos) {
1293 		Error("Draft mappings generated by Encore2Unicode MUST be reviewed before use");
1294 	}
1295 	if (names.find(kNameID_RHS_Name) == names.end()) {
1296 		if ((lhsFlags & kFlags_Unicode) == 0 || (rhsFlags & kFlags_Unicode) != 0) {
1297 			names[kNameID_RHS_Name] = "UNICODE";
1298 		}
1299 		else {
1300 			Error("RHSName must be specified for non-Legacy/Unicode mapping table");
1301 		}
1302 	}
1303 
1304 	if (errorCount == 0) {
1305 		if (generateXML) {
1306 			string	header;
1307 			header += "<?xml version=\"1.0\"?>\n";
1308 			header += "<teckitMapping\n";
1309 
1310 #define doName(att,name_id)								\
1311 			if (names.find(name_id) != names.end()) {	\
1312 				header += " ";							\
1313 				header += att;							\
1314 				header += "=\"";						\
1315 				header += names[name_id];				\
1316 				header += "\"\n";						\
1317 			}
1318 
1319 			doName("lhsName", kNameID_LHS_Name);
1320 			doName("rhsName", kNameID_RHS_Name);
1321 			doName("lhsDescription", kNameID_LHS_Description);
1322 			doName("rhsDescription", kNameID_RHS_Description);
1323 			doName("version", kNameID_Version);
1324 			doName("contact", kNameID_Contact);
1325 			doName("registrationAuthority", kNameID_RegAuthority);
1326 			doName("registrationName", kNameID_RegName);
1327 			doName("copyright", kNameID_Copyright);
1328 
1329 			if (lhsFlags & kFlags_ExpectsNFC)
1330 				header += " lhsExpects=\"NFC\"\n";
1331 			else if (lhsFlags & kFlags_ExpectsNFD)
1332 				header += " lhsExpects=\"NFD\"\n";
1333 			if (rhsFlags & kFlags_ExpectsNFC)
1334 				header += " rhsExpects=\"NFC\"\n";
1335 			else if (rhsFlags & kFlags_ExpectsNFD)
1336 				header += " rhsExpects=\"NFD\"\n";
1337 
1338 			header += ">\n";
1339 
1340 			string	trailer("</teckitMapping>\n");
1341 
1342 			compiledSize = header.length() + xmlRepresentation.length() + trailer.length();
1343 			compiledTable = (Byte*)malloc(compiledSize + 1);
1344 			if (compiledTable == NULL)
1345 				throw bad_alloc();
1346 
1347 			memcpy(compiledTable, header.data(), header.length());
1348 			memcpy(compiledTable + header.length(), xmlRepresentation.data(), xmlRepresentation.length());
1349 			memcpy(compiledTable + header.length() + xmlRepresentation.length(), trailer.data(), trailer.length());
1350 			compiledTable[compiledSize] = 0;
1351 
1352 			xmlRepresentation.erase(xmlRepresentation.begin(), xmlRepresentation.end());
1353 		}
1354 		else {
1355 			// assemble the complete compiled file
1356 			FileHeader	fh;
1357 			WRITE(fh.type, kMagicNumber);
1358 			WRITE(fh.version, usedExtStringRules ? kCurrentFileVersion : kFileVersion2_1);
1359 			WRITE(fh.headerLength, 0);	// to be filled in later, once names and table counts are known
1360 			WRITE(fh.formFlagsLHS, lhsFlags);
1361 			WRITE(fh.formFlagsRHS, rhsFlags);
1362 
1363 			WRITE(fh.numFwdTables, fwdTables.size());
1364 			WRITE(fh.numRevTables, revTables.size());
1365 			WRITE(fh.numNames, names.size());
1366 
1367 			string	offsets;
1368 			UInt32	offset = sizeof(FileHeader) + (names.size() + fwdTables.size() + revTables.size()) * sizeof(UInt32);
1369 			UInt32	prevLength = 0;
1370 
1371 			// sort the name IDs into ascending order
1372 			vector<UInt16>	nameIDs;
1373 			nameIDs.reserve(names.size());
1374 			for (map<UInt16,string>::const_iterator n = names.begin(); n != names.end(); ++n) {
1375 				nameIDs.push_back(n->first);
1376 			}
1377 			sort(nameIDs.begin(), nameIDs.end());
1378 
1379 			// pack all the name records
1380 			string	namesData;
1381 			for (vector<UInt16>::const_iterator i = nameIDs.begin(); i != nameIDs.end(); ++i) {
1382 				appendToTable(offsets, (const char*)&offset, sizeof(offset));
1383 				NameRec	r;
1384 				WRITE(r.nameID, *i);
1385 				WRITE(r.nameLength, names[*i].length());
1386 				namesData.append((const char*)&r, sizeof(r));
1387 				namesData.append(names[*i]);
1388 				if ((namesData.length() & 1) != 0)
1389 					namesData.append(1, (char)0);
1390 				offset += namesData.length() - prevLength;
1391 				prevLength = namesData.length();
1392 			}
1393 			if ((namesData.length() & 2) != 0)
1394 				namesData.append(2, (char)0);
1395 			offset += namesData.length() - prevLength;
1396 
1397 			// pack the offsets to the actual mapping tables
1398 			vector<string>::const_iterator t;
1399 			for (t = fwdTables.begin(); t != fwdTables.end(); ++t) {
1400 				appendToTable(offsets, (const char*)&offset, sizeof(offset));
1401 				offset += t->size();
1402 			}
1403 			for (t = revTables.end(); t != revTables.begin(); ) {
1404 				--t;
1405 				appendToTable(offsets, (const char*)&offset, sizeof(offset));
1406 				offset += t->size();
1407 			}
1408 
1409 			WRITE(fh.headerLength, sizeof(fh) + offsets.length() + namesData.length());
1410 
1411 			if (errorCount == 0) {
1412 				// calculate total size of compiled table, malloc() it, and copy everything into it
1413 				compiledSize = sizeof(fh)
1414 							+ offsets.length()
1415 							+ namesData.length();
1416 				for (t = fwdTables.begin(); t != fwdTables.end(); ++t)
1417 					compiledSize += t->length();
1418 				for (t = revTables.begin(); t != revTables.end(); ++t)
1419 					compiledSize += t->length();
1420 
1421 				compiledTable = (Byte*)malloc(compiledSize);
1422 				if (compiledTable != 0) {
1423 					char*	cp = (char*)compiledTable;
1424 					memcpy(cp, &fh, sizeof(fh));
1425 					cp += sizeof(fh);
1426 					memcpy(cp, offsets.data(), offsets.length());
1427 					cp += offsets.length();
1428 					memcpy(cp, namesData.data(), namesData.length());
1429 					cp += namesData.length();
1430 					for (t = fwdTables.begin(); t != fwdTables.end(); ++t) {
1431 						memcpy(cp, t->data(), t->length());
1432 						cp += t->length();
1433 					}
1434 					for (t = revTables.end(); t != revTables.begin(); ) {
1435 						--t;
1436 						memcpy(cp, t->data(), t->length());
1437 						cp += t->length();
1438 					}
1439 					if ((char*)compiledTable + compiledSize != cp)
1440 						cerr << "error!" << endl;
1441 				}
1442 				else
1443 					throw bad_alloc();
1444 			}
1445 
1446 			if (errorCount == 0 && cmp) {
1447 				// do the compression...
1448 				unsigned long	destLen = compiledSize * 11 / 10 + 20;
1449 				Byte*	dest = (Byte*)malloc(destLen + 8);
1450 				if (dest != 0) {
1451 					int	result = compress2(dest + 8, &destLen, compiledTable, compiledSize, Z_BEST_COMPRESSION);
1452 					if (result == Z_OK) {
1453 						destLen += 8;
1454 						dest = (Byte*)realloc(dest, destLen); // shrink dest to fit
1455 						WRITE(((FileHeader*)dest)->type, kMagicNumberCmp);
1456 						WRITE(((FileHeader*)dest)->version, compiledSize);
1457 						free(compiledTable);
1458 						compiledTable = dest;
1459 						compiledSize = destLen;
1460 					}
1461 					else
1462 						free(dest);
1463 				}
1464 			}
1465 		}
1466 	}
1467 }
1468 
~Compiler()1469 Compiler::~Compiler()
1470 {
1471 	if (compiledTable != 0)
1472 		free(compiledTable);
1473 }
1474 
1475 void
GetCompiledTable(Byte * & table,UInt32 & len) const1476 Compiler::GetCompiledTable(Byte*& table, UInt32& len) const
1477 {
1478 	table = compiledTable;
1479 	len = compiledSize;
1480 }
1481 
1482 void
DetachCompiledTable()1483 Compiler::DetachCompiledTable()
1484 {
1485 	compiledTable = 0;
1486 	compiledSize = 0;
1487 }
1488 
1489 string
asUTF8(const string32 s)1490 Compiler::asUTF8(const string32 s)
1491 {
1492 	string	rval;
1493 	string32::const_iterator i;
1494 	for (i = s.begin(); i != s.end(); ++i) {
1495 		UInt32	c = *i;
1496 		int	bytesToWrite;
1497 		if (c < 0x80) {				bytesToWrite = 1;
1498 		} else if (c < 0x800) {		bytesToWrite = 2;
1499 		} else if (c < 0x10000) {	bytesToWrite = 3;
1500 		} else if (c < 0x200000) {	bytesToWrite = 4;
1501 		} else {					bytesToWrite = 2;
1502 									c = 0x0000fffd;
1503 		};
1504 		rval.append((size_t)bytesToWrite, 0);
1505 		int index = rval.length();
1506 		switch (bytesToWrite) {	/* note: code falls through cases! */
1507 			case 4:	rval[--index] = (c | byteMark) & byteMask; c >>= 6;
1508 			case 3:	rval[--index] = (c | byteMark) & byteMask; c >>= 6;
1509 			case 2:	rval[--index] = (c | byteMark) & byteMask; c >>= 6;
1510 			case 1:	rval[--index] =  c | firstByteMark[bytesToWrite];
1511 		};
1512 	}
1513 	return rval;
1514 }
1515 
1516 void
ReadNameString(UInt16 nameID)1517 Compiler::ReadNameString(UInt16 nameID)
1518 {
1519 	if (ExpectToken(tok_String, "expected STRING after name keyword")) {
1520 		if (inputForm == kForm_Bytes) {
1521 			names[nameID].erase(names[nameID].begin(), names[nameID].end());
1522 			for (string32::const_iterator i = tok.strval.begin(); i != tok.strval.end(); ++i)
1523 				names[nameID].append(1, *i);
1524 		}
1525 		else
1526 			names[nameID] = asUTF8(tok.strval);
1527 		ExpectToken(tok_Newline, "junk at end of line");
1528 	}
1529 }
1530 
1531 void
FinishPass()1532 Compiler::FinishPass()
1533 {
1534 	if (currentPass.passType == 0)
1535 		return;
1536 
1537 	if ((currentPass.passType & 0xFFFF0000) == (FOUR_CHAR_CODE('N','F','_','_') & 0xFFFF0000)) {
1538 		while (errorCount == 0) {
1539 			if (fwdTables.size() == 0)
1540 				lhsFlags |= kFlags_Unicode;
1541 			else {
1542 				if ((rhsFlags & kFlags_Unicode) == 0) {
1543 					Error("normalization only supported in Unicode space");
1544 					break;
1545 				}
1546 			}
1547 			rhsFlags |= kFlags_Unicode;
1548 			string	normTable((currentPass.passType & 0x0000FF00) == (FOUR_CHAR_CODE('_','_','C','_') & 0x0000FF00)
1549 								? "NFC " : "NFD ");
1550 			if ((currentPass.passType & 0x000000FF) != 'r')
1551 				fwdTables.push_back(normTable);
1552 			if ((currentPass.passType & 0x000000FF) != 'f')
1553 				revTables.push_back(normTable);
1554 			if (generateXML) {
1555 				xmlOut("<pass lhs=\"unicode\" rhs=\"unicode\" line=\"");
1556 				xmlOut(asDec(currentPass.startingLine));
1557 				xmlOut("\">\n");
1558 				xmlOut("<normalize form=\"");
1559 				xmlOut(normTable[2]);
1560 				if ((currentPass.passType & 0x000000FF) == 'f')
1561 					xmlOut(" dir=\"fwd\"");
1562 				else if ((currentPass.passType & 0x000000FF) == 'r')
1563 					xmlOut(" dir=\"rev\"");
1564 				xmlOut("\">\n");
1565 				xmlOut("</pass>\n");
1566 			}
1567 			break;
1568 		}
1569 	}
1570 	else {
1571 		while (errorCount == 0) {
1572 			// not really a loop; just so we can use 'break' to exit early
1573 			bool	sourceUni = (currentPass.passType == kCode_UB) || (currentPass.passType == kCode_Unic);
1574 			bool	targetUni = (currentPass.passType == kCode_BU) || (currentPass.passType == kCode_Unic);
1575 
1576 			if (generateXML) {
1577 				// pass header
1578 				xmlOut("<pass lhs=\"");
1579 				xmlOut(sourceUni ? "unicode" : "bytes");
1580 				xmlOut("\" rhs=\"");
1581 				xmlOut(targetUni ? "unicode" : "bytes");
1582 				if (sourceUni != targetUni) {
1583 					xmlOut("\" lhsDefault=\"");
1584 					xmlOut(sourceUni ? asHex(currentPass.uniDefault, 4) : asHex(currentPass.byteDefault, 2));
1585 					xmlOut("\" rhsDefault=\"");
1586 					xmlOut(targetUni ? asHex(currentPass.uniDefault, 4) : asHex(currentPass.byteDefault, 2));
1587 				}
1588 				xmlOut("\" line=\"");
1589 				xmlOut(asDec(currentPass.startingLine));
1590 				xmlOut("\">\n");
1591 
1592 				// class definitions
1593 				if (currentPass.byteClassMembers.size() > 0 || currentPass.uniClassMembers.size() > 0) {
1594 					xmlOut("<classes>\n");
1595 					unsigned int i;
1596 					for (i = 0; i < currentPass.byteClassMembers.size(); ++i) {
1597 						xmlOut("<class size=\"bytes\" name=\"b_");
1598 						xmlOut(getClassName(currentPass.byteClassNames, i));
1599 						xmlOut("\" line=\"");
1600 						xmlOut(asDec(currentPass.byteClassLines[i]));
1601 						xmlOut("\">");
1602 						for (Class::const_iterator ci = currentPass.byteClassMembers[i].begin(); ci != currentPass.byteClassMembers[i].end(); ++ci) {
1603 							xmlOut(ci == currentPass.byteClassMembers[i].begin() ? "\n" : " ");
1604 							xmlOut(asHex(*ci, 2));
1605 						}
1606 						xmlOut("\n</class>\n");
1607 					}
1608 					for (i = 0; i < currentPass.uniClassMembers.size(); ++i) {
1609 						xmlOut("<class size=\"unicode\" name=\"u_");
1610 						xmlOut(getClassName(currentPass.uniClassNames, i));
1611 						xmlOut("\" line=\"");
1612 						xmlOut(asDec(currentPass.uniClassLines[i]));
1613 						xmlOut("\">");
1614 						for (Class::const_iterator ci = currentPass.uniClassMembers[i].begin(); ci != currentPass.uniClassMembers[i].end(); ++ci) {
1615 							xmlOut(ci == currentPass.uniClassMembers[i].begin() ? "\n" : " ");
1616 							xmlOut(asHex(*ci, 4));
1617 						}
1618 						xmlOut("\n</class>\n");
1619 					}
1620 					xmlOut("</classes>\n");
1621 				}
1622 
1623 				if (currentPass.xmlContexts.size() > 0) {
1624 					xmlOut("<contexts>\n");
1625 					for (map<string,string>::const_iterator i = currentPass.xmlContexts.begin();
1626 							i != currentPass.xmlContexts.end(); ++i) {
1627 						xmlOut("<context id=\"");
1628 						xmlOut(i->second);
1629 						xmlOut("\">");
1630 						xmlOut(i->first);
1631 						xmlOut("</context>\n");
1632 					}
1633 					xmlOut("</contexts>\n");
1634 				}
1635 
1636 				xmlOut("<assignments>\n");
1637 				for (vector<string>::const_iterator i = currentPass.xmlRules.begin();
1638 						i != currentPass.xmlRules.end(); ++i) {
1639 					xmlOut(*i);
1640 				}
1641 				xmlOut("</assignments>\n");
1642 
1643 				// end pass
1644 				xmlOut("</pass>\n");
1645 			}
1646 
1647 			if (fwdTables.size() == 0) {
1648 				if (sourceUni)
1649 					lhsFlags |= kFlags_Unicode;
1650 			}
1651 			else {
1652 				if (sourceUni != ((rhsFlags & kFlags_Unicode) != 0)) {
1653 					Error("code space mismatch");
1654 					break;
1655 				}
1656 			}
1657 			rhsFlags &= ~kFlags_Unicode;
1658 			if (targetUni)
1659 				rhsFlags |= kFlags_Unicode;
1660 
1661 			// deal with COPY on LHS, and set up class/copy replacement index fields
1662 			associateItems(currentPass.fwdRules, sourceUni, targetUni);
1663 			if (errorCount > 0)
1664 				break;
1665 
1666 			setGroupPointers(currentPass.fwdRules);
1667 
1668 			// sort rules by length (also propagates repeat counts from EGroup back to BGroup items)
1669 			sortRules(currentPass.fwdRules);
1670 			if (errorCount > 0)
1671 				break;
1672 
1673 			// build the forward table
1674 			fwdTables.push_back(string());
1675 			buildTable(currentPass.fwdRules, sourceUni, targetUni, fwdTables.back());
1676 			buildVars.clear();
1677 			if (errorCount > 0)
1678 				break;
1679 
1680 			// build the reverse table
1681 			associateItems(currentPass.revRules, targetUni, sourceUni);
1682 			if (errorCount > 0)
1683 				break;
1684 			setGroupPointers(currentPass.revRules);
1685 			sortRules(currentPass.revRules);
1686 			if (errorCount > 0)
1687 				break;
1688 			revTables.push_back(string());
1689 			buildTable(currentPass.revRules, targetUni, sourceUni, revTables.back());
1690 			buildVars.clear();
1691 			break;
1692 		}
1693 	}
1694 	currentPass.clear();
1695 	currentPass.setLineNo(lineNumber);
1696 }
1697 
1698 void
SkipSpaces(void)1699 Compiler::SkipSpaces(void)
1700 {
1701 	while (textPtr < textEnd) {
1702 		UInt32 currCh = getChar();
1703 		if (currCh != ' ' && currCh != '\t') {
1704 			ungetChar(currCh);
1705 			break;
1706 		}
1707 	}
1708 }
1709 
1710 Compiler::tokenType
IDlookup(const char * str,UInt32 len)1711 Compiler::IDlookup(const char* str, UInt32 len)
1712 {
1713 	const Keyword	*k = &keywords[0];
1714 	while (k->keyword != 0)
1715 		if (strmatch(k->keyword, str, len)) {
1716 			tok.val = k->refCon;
1717 			return k->token;
1718 		}
1719 		else
1720 			++k;
1721 
1722 	// try for a macro
1723 	map<string,tokListT>::const_iterator	i = defines.find(string(str, len));
1724 	if (i != defines.end()) {
1725 		defIter = i->second.begin();
1726 		defEnd = i->second.end();
1727 		tok = *defIter;
1728 		defIter++;
1729 		return tok.type;
1730 	}
1731 
1732 	// didn't find the identifier as a keyword; try as a Unicode char name
1733 	// NOTE: the names are now sorted (by Unicode name), so we could use a binary
1734 	//  search here if anyone complains about compilation time when using names :)
1735 	const CharName	*c = &gUnicodeNames[0];
1736 	while (c->name != 0)
1737 		if (unicodeNameCompare(c->name, str, len) == 0) {
1738 			tok.val = c->usv;
1739 			return tok_USV;
1740 		}
1741 		else
1742 			++c;
1743 
1744 #ifdef __MWERKS__
1745 	tok.strval.clear();
1746 #else
1747 	tok.strval.erase(tok.strval.begin(), tok.strval.end());
1748 #endif
1749 	while (len-- > 0)
1750 		tok.strval.append(1, *str++);
1751 	return tok_Identifier;
1752 }
1753 
1754 UInt32
getChar()1755 Compiler::getChar()
1756 {
1757 	UInt32	rval = 0;
1758 
1759 	if (ungotten != kInvalidChar) {
1760 		rval = ungotten;
1761 		ungotten = kInvalidChar;
1762 		return rval;
1763 	}
1764 
1765 #define CHECK_AVAIL(x)				\
1766 	if (textPtr + (x) > textEnd) {	\
1767 			textPtr = textEnd;		\
1768 			return kInvalidChar;	\
1769 	}
1770 
1771 	switch (inputForm) {
1772 		case kForm_Bytes:
1773 			rval = *textPtr++;
1774 			break;
1775 
1776 		case kForm_UTF8:
1777 			{
1778 				UInt16 extraBytes = bytesFromUTF8[*textPtr];
1779 				CHECK_AVAIL(extraBytes + 1);
1780 				switch (extraBytes) {	// note: code falls through cases!
1781 					case 5:	rval += *textPtr++; rval <<= 6;
1782 					case 4:	rval += *textPtr++; rval <<= 6;
1783 					case 3:	rval += *textPtr++; rval <<= 6;
1784 					case 2:	rval += *textPtr++; rval <<= 6;
1785 					case 1:	rval += *textPtr++; rval <<= 6;
1786 					case 0:	rval += *textPtr++;
1787 				};
1788 				rval -= offsetsFromUTF8[extraBytes];
1789 			}
1790 			break;
1791 
1792 		case kForm_UTF16BE:
1793 			CHECK_AVAIL(2);
1794 			rval = *textPtr++ << 8;
1795 			rval += *textPtr++;
1796 			if (rval >= kSurrogateHighStart && rval <= kSurrogateHighEnd) {
1797 				// check that 2 more bytes are available
1798 				CHECK_AVAIL(2);
1799 				UInt32	low = *textPtr++ << 8;
1800 				low += *textPtr++;
1801 				rval = ((rval - kSurrogateHighStart) << halfShift) + (low - kSurrogateLowStart) + halfBase;
1802 			}
1803 			break;
1804 
1805 		case kForm_UTF16LE:
1806 			CHECK_AVAIL(2);
1807 			rval = *textPtr++;
1808 			rval += *textPtr++ << 8;
1809 			if (rval >= kSurrogateHighStart && rval <= kSurrogateHighEnd) {
1810 				CHECK_AVAIL(2);
1811 				UInt32	low = *textPtr++;
1812 				low += *textPtr++ << 8;
1813 				rval = ((rval - kSurrogateHighStart) << halfShift) + (low - kSurrogateLowStart) + halfBase;
1814 			}
1815 			break;
1816 
1817 		case kForm_UTF32BE:
1818 			CHECK_AVAIL(4);
1819 			rval = *textPtr++ << 24;
1820 			rval += *textPtr++ << 16;
1821 			rval += *textPtr++ << 8;
1822 			rval += *textPtr++;
1823 			break;
1824 
1825 		case kForm_UTF32LE:
1826 			CHECK_AVAIL(4);
1827 			rval = *textPtr++;
1828 			rval += *textPtr++ << 8;
1829 			rval += *textPtr++ << 16;
1830 			rval += *textPtr++ << 24;
1831 			break;
1832 	}
1833 
1834 	return rval;
1835 }
1836 
1837 void
ungetChar(UInt32 c)1838 Compiler::ungetChar(UInt32 c)
1839 {
1840 	ungotten = c;
1841 }
1842 
1843 bool
GetNextToken()1844 Compiler::GetNextToken()
1845 {
1846 	UInt32	currCh;
1847 
1848 	if (defIter != defEnd) {
1849 		tok = *defIter;
1850 		defIter++;
1851 		return true;
1852 	}
1853 
1854 	if (textPtr == textEnd) {
1855 		++textPtr;
1856 		tok.type = tok_Newline;
1857 		return true;
1858 	}
1859 
1860 	if (textPtr >= textEnd)
1861 		return false;
1862 
1863 	while (true) {
1864 		SkipSpaces();
1865 
1866 		tokStart = textPtr;
1867 
1868 		if (textPtr == textEnd) {
1869 			++textPtr;
1870 			tok.type = tok_Newline;
1871 			++lineNumber;
1872 			return true;
1873 		}
1874 
1875 		if (textPtr > textEnd)
1876 			return false;
1877 
1878 		currCh = getChar();
1879 		switch (currCh) {
1880 			case '\r':
1881 				if (textPtr < textEnd) {
1882 					currCh = getChar();
1883 					if (currCh != '\n')
1884 						ungetChar(currCh);
1885 				}
1886 				tok.type = tok_Newline;
1887 				++lineNumber;
1888 				return true;
1889 
1890 			case '\n':
1891 				if (textPtr < textEnd) {
1892 					currCh = getChar();
1893 					if (currCh != '\r')
1894 						ungetChar(currCh);
1895 				}
1896 				tok.type = tok_Newline;
1897 				++lineNumber;
1898 				return true;
1899 
1900 			case '\\':
1901 				if (textPtr < textEnd) {
1902 					currCh = getChar();
1903 					if (currCh == '\r' || currCh == '\n') {
1904 						if (textPtr < textEnd) {
1905 							UInt32 nextCh = getChar();
1906 							if (!((currCh == '\r' && nextCh == '\n') || (currCh == '\n' && nextCh == '\r')))
1907 								ungetChar(nextCh);
1908 						}
1909 						++lineNumber;
1910 						continue;
1911 					}
1912 					ungetChar(currCh);
1913 				}
1914 				goto DEFAULT;
1915 
1916 			case '"':
1917 			case '\'':
1918 				{
1919 					UInt32	delimiter = currCh;
1920 #ifdef __MWERKS__
1921 					tok.strval.clear();
1922 #else
1923 					tok.strval.erase(tok.strval.begin(), tok.strval.end());
1924 #endif
1925 					while ((textPtr < textEnd) && ((currCh = getChar()) != delimiter) && (currCh != '\r') && (currCh != '\n'))
1926 						tok.strval.append(1, currCh);
1927 					tok.type = tok_String;
1928 					if (currCh == '\r' || currCh == '\n')
1929 						ungetChar(currCh);
1930 				}
1931 				return true;
1932 
1933 			case '^':
1934 			case '(':
1935 			case ')':
1936 			case '[':
1937 			case ']':
1938 			case '{':
1939 			case '}':
1940 			case ',':
1941 			case '+':
1942 			case '*':
1943 			case '?':
1944 			case '>':
1945 			case '#':
1946 			case '|':
1947 			case '/':
1948 			case '=':
1949 			case '@':
1950 				tok.type = (tokenType)currCh;
1951 				return true;
1952 
1953 			case '<':
1954 				tok.type = (tokenType)'<';
1955 				if (textPtr < textEnd) {
1956 					if ((currCh = getChar()) == '>')
1957 						tok.type = tok_Map;
1958 					else
1959 						ungetChar(currCh);
1960 				}
1961 				return true;
1962 
1963 			case '.':
1964 				tok.type = (tokenType)'.';
1965 				if (textPtr < textEnd) {
1966 					if ((currCh = getChar()) == '.')
1967 						tok.type = tok_Ellipsis;
1968 					else
1969 						ungetChar(currCh);
1970 				}
1971 				return true;
1972 
1973 			case '_':
1974 				if (textPtr < textEnd) {
1975 					currCh = getChar();
1976 					ungetChar(currCh);
1977 					if (isIDcont(currCh)) {
1978 						currCh = '_';
1979 						goto DEFAULT;
1980 					}
1981 				}
1982 				tok.type = (tokenType)'_';
1983 				return true;
1984 
1985 			case '0':
1986 				if (textPtr < textEnd) {
1987 					currCh = getChar();
1988 					if (currCh == 'x' || currCh == 'X') {
1989 						tok.type = tok_Number;
1990 						tok.val = 0;
1991 						while (textPtr < textEnd) {
1992 							currCh = getChar();
1993 							if (currCh >= '0' && currCh <= '9')
1994 								tok.val = tok.val * 16 + currCh - '0';
1995 							else if (currCh >= 'a' && currCh <= 'f')
1996 								tok.val = tok.val * 16 + currCh - 'a' + 10;
1997 							else if (currCh >= 'A' && currCh <= 'F')
1998 								tok.val = tok.val * 16 + currCh - 'A' + 10;
1999 							else {
2000 								ungetChar(currCh);
2001 								break;
2002 							}
2003 						}
2004 						return true;
2005 					}
2006 					ungetChar(currCh);
2007 					currCh = '0';
2008 				}
2009 				// else fall through
2010 
2011 			case '1':
2012 			case '2':
2013 			case '3':
2014 			case '4':
2015 			case '5':
2016 			case '6':
2017 			case '7':
2018 			case '8':
2019 			case '9':
2020 				tok.type = tok_Number;
2021 				tok.val = currCh - '0';
2022 				while (textPtr < textEnd) {
2023 					currCh = getChar();
2024 					if (currCh >= '0' && currCh <= '9')
2025 						tok.val = tok.val * 10 + currCh - '0';
2026 					else {
2027 						ungetChar(currCh);
2028 						break;
2029 					}
2030 				}
2031 				return true;
2032 
2033 			case ';':
2034                 {
2035                 	bool	continuation = false;
2036 					while (textPtr < textEnd) {
2037 						continuation = (currCh == '\\');
2038 						currCh = getChar();
2039 						if (currCh == '\r' || currCh == '\n')
2040 							break;
2041 					}
2042 					if (textPtr < textEnd) {
2043 						UInt32	nextCh = getChar();
2044 						if (!((currCh == '\r' && nextCh == '\n') || (currCh == '\n' && nextCh == '\r')))
2045 							ungetChar(nextCh);
2046 					}
2047 					++lineNumber;
2048 					if (continuation)
2049 						continue;
2050 					else {
2051 						tok.type = tok_Newline;
2052 						return true;
2053 					}
2054 	            }
2055 
2056 			case 'U':
2057 				// check for U+xxxx USV
2058 				if (textPtr < textEnd) {
2059 					currCh = getChar();
2060 					if (currCh == '+') {
2061 						tok.type = tok_USV;
2062 						tok.val = 0;
2063 						int	digitCount = 0;
2064 						while (textPtr < textEnd) {
2065 							currCh = getChar();
2066 							if (currCh >= '0' && currCh <= '9')
2067 								tok.val = tok.val * 16 + currCh - '0';
2068 							else if (currCh >= 'a' && currCh <= 'f')
2069 								tok.val = tok.val * 16 + currCh - 'a' + 10;
2070 							else if (currCh >= 'A' && currCh <= 'F')
2071 								tok.val = tok.val * 16 + currCh - 'A' + 10;
2072 							else {
2073 								ungetChar(currCh);
2074 								break;
2075 							}
2076 							++digitCount;
2077 						}
2078 						if (digitCount < 4 || digitCount > 6) {
2079 							Error("Unicode value (U+xxxx) must have 4-6 hex digits");
2080 							tok.val = 0;
2081 						}
2082 						return true;
2083 					}
2084 					else
2085 						ungetChar(currCh);
2086 				}
2087 				currCh = 'U';
2088 				goto DEFAULT;
2089 
2090 				// read an identifier or some other 'unknown' character
2091 			default:
2092 			DEFAULT:
2093 				if (isIDstart(currCh)) {
2094 					idBuffer[0] = currCh;
2095 					tok.val = 1;
2096 					while (textPtr < textEnd) {
2097 						currCh = getChar();
2098 						if (!isIDcont(currCh)) {
2099 							ungetChar(currCh);
2100 							break;
2101 						}
2102 						if (tok.val < 256)
2103 							idBuffer[tok.val++] = currCh;
2104 					}
2105 					tok.type = IDlookup(&idBuffer[0], tok.val);
2106 					return true;
2107 				}
2108 				tok.type = tok_Unknown;
2109 				tok.val = currCh;
2110 				return true;
2111 		}
2112 	}
2113 }
2114 
2115 bool
ExpectToken(tokenType type,const char * errMsg)2116 Compiler::ExpectToken(tokenType type, const char* errMsg)
2117 {
2118 	if (!GetNextToken() || tok.type != type) {
2119 		Error(errMsg);
2120 		return false;
2121 	}
2122 	return true;
2123 }
2124 
2125 void
Error(const char * msg,const char * s,UInt32 line)2126 Compiler::Error(const char* msg, const char* s, UInt32 line)
2127 {
2128 	if (line == 0xffffffff)
2129 		line = lineNumber;
2130 	if (errorFunction == 0) {
2131 		cout << "Error: " << msg;
2132 		if (s != 0)
2133 			cout << ": \"" << s << '"';
2134 		cout << " at line " << line << endl;
2135 	}
2136 	else
2137 		(*errorFunction)(errFuncUserData, (char*)msg, (char*)s, line);
2138 	errorState = true;
2139 	++errorCount;
2140 }
2141 
2142 void
StartDefaultPass()2143 Compiler::StartDefaultPass()
2144 {
2145 	if ((currentPass.passType & 0xFFFF0000) == (FOUR_CHAR_CODE('N','F','_','_') & 0xFFFF0000)) {
2146 		Error("normalization pass cannot contain any other rules");
2147 		currentPass.passType = kCode_Unic;
2148 	}
2149 	if (currentPass.passType == 0) {
2150 		currentPass.clear();	// should already be clear!
2151 		currentPass.passType = kCode_BU;
2152 		currentPass.setLineNo(lineNumber);
2153 	}
2154 }
2155 
2156 void
AppendToRule(const Item & item)2157 Compiler::AppendToRule(const Item& item)
2158 {
2159 	StartDefaultPass();
2160 	switch (ruleState) {
2161 		case notInRule:
2162 			ruleState = inLHSString;
2163 			currentRule.setLineNo(lineNumber);
2164 		case inLHSString:
2165 			currentRule.lhsString.push_back(item);
2166 			break;
2167 		case inLHSPreContext:
2168 			currentRule.lhsPreContext.push_back(item);
2169 			break;
2170 		case inLHSPostContext:
2171 			currentRule.lhsPostContext.push_back(item);
2172 			break;
2173 		case inRHSString:
2174 			currentRule.rhsString.push_back(item);
2175 			break;
2176 		case inRHSPreContext:
2177 			currentRule.rhsPreContext.push_back(item);
2178 			break;
2179 		case inRHSPostContext:
2180 			currentRule.rhsPostContext.push_back(item);
2181 			break;
2182 	}
2183 }
2184 
2185 UInt32
charLimit()2186 Compiler::charLimit()
2187 {
2188 	UInt32	limit;
2189 	switch (ruleState) {
2190 		case inRHSString:
2191 		case inRHSPreContext:
2192 		case inRHSPostContext:
2193 			limit = (currentPass.passType == kCode_BU || currentPass.passType == kCode_Unic ? 0x10ffff : 0xff);
2194 			break;
2195 		default:
2196 			limit = (currentPass.passType == kCode_UB || currentPass.passType == kCode_Unic ? 0x10ffff : 0xff);
2197 			break;
2198 	}
2199 	return limit;
2200 }
2201 
2202 void
AppendLiteral(UInt32 val,bool negate)2203 Compiler::AppendLiteral(UInt32 val, bool negate)
2204 {
2205 	StartDefaultPass();
2206 	if (val > charLimit()) {
2207 		Error("literal value out of range");
2208 		return;
2209 	}
2210 	Item	item;
2211 	item.type = 0;
2212 	item.negate = negate ? 1 : 0;
2213 	item.repeatMin = 0xff;
2214 	item.repeatMax = 0xff;
2215 	item.val = val;
2216 	AppendToRule(item);
2217 }
2218 
2219 void
AppendUSV(UInt32 val,bool negate)2220 Compiler::AppendUSV(UInt32 val, bool negate)
2221 {
2222 	StartDefaultPass();
2223 	if (charLimit() == 0xff) {
2224 		Error("can't use Unicode character in byte encoding");
2225 		return;
2226 	}
2227 	AppendLiteral(val, negate);
2228 }
2229 
2230 void
AppendSpecial(UInt8 type,bool negate)2231 Compiler::AppendSpecial(UInt8 type, bool negate)
2232 {
2233 	Item	item;
2234 	item.type = type;
2235 	item.negate = negate ? 1 : 0;
2236 	item.repeatMin = 0xff;
2237 	item.repeatMax = 0xff;
2238 	item.val = 0;
2239 	item.start = item.next = item.after = item.index = 0xff;
2240 	AppendToRule(item);
2241 }
2242 
2243 void
AppendClass(const string & className,bool negate)2244 Compiler::AppendClass(const string& className, bool negate)
2245 {
2246 	StartDefaultPass();
2247 	Item	item;
2248 	item.type = kMatchElem_Type_Class;
2249 	item.negate = negate ? 1 : 0;
2250 	item.repeatMin = 0xff;
2251 	item.repeatMax = 0xff;
2252 	item.val = 0;
2253 	const map<string,UInt32>*	classNames;
2254 	switch (ruleState) {
2255 		case inRHSString:
2256 		case inRHSPreContext:
2257 		case inRHSPostContext:
2258 			classNames = (currentPass.passType == kCode_Byte || currentPass.passType == kCode_UB)
2259 							? &currentPass.byteClassNames : &currentPass.uniClassNames;
2260 			break;
2261 		default:
2262 			classNames = (currentPass.passType == kCode_Byte || currentPass.passType == kCode_BU)
2263 							? &currentPass.byteClassNames : &currentPass.uniClassNames;
2264 			break;
2265 	}
2266 	map<string,UInt32>::const_iterator	i;
2267 	i = classNames->find(className);
2268 	if (i == classNames->end())
2269 		Error("undefined class", className.c_str());
2270 	else
2271 		item.val = i->second;
2272 	AppendToRule(item);
2273 }
2274 
2275 bool
tagExists(bool rhs,const string & tag)2276 Compiler::tagExists(bool rhs, const string& tag)
2277 {
2278 	if (rhs) {
2279 		if (   (findTag(tag, currentRule.rhsString) != -1)
2280 			|| (findTag(tag, currentRule.rhsPreContext) != -1)
2281 			|| (findTag(tag, currentRule.rhsPostContext) != -1))
2282 			return true;
2283 	}
2284 	else {
2285 		if (   (findTag(tag, currentRule.lhsString) != -1)
2286 			|| (findTag(tag, currentRule.lhsPreContext) != -1)
2287 			|| (findTag(tag, currentRule.lhsPostContext) != -1))
2288 			return true;
2289 	}
2290 	return false;
2291 }
2292 
2293 void
AssignTag(const string & tag)2294 Compiler::AssignTag(const string& tag)
2295 {
2296 	if (currentPass.passType == 0 || ruleState == notInRule) {
2297 		Error("item tag doesn't seem to be attached to a rule item", tag.c_str());
2298 		return;
2299 	}
2300 	Item*	item = NULL;
2301 	switch (ruleState) {
2302 		default:
2303 			Error("this can't happen (AssignTag)");
2304 			return;
2305 		case inLHSString:
2306 			if (tagExists(false, tag))
2307 				break;
2308 			item = &currentRule.lhsString.back();
2309 			break;
2310 		case inLHSPreContext:
2311 			if (tagExists(false, tag))
2312 				break;
2313 			item = &currentRule.lhsPreContext.back();
2314 			break;
2315 		case inLHSPostContext:
2316 			if (tagExists(false, tag))
2317 				break;
2318 			item = &currentRule.lhsPostContext.back();
2319 			break;
2320 		case inRHSString:
2321 			if (tagExists(true, tag))
2322 				break;
2323 			item = &currentRule.rhsString.back();
2324 			break;
2325 		case inRHSPreContext:
2326 			if (tagExists(true, tag))
2327 				break;
2328 			item = &currentRule.rhsPreContext.back();
2329 			break;
2330 		case inRHSPostContext:
2331 			if (tagExists(true, tag))
2332 				break;
2333 			item = &currentRule.rhsPostContext.back();
2334 			break;
2335 	}
2336 	if (item == NULL) {
2337 		Error("duplicate tag (ignored)", tag.c_str());
2338 		return;
2339 	}
2340 	if (item->tag.length() > 0) {
2341 		Error("rule item already has a tag", tag.c_str());
2342 		return;
2343 	}
2344 	switch (item->type) {
2345 		case 0:
2346 		case kMatchElem_Type_Class:
2347 		case kMatchElem_Type_EGroup:
2348 		case kMatchElem_Type_ANY:
2349 		case kMatchElem_Type_Copy:
2350 			item->tag = tag;
2351 			break;
2352 
2353 		default:
2354 			Error("invalid use of item tag", tag.c_str());
2355 			break;
2356 	}
2357 }
2358 
2359 void
SetMinMax(int repeatMin,int repeatMax)2360 Compiler::SetMinMax(int repeatMin, int repeatMax)
2361 {
2362 	Item*	item = 0;
2363 	switch (ruleState) {
2364 		default:
2365 			Error("invalid use of repeat count");
2366 			break;
2367 		case inLHSString:
2368 			item = &currentRule.lhsString.back();
2369 			break;
2370 		case inLHSPreContext:
2371 			item = &currentRule.lhsPreContext.back();
2372 			break;
2373 		case inLHSPostContext:
2374 			item = &currentRule.lhsPostContext.back();
2375 			break;
2376 		case inRHSString:
2377 			item = &currentRule.rhsString.back();
2378 			break;
2379 		case inRHSPreContext:
2380 			item = &currentRule.rhsPreContext.back();
2381 			break;
2382 		case inRHSPostContext:
2383 			item = &currentRule.rhsPostContext.back();
2384 			break;
2385 	}
2386 	if (item) {
2387 		switch (item->type) {
2388 			case 0:
2389 			case kMatchElem_Type_Class:
2390 			case kMatchElem_Type_ANY:
2391 			case kMatchElem_Type_EGroup:
2392 				if (repeatMin > repeatMax || repeatMax < 1 || repeatMax > 15)
2393 					Error("invalid repeat counts (0-15 allowed)");
2394 				else if (item->repeatMin != 0xff)
2395 					Error("multiple repeat counts on item");
2396 				else {
2397 					item->repeatMin = repeatMin;
2398 					item->repeatMax = repeatMax;
2399 				}
2400 				break;
2401 			default:
2402 				Error("invalid use of repeat count");
2403 				break;
2404 		}
2405 	}
2406 }
2407 
2408 void
setGroupPointers(vector<Item>::iterator b,vector<Item>::iterator e,int startIndex,bool isReversed)2409 Compiler::setGroupPointers(vector<Item>::iterator b, vector<Item>::iterator e, int startIndex, bool isReversed)
2410 {
2411 // set up the fwd and back pointers on bgroup/or/egroup
2412 // and propagate repeat counts from egroup to bgroup
2413 	vector<Item>::iterator	base = b;
2414 	vector<Item>::iterator	altStart = startIndex > 0 ? base - 1 : e;
2415 	bool altSeen = false;
2416 	while (b != e) {
2417 		if (b->repeatMin == 0xff)
2418 			b->repeatMin = 1;
2419 		if (b->repeatMax == 0xff)
2420 			b->repeatMax = 1;
2421 		switch (b->type) {
2422 			case 0:	// literal
2423 			case kMatchElem_Type_Class:
2424 			case kMatchElem_Type_ANY:
2425 			case kMatchElem_Type_EOS:
2426 				break;
2427 
2428 			case kMatchElem_Type_OR:
2429 				// if startIndex > 0, then initial altStart will be valid
2430 				if ((startIndex > 0 || altSeen) && (altStart->type == kMatchElem_Type_OR || altStart->type == kMatchElem_Type_BGroup))
2431 					altStart->next = startIndex + (b - base);
2432 				else {
2433 					Error("this can't happen (setGroupPointers 1)");
2434 					return;
2435 				}
2436 				altStart = b;
2437 				altStart->start = startIndex - 1;
2438 				altSeen = true;
2439 				break;
2440 
2441 			case kMatchElem_Type_EGroup:
2442 				Error("this can't happen (setGroupPointers 2)");
2443 				return;
2444 
2445 			case kMatchElem_Type_BGroup:
2446 				{
2447 					// need to find corresponding EGroup and copy repeat counts from there
2448 					// (or vice versa if this is reversed context)
2449 					vector<Item>::iterator subGroupStart = b++;
2450 					subGroupStart->next = 0;
2451 					int	nestingLevel = 0;
2452 					while (b->type != kMatchElem_Type_EGroup || nestingLevel > 0) {
2453 						if (b->type == kMatchElem_Type_BGroup)
2454 							++nestingLevel;
2455 						else if (b->type == kMatchElem_Type_EGroup)
2456 							--nestingLevel;
2457 						++b;
2458 					}
2459 					if (isReversed) {
2460 						b->repeatMin = subGroupStart->repeatMin;
2461 						b->repeatMax = subGroupStart->repeatMax;
2462 					}
2463 					else {
2464 						if (b->repeatMin == 0xff)
2465 							b->repeatMin = 1;
2466 						if (b->repeatMax == 0xff)
2467 							b->repeatMax = 1;
2468 						subGroupStart->repeatMin = b->repeatMin;
2469 						subGroupStart->repeatMax = b->repeatMax;
2470 					}
2471 					setGroupPointers(subGroupStart + 1, b, startIndex + (subGroupStart - base + 1), isReversed);
2472 					subGroupStart->after = startIndex + (b - base + 1);
2473 					b->start = startIndex + (subGroupStart - base);
2474 				}
2475 				break;
2476 		}
2477 		++b;
2478 	}
2479 	if (altSeen)
2480 		altStart->next = startIndex + (b - base);	// set NEXT pointer of last OR
2481 	if (startIndex > 0) {	// we were handling a group, so set pointers of EGroup
2482 		if (b->type == kMatchElem_Type_EGroup)
2483 			b->start = startIndex - 1;
2484 		else {
2485 			Error("this can't happen (setGroupPointers 3)");
2486 			return;
2487 		}
2488 	}
2489 }
2490 
2491 void
setGroupPointers(vector<Rule> & rules)2492 Compiler::setGroupPointers(vector<Rule>& rules)
2493 {
2494 	for (vector<Rule>::iterator i = rules.begin(); i != rules.end(); ++i) {
2495 		setGroupPointers(i->matchStr.begin(), i->matchStr.end(), 0);
2496 		setGroupPointers(i->preContext.begin(), i->preContext.end(), 0, true);
2497 		setGroupPointers(i->postContext.begin(), i->postContext.end(), 0);
2498 	}
2499 }
2500 
2501 int
calcMaxLen(vector<Item>::iterator b,vector<Item>::iterator e)2502 Compiler::calcMaxLen(vector<Item>::iterator b, vector<Item>::iterator e)
2503 {
2504 	int	len = 0;
2505 	int	maxLen = 0;
2506 	while (b != e) {
2507 		switch (b->type) {
2508 			case 0:	// literal
2509 			case kMatchElem_Type_Class:
2510 			case kMatchElem_Type_ANY:
2511 			case kMatchElem_Type_EOS:
2512 				len += b->repeatMax;
2513 				break;
2514 
2515 			case kMatchElem_Type_OR:
2516 				if (len > maxLen)
2517 					maxLen = len;
2518 				len = 0;
2519 				break;
2520 
2521 			case kMatchElem_Type_EGroup:
2522 				Error("this can't happen (calcMaxLen)");
2523 				return 0;
2524 
2525 			case kMatchElem_Type_BGroup:
2526 				{
2527 					// need to find corresponding EGroup
2528 					vector<Item>::iterator subGroupStart = b++;
2529 					int	nestingLevel = 0;
2530 					while (b->type != kMatchElem_Type_EGroup || nestingLevel > 0) {
2531 						if (b->type == kMatchElem_Type_BGroup)
2532 							++nestingLevel;
2533 						else if (b->type == kMatchElem_Type_EGroup)
2534 							--nestingLevel;
2535 						++b;
2536 					}
2537 					len += subGroupStart->repeatMax * calcMaxLen(subGroupStart + 1, b);
2538 				}
2539 				break;
2540 		}
2541 		if (b == e)
2542 			break;	// this can happen with nested groups that end together
2543 		++b;
2544 	}
2545 	if (len > maxLen)
2546 		maxLen = len;
2547 	return maxLen;
2548 }
2549 
2550 int
calcMaxOutLen(Rule & rule)2551 Compiler::calcMaxOutLen(Rule& rule)
2552 {
2553 	int	len = 0;
2554 	for (vector<Item>::const_iterator i = rule.replaceStr.begin(); i != rule.replaceStr.end(); ++i) {
2555 		switch (i->type) {
2556 			case 0:
2557 				++len;
2558 				break;
2559 
2560 			case kMatchElem_Type_Class:
2561 				++len;
2562 				break;
2563 
2564 			case kMatchElem_Type_Copy:
2565 				{
2566 					vector<Item>::iterator	b = rule.matchStr.begin() + i->index;
2567 					if (b->type == kMatchElem_Type_BGroup)
2568 // 2002-10-01 bugfix for "this can't happen (calcMaxLen)" reported by Bob Eaton
2569 // Note that b->after is absolute index, not relative to b
2570 //						len += calcMaxLen(b + 1, b + b->after - 2);
2571 // 2002-03-19 bugfix: added multiplication by b->repeatMax, as engine copies entire repeated group
2572 //					and corrected ending pointer (-1, not -2)
2573 						len += b->repeatMax * calcMaxLen(b + 1, rule.matchStr.begin() + b->after - 1);
2574 //						b->next - i->index;	// FIXME: this is actually an overestimate of the possible length
2575 					else
2576 						len += b->repeatMax;
2577 				}
2578 				continue;
2579 
2580 			default:
2581 				cerr << "bad rep elem type: " << (int)i->type << endl;
2582 				break;
2583 		}
2584 	}
2585 	return len;
2586 }
2587 
2588 void
associateItems(vector<Rule> & rules,bool fromUni,bool toUni)2589 Compiler::associateItems(vector<Rule>& rules, bool fromUni, bool toUni)
2590 {
2591 	/*
2592 	handle associations between match and replacement class/copy items:
2593 		1. any COPY items on LHS need to be exchanged with the corresponding sequence from RHS
2594 		2. any CLASS and COPY items on RHS need their /index/ field set according to the
2595 			corresponding LHS item:
2596 			a.	if /tag/ is non-null, look for matching tag
2597 			b.	else use position, checking that corresponding LHS item is valid
2598 				- note that if LHS item is EGroup, we actually need to point at the corresponding BGroup
2599 	*/
2600 
2601 	for (vector<Rule>::iterator i = rules.begin(); i != rules.end(); ++i) {
2602 		// Deal with COPY items on LHS
2603 		vector<Item>&	m = i->matchStr;
2604 		vector<Item>&	r = i->replaceStr;
2605 		for (UInt32 j = 0; j < m.size(); ++j) {
2606 			if (m[j].type == kMatchElem_Type_Copy) {
2607 				if (m[j].tag.length() == 0) {
2608 					Error("COPY item must have association tag", 0, i->lineNumber);
2609 					continue;
2610 				}
2611 				for (UInt32 k = 0; k < r.size(); ++k) {
2612 					if (r[k].tag == m[j].tag) {
2613 						Item	t = m[j];
2614 						switch (r[k].type) {
2615 							case kMatchElem_Type_EGroup:
2616 								{
2617 									// find corresponding BGroup, and replace m[j] with the entire group
2618 									UInt32	b = k;
2619 									int	nestingLevel = 0;
2620 									while (b != 0) {
2621 										--b;
2622 										if (r[b].type == kMatchElem_Type_EGroup)
2623 											++nestingLevel;
2624 										else if (r[b].type == kMatchElem_Type_BGroup) {
2625 											if (nestingLevel > 0)
2626 												--nestingLevel;
2627 											else
2628 												break;
2629 										}
2630 									}
2631 									if (r[b].type == kMatchElem_Type_BGroup && nestingLevel == 0) {
2632 										m.erase(m.begin() + j);
2633 										m.insert(m.begin() + j, r.begin() + b, r.begin() + k + 1);
2634 										r.erase(r.begin() + b, r.begin() + k + 1);
2635 										r.insert(r.begin() + b, t);
2636 									}
2637 									else {
2638 										Error("can't find complete tagged group for COPY item", r[k].tag.c_str(), i->lineNumber);
2639 										break;
2640 									}
2641 								}
2642 								break;
2643 
2644 							case 0:
2645 							case kMatchElem_Type_Class:
2646 							case kMatchElem_Type_ANY:
2647 								m[j] = r[k];
2648 								r[k] = t;
2649 								break;
2650 
2651 							default:
2652 								Error("invalid COPY item in match", r[k].tag.c_str(), i->lineNumber);
2653 								break;
2654 						}
2655 						break;
2656 					}
2657 				}
2658 			}
2659 		}
2660 
2661 		// Set up associations
2662 		for (vector<Item>::iterator ir = r.begin(); ir != r.end(); ++ir) {
2663 			int	matchIndex = ir->tag.length() > 0 ? findTag(ir->tag, m) : ir - r.begin();
2664 			if (matchIndex == -1) {
2665 				Error("tag not found", ir->tag.c_str(), i->lineNumber);
2666 				continue;
2667 			}
2668 			if (errorCount > 0)
2669 				break;
2670 			ir->index = matchIndex;
2671 			vector<Item>::const_iterator im;
2672 			if (ir->index < m.end() - m.begin())
2673 				im = m.begin() + ir->index;
2674 			else
2675 				im = m.end();
2676 			switch (ir->type) {
2677 				case kMatchElem_Type_Class:
2678 					if (im == m.end()) {
2679 						Error("class in replacement does not have corresponding match item", 0, i->lineNumber);
2680 						break;
2681 					}
2682 					switch (im->type) {
2683 						case kMatchElem_Type_Class:
2684 							{
2685 								// check that class sizes correspond
2686 								Class&	mc = fromUni ? currentPass.uniClassMembers[im->val] : currentPass.byteClassMembers[im->val];
2687 								Class&	rc = toUni   ? currentPass.uniClassMembers[ir->val] : currentPass.byteClassMembers[ir->val];
2688 								if (mc.size() != rc.size())
2689 									Error("class size mismatch", 0, i->lineNumber);
2690 							}
2691 							break;
2692 						default:
2693 							Error("type mismatch for replacement class item", 0, i->lineNumber);
2694 							break;
2695 					}
2696 					break;
2697 
2698 				case kMatchElem_Type_Copy:
2699 					// COPY can correspond to anything except COPY on LHS
2700 					if (im == m.end()) {
2701 						Error("COPY in replacement does not have corresponding match item", 0, i->lineNumber);
2702 						break;
2703 					}
2704 					switch (im->type) {
2705 						case kMatchElem_Type_Copy:
2706 							Error("can't associate COPY elements", 0, i->lineNumber);
2707 							break;
2708 						case kMatchElem_Type_EGroup:
2709 							// change replacement item to point to the BGroup instead
2710 							{
2711 								int	nestingLevel = 0;
2712 								while (1) {
2713 									if (im == m.begin()) {
2714 										Error("this can't happen (associate)", 0, i->lineNumber);
2715 										break;
2716 									}
2717 									--im;
2718 									if (im->type == kMatchElem_Type_EGroup)
2719 										++nestingLevel;
2720 									else if (im->type == kMatchElem_Type_BGroup) {
2721 										if (nestingLevel > 0)
2722 											--nestingLevel;
2723 										else {
2724 											ir->index = im - m.begin();
2725 											break;
2726 										}
2727 									}
2728 								}
2729 							}
2730 							break;
2731 					}
2732 					break;
2733 
2734 				case kMatchElem_Type_ANY:
2735 					// ANY on RHS can only correspond with LITERAL or ANY on LHS
2736 					if (im == m.end()) {
2737 						Error("ANY in replacement does not have corresponding match item", 0, i->lineNumber);
2738 						break;
2739 					}
2740 					switch (im->type) {
2741 						case 0:
2742 						case kMatchElem_Type_ANY:
2743 							break;
2744 						default:
2745 							Error("invalid ANY element in replacement", 0, i->lineNumber);
2746 							break;
2747 					}
2748 					break;
2749 
2750 				case 0:
2751 					break;
2752 			}
2753 		}
2754 
2755 		// filter out stuff on the RHS that doesn't belong
2756 		for (UInt32 f = r.size(); f > 0; ) {
2757 			--f;
2758 			switch (r[f].type) {
2759 				case kMatchElem_Type_EOS:
2760 					Error("can't use EOS in replacement", 0, i->lineNumber);
2761 					break;
2762 				case kMatchElem_Type_BGroup:
2763 				case kMatchElem_Type_EGroup:
2764 				case kMatchElem_Type_OR:
2765 					r.erase(r.begin() + f);
2766 					break;
2767 			}
2768 		}
2769 
2770 		if (errorCount > 0)
2771 			break;
2772 	}
2773 }
2774 
2775 int
findTag(const string & tag,const vector<Item> & str)2776 Compiler::findTag(const string& tag, const vector<Item>& str)
2777 {
2778 	for (vector<Item>::const_iterator i = str.begin(); i != str.end(); ++i)
2779 		if (i->tag == tag)
2780 			return i - str.begin();
2781 	return -1;
2782 }
2783 
2784 int
ruleKeyComp(const Rule & a,const Rule & b)2785 Compiler::ruleKeyComp(const Rule& a, const Rule& b)
2786 {
2787 	if (a.sortKey > b.sortKey)	// higher sortKey comes first
2788 		return -1;
2789 	else if (a.sortKey < b.sortKey)
2790 		return 1;
2791 	else if (a.lineNumber < b.lineNumber)	// lower line number comes first
2792 		return -1;
2793 	else if (a.lineNumber > b.lineNumber)
2794 		return 1;
2795 	else
2796 		return 0;
2797 }
2798 
2799 void
sortRules(vector<Rule> & rules)2800 Compiler::sortRules(vector<Rule>& rules)
2801 {
2802 	// calculate sort keys based on match string length and context length
2803 	for (vector<Rule>::iterator i = rules.begin(); i != rules.end(); ++i) {
2804 		int	maxMatch = calcMaxLen(i->matchStr.begin(), i->matchStr.end());
2805 		int	maxPre = calcMaxLen(i->preContext.begin(), i->preContext.end());
2806 		int	maxPost = calcMaxLen(i->postContext.begin(), i->postContext.end());
2807 		if (maxMatch + maxPre + maxPost > 255)
2808 			Error("rule too long", 0, i->lineNumber);
2809 		i->sortKey = (maxMatch << 8) + maxPre + maxPost;
2810 		if (maxMatch > buildVars.maxMatch)
2811 			buildVars.maxMatch = maxMatch;
2812 		if (maxPre > buildVars.maxPre)
2813 			buildVars.maxPre = maxPre;
2814 		if (maxPost > buildVars.maxPost)
2815 			buildVars.maxPost = maxPost;
2816 
2817 		// calculate maxOutput and track highest value
2818 		int	maxOutput = calcMaxOutLen(*i);
2819 		if (maxOutput > 255)
2820 			Error("output too long", 0, i->lineNumber);
2821 		if (maxOutput > buildVars.maxOutput)
2822 			buildVars.maxOutput = maxOutput;
2823 	}
2824 #if 0 // __MWERKS__
2825 	sort(rules.begin(), rules.end(), ruleKeyComp);
2826 #else
2827 	// std library /sort/ crashes in Project Builder version, so do a bubble sort ourselves instead
2828 /*
2829 	for (vector<Rule>::iterator a = rules.begin(); a != rules.end(); ++a)
2830 		for (vector<Rule>::iterator b = rules.end() - 1; b != a; --b)
2831 			if (ruleKeyComp(*(b - 1), *b) > 0)
2832 				swap(*b, *(b - 1));
2833 */
2834 	// bubble-sorting the actual rules is really expensive; create an index and sort that instead
2835 	vector<UInt32>	ruleIndex;
2836 	for (UInt32 i = 0; i < rules.size(); ++i)
2837 		ruleIndex.push_back(i);
2838 	for (vector<UInt32>::iterator a = ruleIndex.begin(); a != ruleIndex.end(); ++a)
2839 		for (vector<UInt32>::iterator b = ruleIndex.end() - 1; b != a; --b)
2840 			if (ruleKeyComp(rules[*(b - 1)], rules[*b]) > 0)
2841 				swap(*b, *(b - 1));
2842 	vector<Rule>	sortedRules;
2843 	for (vector<UInt32>::iterator s = ruleIndex.begin(); s != ruleIndex.end(); ++s)
2844 		sortedRules.push_back(rules[*s]);
2845 	rules = sortedRules;
2846 #endif
2847 }
2848 
2849 bool
findInitialItems(const Rule & rule,vector<Item>::const_iterator b,vector<Item>::const_iterator e,vector<Item> & initialItems)2850 Compiler::findInitialItems(const Rule& rule, vector<Item>::const_iterator b, vector<Item>::const_iterator e, vector<Item>& initialItems)
2851 {
2852 // return true if we find a non-optional item, false if we could match null string
2853 	while (b != e) {
2854 		switch (b->type) {
2855 			case 0:
2856 			case kMatchElem_Type_Class:
2857 			case kMatchElem_Type_ANY:
2858 			case kMatchElem_Type_EOS:
2859 				initialItems.push_back(*b);
2860 				if (b->repeatMin > 0)
2861 					return true;
2862 				++b;
2863 				break;
2864 
2865 			case kMatchElem_Type_BGroup:
2866 				{
2867 					vector<Item>::const_iterator	groupStart = b;
2868 					vector<Item>::const_iterator	altStart = b + 1;
2869 					int		nestingLevel = 0;
2870 					bool	optional = false;
2871 					while (++b != e) {
2872 						switch (b->type) {
2873 							case kMatchElem_Type_BGroup:
2874 								++nestingLevel;
2875 								break;
2876 							case kMatchElem_Type_OR:
2877 								if (nestingLevel == 0) {
2878 									if (!findInitialItems(rule, altStart, b, initialItems))
2879 										optional = true;
2880 									altStart = b + 1;
2881 								}
2882 								break;
2883 							case kMatchElem_Type_EGroup:
2884 								if (nestingLevel == 0) {
2885 									if (!findInitialItems(rule, altStart, b, initialItems))
2886 										optional = true;
2887 								}
2888 								--nestingLevel;
2889 								break;
2890 						}
2891 						if (nestingLevel < 0)
2892 							break;
2893 					}
2894 					if (!optional && groupStart->repeatMin > 0)
2895 						return true;
2896 					++b;
2897 				}
2898 				break;
2899 
2900 			case kMatchElem_Type_Copy:
2901 				Error("can't use copy item (@tag) on match side of rule", 0, rule.lineNumber);
2902 				++b;
2903 				break;
2904 
2905 			default:
2906 				Error("this can't happen (findInitialItems)", 0, rule.lineNumber);
2907 				++b;
2908 				break;
2909 		}
2910 	}
2911 	return false;
2912 }
2913 
2914 void
findInitialItems(const Rule & rule,vector<Item> & initialItems)2915 Compiler::findInitialItems(const Rule& rule, vector<Item>& initialItems)
2916 {
2917 	bool	foundNonOpt = false;
2918 	if (rule.matchStr.size() > 0)
2919 		foundNonOpt = findInitialItems(rule, rule.matchStr.begin(), rule.matchStr.end(), initialItems);
2920 	if (!foundNonOpt && rule.postContext.size() > 0)
2921 		foundNonOpt = findInitialItems(rule, rule.postContext.begin(), rule.postContext.end(), initialItems);
2922 	if (!foundNonOpt)
2923 		Error("rule must have non-null match string or post-context", 0, rule.lineNumber);
2924 }
2925 
2926 long
classIndex(UInt32 charCode,const Class & classMembers)2927 Compiler::classIndex(UInt32 charCode, const Class& classMembers)
2928 {
2929 	for (Class::const_iterator i = classMembers.begin(); i != classMembers.end(); ++i)
2930 		if (*i == charCode)
2931 			return i - classMembers.begin();
2932 	return -1;
2933 }
2934 
2935 bool
isSingleCharRule(const Rule & rule)2936 Compiler::isSingleCharRule(const Rule& rule)
2937 {
2938 	if (rule.preContext.size() == 0 && rule.postContext.size() == 0 && rule.matchStr.size() == 1) {
2939 		const Item&	item = rule.matchStr.front();
2940 		if (item.repeatMin == 1 && item.repeatMax == 1)
2941 			if (item.type == 0 || item.type == kMatchElem_Type_Class || item.type == kMatchElem_Type_ANY)
2942 				return true;
2943 	}
2944 	return false;
2945 }
2946 
2947 void
appendMatchElem(string & packedRule,Item & item,int index,vector<MatClass> & matchClasses)2948 Compiler::appendMatchElem(string& packedRule, Item& item, int index,
2949 							vector<MatClass>& matchClasses)
2950 {
2951 	MatchElem	m;
2952 	WRITE(m.value.usv.data, 0);
2953 	WRITE(m.flags.repeat, (item.repeatMin << 4) + item.repeatMax);
2954 	if (item.negate)
2955 		WRITE(m.flags.type, kMatchElem_Negate);
2956 	else
2957 		WRITE(m.flags.type, 0);
2958 	switch (item.type) {
2959 		case 0:
2960 			WRITE(m.value.usv.data, READ(m.value.usv.data) | item.val);
2961 			break;
2962 
2963 		case kMatchElem_Type_Class:
2964 			{
2965 				WRITE(m.flags.type, READ(m.flags.type) | (kMatchElem_NonLit + kMatchElem_Type_Class));
2966 				UInt32 i;
2967 				for (i = 0; i < matchClasses.size(); ++i)
2968 					if (matchClasses[i].membersClass == item.val)
2969 						break;
2970 				if (i == matchClasses.size())
2971 					matchClasses.push_back(MatClass(item.val));
2972 				WRITE(m.value.cls.index, i);
2973 			}
2974 			break;
2975 
2976 		case kMatchElem_Type_BGroup:
2977 			WRITE(m.flags.type, READ(m.flags.type) | (kMatchElem_NonLit + kMatchElem_Type_BGroup));
2978 			WRITE(m.value.bgroup.dNext, item.next - index);
2979 			WRITE(m.value.bgroup.dAfter, item.after - index);
2980 			break;
2981 
2982 		case kMatchElem_Type_EGroup:
2983 			WRITE(m.flags.type, READ(m.flags.type) | (kMatchElem_NonLit + kMatchElem_Type_EGroup));
2984 			WRITE(m.value.egroup.dStart, index - item.start);
2985 			break;
2986 
2987 		case kMatchElem_Type_OR:
2988 			WRITE(m.flags.type, READ(m.flags.type) | (kMatchElem_NonLit + kMatchElem_Type_OR));
2989 			WRITE(m.value.egroup.dNext, item.next - index);
2990 			WRITE(m.value.egroup.dStart, index - item.start);
2991 			break;
2992 
2993 		case kMatchElem_Type_ANY:
2994 			WRITE(m.flags.type, READ(m.flags.type) | (kMatchElem_NonLit + kMatchElem_Type_ANY));
2995 			break;
2996 
2997 		case kMatchElem_Type_EOS:
2998 			WRITE(m.flags.type, READ(m.flags.type) | (kMatchElem_NonLit + kMatchElem_Type_EOS));
2999 			break;
3000 	}
3001 	const char*	p = (const char*)&m;
3002 	packedRule.append(p, sizeof(m));
3003 }
3004 
3005 void
appendReplaceElem(string & packedRule,Item & item,vector<Item> & matchStr,vector<RepClass> & repClasses)3006 Compiler::appendReplaceElem(string& packedRule, Item& item, vector<Item>& matchStr, vector<RepClass>& repClasses)
3007 {
3008 	RepElem	r;
3009 	WRITE(r.value, 0);
3010 	switch (item.type) {
3011 		case 0:
3012 			WRITE(r.value, item.val);
3013 			break;
3014 
3015 		case kRepElem_Class:
3016 			{
3017 				WRITE(r.flags.type, kRepElem_Class);
3018 				WRITE(r.flags.matchIndex, item.index);
3019 				Item&	mItem = matchStr[item.index];
3020 				if (mItem.type != kMatchElem_Type_Class) {
3021 					cerr << "this can't happen (appendReplaceElem)\n";
3022 					exit(1);
3023 				}
3024 				UInt32 i;
3025 				for (i = 0; i < repClasses.size(); ++i)
3026 					if (repClasses[i].membersClass == item.val && repClasses[i].sortLikeClass == mItem.val)
3027 						break;
3028 				if (i == repClasses.size())
3029 					repClasses.push_back(RepClass(item.val, mItem.val));
3030 				WRITE(r.flags.repClass, i);
3031 			}
3032 			break;
3033 
3034 		case kRepElem_Copy:
3035 			WRITE(r.flags.type, kRepElem_Copy);
3036 			WRITE(r.flags.matchIndex, item.index);
3037 			break;
3038 
3039 		case kRepElem_Unmapped:
3040 			WRITE(r.flags.type, kRepElem_Unmapped);
3041 			break;
3042 	}
3043 	const char*	p = (const char*)&r;
3044 	packedRule.append(p, sizeof(r));
3045 }
3046 
3047 vector<Compiler::Item>
reverseContext(const vector<Item> & ctx)3048 Compiler::reverseContext(const vector<Item>& ctx)
3049 {
3050 	vector<Item>	rval;
3051 	for (vector<Item>::const_iterator i = ctx.begin(); i != ctx.end(); ++i) {
3052 		rval.insert(rval.begin(), *i);
3053 		switch (i->type) {
3054 			case kMatchElem_Type_BGroup:
3055 				rval.front().type = kMatchElem_Type_EGroup;
3056 				break;
3057 
3058 			case kMatchElem_Type_EGroup:
3059 				rval.front().type = kMatchElem_Type_BGroup;
3060 				break;
3061 		}
3062 	}
3063 	return rval;
3064 }
3065 
3066 void
addToCharMap(UInt32 ch,UInt16 index)3067 Compiler::addToCharMap(UInt32 ch, UInt16 index)
3068 {
3069 	UInt8	plane = ch >> 16;
3070 	UInt8	page = (ch & 0x00ffff) >> 8;
3071 	if (buildVars.planeMap.size() <= plane)
3072 		buildVars.planeMap.resize(plane + 1, 0xff);
3073 	if ((UInt8)buildVars.planeMap[plane] == (UInt8)0xff) {
3074 		buildVars.planeMap[plane] = buildVars.pageMaps.size();
3075 		buildVars.pageMaps.resize(buildVars.pageMaps.size() + 1);
3076 		buildVars.pageMaps.back().resize(256, 0xff);
3077 	}
3078 	UInt8	planeIndex = buildVars.planeMap[plane];
3079 	string&	pageMap = buildVars.pageMaps[planeIndex];
3080 	if ((UInt8)pageMap[page] == (UInt8)0xff) {
3081 		pageMap[page] = buildVars.charMaps.size();
3082 		buildVars.charMaps.resize(buildVars.charMaps.size() + 1);
3083 		buildVars.charMaps.back().resize(256);
3084 	}
3085 	vector<UInt16>&	charMap = buildVars.charMaps[(UInt8)pageMap[page]];
3086 	charMap[ch & 0x0000ff] = index;
3087 }
3088 
3089 void
align(string & table,int alignment)3090 Compiler::align(string& table, int alignment)
3091 {
3092 	int	remainder = table.size() % alignment;
3093 	if (remainder != 0)
3094 		table.resize(table.size() + alignment - remainder);
3095 }
3096 
3097 class Member {
3098 public:
Member(UInt32 v,UInt32 k)3099 			Member(UInt32 v, UInt32 k)
3100 				: value(v), key(k)
3101 					{ }
3102 	UInt32	value;
3103 	UInt32	key;
operator <(const Member & rhs) const3104 	bool	operator<(const Member& rhs) const
3105 					{ return key < rhs.key; }
3106 };
3107 
3108 void
buildTable(vector<Rule> & rules,bool fromUni,bool toUni,string & table)3109 Compiler::buildTable(vector<Rule>& rules, bool fromUni, bool toUni, string& table)
3110 {
3111 	TableHeader	th;
3112 	WRITE(th.type, fromUni ? (toUni ? kTableType_UU : kTableType_UB) : (toUni ? kTableType_BU : kTableType_BB));
3113 	WRITE(th.version, kCurrentTableVersion);
3114 	WRITE(th.length, 0);
3115 	WRITE(th.flags, 0);
3116 	WRITE(th.pageBase, 0);
3117 	WRITE(th.lookupBase, 0);
3118 	WRITE(th.matchClassBase, 0);
3119 	WRITE(th.repClassBase, 0);
3120 	WRITE(th.stringListBase, 0);
3121 	WRITE(th.stringRuleData, 0);
3122 	WRITE(th.maxMatch, buildVars.maxMatch);
3123 	WRITE(th.maxPre, buildVars.maxPre);
3124 	WRITE(th.maxPost, buildVars.maxPost);
3125 	WRITE(th.maxOutput, buildVars.maxOutput);
3126 	WRITE(th.replacementChar, toUni ? currentPass.uniDefault : currentPass.byteDefault);
3127 
3128 	map<UInt32,UInt16>			charToIndex;
3129 	map<UInt16,UInt32>			indexToChar;
3130 	vector< vector<UInt32> >	rulesForIndex;
3131 	vector<UInt32>				stringRuleLists;
3132 	string						stringRuleData;
3133 	vector<MatClass>			matchClasses;
3134 	vector<RepClass>			repClasses;
3135 #if 0
3136 	UInt32						terminatorRule = 0;
3137 	bool						terminatorCreated = false;
3138 #endif
3139 
3140 	if (fromUni) {
3141 		rulesForIndex.resize(1);	// index 0 will be unmapped char
3142 		for (UInt32 i = 0; i != rules.size(); ++i) {
3143 			vector<Item>	initialItems;
3144 			findInitialItems(rules[i], initialItems);
3145 			for (vector<Item>::iterator j = initialItems.begin(); j != initialItems.end(); ++j) {
3146 				UInt32	index;
3147 				switch (j->type) {
3148 					case 0:	// literal
3149 						if (j->negate) {
3150 							Error("can't start with negated Unicode literal", 0, rules[i].lineNumber);
3151 							break;
3152 						}
3153 						if (charToIndex.find(j->val) == charToIndex.end()) {
3154 							// add char to charToIndex mapping
3155 							charToIndex[j->val] = rulesForIndex.size();
3156 							indexToChar[rulesForIndex.size()] = j->val;
3157 							rulesForIndex.resize(rulesForIndex.size() + 1);
3158 							addToCharMap(j->val, charToIndex[j->val]);
3159 						}
3160 						index = charToIndex[j->val];
3161 						if (rulesForIndex[index].size() == 0 || rulesForIndex[index].back() != i) {
3162 							rulesForIndex[index].push_back(i);
3163 							if (rulesForIndex[index].size() == 0x3ff) {
3164 								Error("too many matches with same initial character", 0, rules[i].lineNumber);
3165 							}
3166 						}
3167 						break;
3168 
3169 					case kMatchElem_Type_Class:
3170 						{
3171 							if (j->negate) {
3172 								Error("can't start with negated Unicode class", 0, rules[i].lineNumber);
3173 								break;
3174 							}
3175 							Class&	uc = currentPass.uniClassMembers[j->val];
3176 							Class::const_iterator u;
3177 							for (u = uc.begin(); u != uc.end(); ++u) {
3178 								if (charToIndex.find(*u) == charToIndex.end()) {
3179 									charToIndex[*u] = rulesForIndex.size();
3180 									indexToChar[rulesForIndex.size()] = *u;
3181 									rulesForIndex.resize(rulesForIndex.size() + 1);
3182 									addToCharMap(*u, charToIndex[*u]);
3183 								}
3184 								index = charToIndex[*u];
3185 								if (rulesForIndex[index].size() == 0 || rulesForIndex[index].back() != i)
3186 									rulesForIndex[index].push_back(i);
3187 							}
3188 						}
3189 						break;
3190 
3191 					case kMatchElem_Type_ANY:
3192 					case kMatchElem_Type_EOS:
3193 						Error("can't start with ANY or EOS in Unicode match", 0, rules[i].lineNumber);
3194 						break;
3195 
3196 					default:
3197 						Error("this can't happen (buildTable 1)");
3198 						break;
3199 				}
3200 			}
3201 		}
3202 	}
3203 	else {
3204 		rulesForIndex.resize(256);
3205 		for (UInt32 i = 0; i != rules.size(); ++i) {
3206 			vector<Item>	initialItems;
3207 			findInitialItems(rules[i], initialItems);
3208 			for (vector<Item>::iterator j = initialItems.begin(); j != initialItems.end(); ++j) {
3209 				unsigned int c;
3210 				switch (j->type) {
3211 					case 0:	// literal
3212 						if (j->negate) {
3213 							// add rule to every char except the literal!
3214 							for (c = 0; c < 256; ++c)
3215 								if (c != j->val)
3216 									if (rulesForIndex[c].size() == 0 || rulesForIndex[c].back() != i)
3217 										rulesForIndex[c].push_back(i);
3218 						}
3219 						else
3220 							if (rulesForIndex[j->val].size() == 0 || rulesForIndex[j->val].back() != i)
3221 								rulesForIndex[j->val].push_back(i);
3222 						break;
3223 
3224 					case kMatchElem_Type_Class:
3225 						{
3226 							Class&	bc = currentPass.byteClassMembers[j->val];
3227 							for (c = 0; c < 256; ++c)
3228 								if ((classIndex(c, bc) != -1) != j->negate)
3229 									if (rulesForIndex[c].size() == 0 || rulesForIndex[c].back() != i)
3230 										rulesForIndex[c].push_back(i);
3231 						}
3232 						break;
3233 
3234 					case kMatchElem_Type_ANY:
3235 					case kMatchElem_Type_EOS:
3236 						if (j->negate) {
3237 							Error("rule can't start with negated ANY or EOS", 0, rules[i].lineNumber);
3238 							break;
3239 						}
3240 						for (c = 0; c < 256; ++c)
3241 							if (rulesForIndex[c].size() == 0 || rulesForIndex[c].back() != i)
3242 								rulesForIndex[c].push_back(i);
3243 						break;
3244 
3245 					default:
3246 						Error("this can't happen (buildTable 2)");
3247 						break;
3248 				}
3249 			}
3250 		}
3251 	}
3252 
3253 	vector<Lookup>	lookup;
3254 	lookup.resize(rulesForIndex.size());
3255 
3256 	UInt32 i;
3257 	for (i = 0; errorCount == 0 && i < rulesForIndex.size(); ++i) {
3258 		WRITE(lookup[i].usv, 0);	// initialize the lookup to all zero bits
3259 
3260 		if (rulesForIndex[i].size() == 0) {
3261 			WRITE(lookup[i].rules.type, kLookupType_Unmapped);
3262 			continue;
3263 		}
3264 
3265 		if (rulesForIndex[i].size() == 1) {
3266 			const Rule&	rule = rules[rulesForIndex[i].front()];
3267 			if (isSingleCharRule(rule)) {
3268 				if (toUni) {
3269 					// mapping to Unicode: direct lookup can only support one Unicode character
3270 					if (rule.replaceStr.size() == 1) {
3271 						const Item&	rep = rule.replaceStr.front();
3272 						int	t = rep.tag.length() > 0 ? findTag(rep.tag, rule.matchStr) : 0;
3273 						switch (rep.type) {
3274 							case 0:
3275 								WRITE(lookup[i].usv, rep.val);
3276 								continue;
3277 
3278 							case kMatchElem_Type_Class:
3279 								{
3280 									if (t == -1) {
3281 										Error("tag not found", rep.tag.c_str(), rule.lineNumber);
3282 										continue;
3283 									}
3284 									const Item&	mat = rule.matchStr[t];
3285 									if (mat.type != kMatchElem_Type_Class) {
3286 										Error("improper use of class as target of mapping", 0, rule.lineNumber);
3287 										continue;
3288 									}
3289 									Class&	rc = currentPass.uniClassMembers[rep.val];
3290 									if (fromUni) {
3291 										Class&	mc = currentPass.uniClassMembers[mat.val];
3292 										if (mc.size() != rc.size()) {
3293 											Error("class size mismatch", 0, rule.lineNumber);
3294 											continue;
3295 										}
3296 										WRITE(lookup[i].usv, rc[classIndex(indexToChar[i], mc)]);
3297 									}
3298 									else {
3299 										Class&	mc = currentPass.byteClassMembers[mat.val];
3300 										if (mc.size() != rc.size()) {
3301 											Error("class size mismatch", 0, rule.lineNumber);
3302 											continue;
3303 										}
3304 										WRITE(lookup[i].usv, rc[classIndex(i, mc)]);
3305 									}
3306 								}
3307 								continue;
3308 
3309 							case kMatchElem_Type_Copy:
3310 								// should only occur in UU table
3311 								if (t > (int)rule.matchStr.size()) {
3312 									Error("no corresponding item for copy", 0, rule.lineNumber);
3313 									goto ERR_FOUND;
3314 								}
3315 								WRITE(lookup[i].usv, indexToChar[i]);
3316 								continue;
3317 
3318 							default:
3319 								goto STRING_RULE_NEEDED;
3320 						}
3321 					}
3322 				}
3323 				else {
3324 					// mapping to bytes: we can put up to three bytes into the direct lookup
3325 					if (rule.replaceStr.size() <= 3) {
3326 						WRITE(lookup[i].bytes.count, rule.replaceStr.size());
3327 							// this will get overwritten by lookup[i].rules.type if string rules turn out to be needed
3328 						UInt32 j;
3329 						for (j = 0; j < rule.replaceStr.size(); ++j) {
3330 							const Item&	rep = rule.replaceStr[j];
3331 							int	t = rep.tag.length() > 0 ? findTag(rep.tag, rule.matchStr) : j;
3332 							if (t == -1) {
3333 								Error("tag not found", rep.tag.c_str(), rule.lineNumber);
3334 								goto ERR_FOUND;
3335 							}
3336 							switch (rep.type) {
3337 								case 0:	// literal
3338 									WRITE(lookup[i].bytes.data[j], rep.val);
3339 									break;
3340 
3341 								case kMatchElem_Type_Class:
3342 									if (t > (int)rule.matchStr.size()) {
3343 										Error("no corresponding item for class replacement", 0, rule.lineNumber);
3344 										goto ERR_FOUND;
3345 									}
3346 									else {
3347 										const Item&	mat = rule.matchStr[t];
3348 										if (mat.type != kMatchElem_Type_Class) {
3349 											Error("improper use of class as target of mapping", 0, rule.lineNumber);
3350 											goto ERR_FOUND;
3351 										}
3352 										Class&	rc = currentPass.byteClassMembers[rep.val];
3353 										if (fromUni) {
3354 											Class&	mc = currentPass.uniClassMembers[mat.val];
3355 											if (mc.size() != rc.size()) {
3356 												Error("class size mismatch", 0, rule.lineNumber);
3357 												goto ERR_FOUND;
3358 											}
3359 											WRITE(lookup[i].bytes.data[j], rc[classIndex(indexToChar[i], mc)]);
3360 										}
3361 										else {
3362 											Class&	mc = currentPass.byteClassMembers[mat.val];
3363 											if (mc.size() != rc.size()) {
3364 												Error("class size mismatch", 0, rule.lineNumber);
3365 												goto ERR_FOUND;
3366 											}
3367 											WRITE(lookup[i].bytes.data[j], rc[classIndex(i, mc)]);
3368 										}
3369 									}
3370 									break;
3371 
3372 								case kMatchElem_Type_Copy:
3373 									// should only occur in BB table
3374 									if (t > (int)rule.matchStr.size()) {
3375 										Error("no corresponding item for copy", 0, rule.lineNumber);
3376 										goto ERR_FOUND;
3377 									}
3378 									WRITE(lookup[i].bytes.data[j], i);
3379 									break;
3380 
3381 								default:
3382 									goto STRING_RULE_NEEDED;
3383 							}
3384 						}
3385 					ERR_FOUND:
3386 						continue;
3387 					}
3388 				}
3389 			}
3390 		}
3391 
3392 	STRING_RULE_NEEDED:
3393 #if 0
3394 		// decide whether we need to add a default terminating rule
3395 		const Rule&	finalRule = rules[rulesForIndex[i].back()];
3396 		if (!isSingleCharRule(finalRule)) {
3397 			if (!terminatorCreated) {
3398 				currentRule.clear();
3399 				Item	item;
3400 				item.type = kMatchElem_Type_ANY;
3401 				item.negate = 0;
3402 				item.repeatMin = 1;
3403 				item.repeatMax = 1;
3404 				item.val = 0;
3405 				currentRule.lhsString.push_back(item);
3406 				item.type = kRepElem_Unmapped;
3407 				currentRule.rhsString.push_back(item);
3408 				terminatorRule = rules.size();
3409 				rules.push_back(Rule(currentRule.lhsString,
3410 						currentRule.lhsPreContext, currentRule.lhsPostContext,
3411 						currentRule.rhsString, finalRule.lineNumber));
3412 				terminatorCreated = true;
3413 				currentRule.clear();
3414 			}
3415 			rulesForIndex[i].push_back(terminatorRule);
3416 		}
3417 #endif
3418 		// set the Lookup fields
3419 		if (rulesForIndex[i].size() > 255) {
3420 			WRITE(lookup[i].rules.type, kLookupType_ExtStringRules + (rulesForIndex[i].size() >> 8));
3421 			WRITE(lookup[i].rules.ruleCount, rulesForIndex[i].size() & 0xff);
3422 			usedExtStringRules = true;
3423 		}
3424 		else {
3425 			WRITE(lookup[i].rules.type, kLookupType_StringRules);
3426 			WRITE(lookup[i].rules.ruleCount, rulesForIndex[i].size());
3427 		}
3428 		WRITE(lookup[i].rules.ruleIndex, stringRuleLists.size());
3429 
3430 		// construct the rule list
3431 		if (errorCount == 0) {
3432 			for (unsigned int j = 0; j < rulesForIndex[i].size(); ++j) {
3433 				Rule&	r = rules[rulesForIndex[i][j]];
3434 				if (r.offset == kInvalidRuleOffset) {
3435 					r.offset = stringRuleData.size();
3436 					stringRuleData.append(1, r.matchStr.size());
3437 					stringRuleData.append(1, r.postContext.size());
3438 					stringRuleData.append(1, r.preContext.size());
3439 					stringRuleData.append(1, r.replaceStr.size());
3440 					unsigned int k;
3441 					for (k = 0; k < r.matchStr.size(); ++k)
3442 						appendMatchElem(stringRuleData, r.matchStr[k], k, matchClasses);
3443 					for (k = 0; k < r.postContext.size(); ++k)
3444 						appendMatchElem(stringRuleData, r.postContext[k], k, matchClasses);
3445 					for (k = 0; k < r.preContext.size(); ++k)
3446 						appendMatchElem(stringRuleData, r.preContext[k], k, matchClasses);
3447 					for (k = 0; k < r.replaceStr.size(); ++k)
3448 						appendReplaceElem(stringRuleData, r.replaceStr[k], r.matchStr, repClasses);
3449 				}
3450 				stringRuleLists.push_back(r.offset);
3451 			}
3452 		}
3453 	}
3454 
3455 	if (fromUni && buildVars.planeMap.size() > 1)
3456 		currentPass.supplementaryChars = true;
3457 
3458 	UInt32	headerOffset = table.size();
3459 	table.append((const char*)&th, sizeof(th));
3460 
3461 	if (fromUni) {
3462 		WRITE(th.pageBase, table.size());
3463 		UInt32	i, j;
3464 		if (currentPass.supplementaryChars) {
3465 			buildVars.planeMap.resize(17, 0xff);
3466 			for (i = 0; i < buildVars.planeMap.size(); ++i)
3467 				appendToTable(table, buildVars.planeMap[i]);
3468 			appendToTable(table, (UInt8)buildVars.pageMaps.size());
3469 			align(table, 4);
3470 		}
3471 
3472 		for (i = 0; i < buildVars.pageMaps.size(); ++i)
3473 			for (j = 0; j < buildVars.pageMaps[i].size(); ++j)
3474 				appendToTable(table, (UInt8)buildVars.pageMaps[i][j]);
3475 		align(table, 4);
3476 
3477 		for (i = 0; i < buildVars.charMaps.size(); ++i)
3478 			for (j = 0; j < buildVars.charMaps[i].size(); ++j)
3479 				appendToTable(table, buildVars.charMaps[i][j]);
3480 		align(table, 4);
3481 	}
3482 
3483 	WRITE(th.lookupBase, table.size());
3484 	for (i = 0; i < lookup.size(); ++i)
3485 		appendToTable(table, READ(lookup[i].usv));
3486 	align(table, 4);
3487 
3488 	WRITE(th.stringListBase, table.size());
3489 	for (i = 0; i < stringRuleLists.size(); ++i)
3490 		appendToTable(table, stringRuleLists[i]);
3491 	align(table, 4);
3492 
3493 	WRITE(th.stringRuleData, table.size());
3494 	for (i = 0; i < stringRuleData.size(); ++i)
3495 		appendToTable(table, stringRuleData[i]);
3496 	align(table, 4);
3497 
3498 	// sort and output the match classes
3499 	{
3500 		WRITE(th.matchClassBase, table.size() - headerOffset);
3501 		vector<UInt32>	classOffsets;
3502 		classOffsets.resize(matchClasses.size());
3503 		UInt32	classOffset = matchClasses.size() * sizeof(UInt32);
3504 		string	classes;
3505 
3506 		for (i = 0; i < matchClasses.size(); ++i) {
3507 			classOffsets[i] = classOffset + classes.size();
3508 			Class	sortedClass = fromUni
3509 					? currentPass.uniClassMembers[matchClasses[i].membersClass]
3510 					: currentPass.byteClassMembers[matchClasses[i].membersClass];
3511 
3512 			if (sortedClass.size() > 0) {
3513 				sort(sortedClass.begin(), sortedClass.end());
3514 				for (UInt32 j = sortedClass.size() - 1; j > 0; --j)
3515 					if (sortedClass[j] == sortedClass[j - 1])
3516 						sortedClass.erase(sortedClass.begin() + j);
3517 			}
3518 
3519 			appendToTable(classes, (UInt32)sortedClass.size());
3520 			if (fromUni)
3521 				if (currentPass.supplementaryChars)
3522 					for (Class::iterator x = sortedClass.begin(); x != sortedClass.end(); ++x)
3523 						appendToTable(classes, *x);
3524 				else
3525 					for (Class::iterator x = sortedClass.begin(); x != sortedClass.end(); ++x)
3526 						appendToTable(classes, (UInt16)*x);
3527 			else
3528 				for (Class::iterator x = sortedClass.begin(); x != sortedClass.end(); ++x)
3529 					appendToTable(classes, (UInt8)*x);
3530 			align(classes, 4);
3531 		}
3532 		// copy the real classOffsets into the table
3533 		for (i = 0; i < classOffsets.size(); ++i)
3534 			appendToTable(table, classOffsets[i]);
3535 
3536 		// now append the actual classes
3537 		table.insert(table.end(), classes.begin(), classes.end());
3538 	}
3539 
3540 	// sort and output the replacement classes
3541 	{
3542 		WRITE(th.repClassBase, table.size());
3543 		vector<UInt32>	classOffsets;
3544 		classOffsets.resize(repClasses.size());
3545 		UInt32	classOffset = repClasses.size() * sizeof(UInt32);
3546 		string	classes;
3547 
3548 		for (i = 0; i < repClasses.size(); ++i) {
3549 			classOffsets[i] = classOffset + classes.size();
3550 			vector<Member>	sortedClass;
3551 			const Class&	values = toUni
3552 									? currentPass.uniClassMembers[repClasses[i].membersClass]
3553 									: currentPass.byteClassMembers[repClasses[i].membersClass];
3554 			const Class&	keys = fromUni
3555 									? currentPass.uniClassMembers[repClasses[i].sortLikeClass]
3556 									: currentPass.byteClassMembers[repClasses[i].sortLikeClass];
3557 			for (UInt32 j = 0; j < values.size(); ++j)
3558 				sortedClass.push_back(Member(values[j], keys[j]));
3559 
3560 			if (sortedClass.size() > 0) {
3561 				sort(sortedClass.begin(), sortedClass.end());
3562 				for (UInt32 j = sortedClass.size() - 1; j > 0; --j)
3563 					if (sortedClass[j].key == sortedClass[j - 1].key)
3564 						sortedClass.erase(sortedClass.begin() + j);
3565 			}
3566 
3567 			appendToTable(classes, (UInt32)sortedClass.size());
3568 			if (toUni)
3569 				if (currentPass.supplementaryChars)
3570 					for (vector<Member>::iterator x = sortedClass.begin(); x != sortedClass.end(); ++x)
3571 						appendToTable(classes, x->value);
3572 				else
3573 					for (vector<Member>::iterator x = sortedClass.begin(); x != sortedClass.end(); ++x)
3574 						appendToTable(classes, (UInt16)x->value);
3575 			else
3576 				for (vector<Member>::iterator x = sortedClass.begin(); x != sortedClass.end(); ++x)
3577 					appendToTable(classes, (UInt8)(x->value));
3578 			align(classes, 4);
3579 		}
3580 		// copy the real classOffsets into the table
3581 		for (i = 0; i < classOffsets.size(); ++i)
3582 			appendToTable(table, classOffsets[i]);
3583 
3584 		// now append the actual classes
3585 		table.insert(table.end(), classes.begin(), classes.end());
3586 	}
3587 
3588 	// stuff the real header values into the beginning of the table
3589 	if (currentPass.supplementaryChars)
3590 		WRITE(th.flags, READ(th.flags) | kTableFlags_Supplementary);
3591 	WRITE(th.length, table.size());
3592 	table.replace(0, sizeof(th), (const char*)&th, sizeof(th));
3593 }
3594 
3595 void
clear()3596 Compiler::Pass::clear()
3597 {
3598 	fwdRules.clear();
3599 	revRules.clear();
3600 	xmlRules.clear();
3601 	xmlContexts.clear();
3602 
3603 	byteClassNames.clear();
3604 	uniClassNames.clear();
3605 
3606 	byteClassMembers.clear();
3607 	uniClassMembers.clear();
3608 
3609 	uniDefault = 0xfffd;	// REPLACEMENT CHARACTER
3610 	byteDefault = '?';
3611 	passType = 0;
3612 	supplementaryChars = false;
3613 	startingLine = 0;
3614 }
3615 
3616 void
setLineNo(UInt32 lineNo)3617 Compiler::Pass::setLineNo(UInt32 lineNo)
3618 {
3619 	if (startingLine == 0)
3620 		startingLine = lineNo;
3621 }
3622 
3623 void
clear()3624 Compiler::BuildVars::clear()
3625 {
3626 	planeMap.erase(planeMap.begin(), planeMap.end());
3627 	pageMaps.clear();
3628 	charMaps.clear();
3629 	maxMatch = 1;
3630 	maxPre = 0;
3631 	maxPost = 0;
3632 	maxOutput = 0;
3633 }
3634 
3635 void
clear()3636 Compiler::CurrRule::clear()
3637 {
3638 	lhsString.clear();
3639 	lhsPreContext.clear();
3640 	lhsPostContext.clear();
3641 	rhsString.clear();
3642 	rhsPreContext.clear();
3643 	rhsPostContext.clear();
3644 	startingLine = 0;
3645 }
3646 
3647 void
setLineNo(UInt32 lineNo)3648 Compiler::CurrRule::setLineNo(UInt32 lineNo)
3649 {
3650 	if (startingLine == 0)
3651 		startingLine = lineNo;
3652 }
3653