1 /*------------------------------------------------------------------------
2 Copyright (C) 2002-2016 SIL International. All rights reserved.
3
4 Distributable under the terms of either the Common Public License or the
5 GNU Lesser General Public License, as specified in the LICENSING.txt file.
6
7 File: Compiler.cpp
8 Responsibility: Jonathan Kew
9 Last reviewed: Not yet.
10
11 Description:
12 Implements the TECkit mapping compiler.
13 -------------------------------------------------------------------------*/
14
15 /*
16 2008-11-17 jk include <cstdio> (Debian bug 505693)
17 2006-06-19 jk added new APIs to look up Unicode names
18 2006-01-12 jk removed multi-char constants, use FOUR_CHAR_CODE to define UInt32 values instead
19 (no functional change, just to avoid compiler warnings)
20 2005-07-07 jk 2.1.5 changed to use WORDS_BIGENDIAN rather than TARGET_RT_BIG_ENDIAN
21 2005-06-20 jk 2.1.4 added lhsDefault/rhsDefault attributes to <pass> elem in xml output
22 23-May-2005 changes for 64-bit compilation, from Ulrik P
23 21-May-2005 changes based on Ulrik Petersen's patch for MS VC++ 6
24 2004-11-11 jk 2.1.3 added support for XML export
25 2004-07-21 jk 2.1.2 removed trailing spaces from 2 names in UnicodeNames.cpp
26 2004-06-16 jk 2.1.1 fixed bug of ignoring char after '_'
27 2004-03-12 jk 2.1 updated for version 2.1 with ...Opt APIs
28 modified compiler to accept Unicode source text
29 */
30
31 #include "Compiler.h"
32
33 #include <cstdio>
34 #include <iostream>
35 #include <iomanip>
36 #include <algorithm>
37 #include <cstring>
38 #include <cstdio>
39
40 #include "zlib.h"
41
42 const UInt32 kInvalidChar = 0xfffffffdUL;
43
44 static UInt32
45 offsetsFromUTF8[6] = {
46 0x00000000UL,
47 0x00003080UL,
48 0x000E2080UL,
49 0x03C82080UL,
50 0xFA082080UL,
51 0x82082080UL
52 };
53
54 static UInt8
55 bytesFromUTF8[256] = {
56 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
57 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
58 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
59 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
60 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
61 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
62 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
63 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
64 };
65
66 static UInt8
67 firstByteMark[7] = {
68 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
69 };
70
71 const int halfShift = 10;
72 const UInt32 halfBase = 0x0010000UL;
73 const UInt32 kSurrogateHighStart = 0xD800UL;
74 const UInt32 kSurrogateHighEnd = 0xDBFFUL;
75 const UInt32 kSurrogateLowStart = 0xDC00UL;
76 const UInt32 byteMask = 0x000000BFUL;
77 const UInt32 byteMark = 0x00000080UL;
78
79 #define FOUR_CHAR_CODE(a,b,c,d) (UInt32)((a << 24) + (b << 16) + (c << 8) + d)
80
81 const UInt32 kCode_Byte = FOUR_CHAR_CODE('B','y','t','e');
82 const UInt32 kCode_BU = FOUR_CHAR_CODE('B','-','>','U');
83 const UInt32 kCode_UB = FOUR_CHAR_CODE('U','-','>','B');
84 const UInt32 kCode_Unic = FOUR_CHAR_CODE('U','n','i','c');
85 const UInt32 kCode_NFCf = FOUR_CHAR_CODE('N','F','C','f');
86 const UInt32 kCode_NFCr = FOUR_CHAR_CODE('N','F','C','r');
87 const UInt32 kCode_NFC = FOUR_CHAR_CODE('N','F','C',' ');
88 const UInt32 kCode_NFDf = FOUR_CHAR_CODE('N','F','D','f');
89 const UInt32 kCode_NFDr = FOUR_CHAR_CODE('N','F','D','r');
90 const UInt32 kCode_NFD = FOUR_CHAR_CODE('N','F','D',' ');
91
92 Compiler::Keyword
93 Compiler::keywords[] = {
94 { "Pass", tok_Pass, 0 },
95 { "Byte", tok_PassType, kCode_Byte },
96 { "Byte_Unicode", tok_PassType, kCode_BU },
97 { "Unicode_Byte", tok_PassType, kCode_UB },
98 { "Unicode", tok_PassType, kCode_Unic },
99 { "NFC_fwd", tok_PassType, kCode_NFCf },
100 { "NFC_rev", tok_PassType, kCode_NFCr },
101 { "NFC", tok_PassType, kCode_NFC },
102 { "NFD_fwd", tok_PassType, kCode_NFDf },
103 { "NFD_rev", tok_PassType, kCode_NFDr },
104 { "NFD", tok_PassType, kCode_NFD },
105 { "Class", tok_Class, 0 },
106 { "ByteClass", tok_Class, 'B' },
107 { "UniClass", tok_Class, 'U' },
108 { "ByteDefault", tok_Default, 'B' },
109 { "UniDefault", tok_Default, 'U' },
110 { "EncodingName", tok_Name, kNameID_LHS_Name },
111 { "DescriptiveName", tok_Name, kNameID_LHS_Description },
112 { "Name", tok_Name, 0xffffffff },
113 { "LHSName", tok_Name, kNameID_LHS_Name },
114 { "LHSDescription", tok_Name, kNameID_LHS_Description },
115 { "RHSName", tok_Name, kNameID_RHS_Name },
116 { "RHSDescription", tok_Name, kNameID_RHS_Description },
117 { "Version", tok_Name, kNameID_Version },
118 { "Contact", tok_Name, kNameID_Contact },
119 { "RegistrationAuthority", tok_Name, kNameID_RegAuthority },
120 { "RegistrationName", tok_Name, kNameID_RegName },
121 { "Copyright", tok_Name, kNameID_Copyright },
122 { "LHSFlags", tok_Flags, 'S' },
123 { "RHSFlags", tok_Flags, 'T' },
124 { "ExpectsNFC", tok_FlagValue, kFlags_ExpectsNFC },
125 { "ExpectsNFD", tok_FlagValue, kFlags_ExpectsNFD },
126 { "GeneratesNFC", tok_FlagValue, kFlags_GeneratesNFC },
127 { "GeneratesNFD", tok_FlagValue, kFlags_GeneratesNFD },
128 { "VisualOrder", tok_FlagValue, kFlags_VisualOrder },
129 { "Define", tok_Define, 0 },
130 { 0, tok_Identifier, 0 }
131 };
132
133 UInt32
134 WINAPI
TECkit_GetCompilerVersion()135 TECkit_GetCompilerVersion()
136 {
137 return kCurrentTECkitVersion;
138 }
139
140 TECkit_Status
141 WINAPI
TECkit_Compile(char * txt,UInt32 len,Byte doCompression,TECkit_ErrorFn errFunc,void * userData,Byte ** outTable,UInt32 * outLen)142 TECkit_Compile(char* txt, UInt32 len, Byte doCompression, TECkit_ErrorFn errFunc, void* userData, Byte** outTable, UInt32* outLen)
143 {
144 TECkit_Status result = kStatus_CompilationFailed;
145 try {
146 Compiler* cmp = new Compiler(txt, len, kForm_Unspecified, (bool)doCompression, false, errFunc, userData);
147 cmp->GetCompiledTable(*outTable, *outLen);
148 if (*outTable == 0)
149 result = kStatus_CompilationFailed;
150 else {
151 cmp->DetachCompiledTable();
152 result = kStatus_NoError;
153 }
154 delete cmp;
155 }
156 catch (...) {
157 result = kStatus_Exception;
158 }
159 return result;
160 }
161
162 TECkit_Status
163 WINAPI
TECkit_CompileOpt(char * txt,UInt32 len,TECkit_ErrorFn errFunc,void * userData,Byte ** outTable,UInt32 * outLen,UInt32 opts)164 TECkit_CompileOpt(char* txt, UInt32 len, TECkit_ErrorFn errFunc, void* userData, Byte** outTable, UInt32* outLen, UInt32 opts)
165 {
166 TECkit_Status result = kStatus_CompilationFailed;
167 try {
168 Compiler* cmp = new Compiler(txt, len, (opts & kCompilerOpts_FormMask),
169 (opts & kCompilerOpts_Compress) != 0, (opts & kCompilerOpts_XML) != 0, errFunc, userData);
170 cmp->GetCompiledTable(*outTable, *outLen);
171 if (*outTable == 0)
172 result = kStatus_CompilationFailed;
173 else {
174 cmp->DetachCompiledTable();
175 result = kStatus_NoError;
176 }
177 delete cmp;
178 }
179 catch (...) {
180 result = kStatus_Exception;
181 }
182 return result;
183 }
184
185 void
186 WINAPI
TECkit_DisposeCompiled(Byte * table)187 TECkit_DisposeCompiled(Byte* table)
188 {
189 if (table != 0)
190 free(table);
191 }
192
193 char*
194 WINAPI
TECkit_GetUnicodeName(UInt32 usv)195 TECkit_GetUnicodeName(UInt32 usv)
196 {
197 const CharName *c = &gUnicodeNames[0];
198 while (c->name != 0)
199 if (c->usv == usv)
200 return (char*)c->name;
201 else
202 ++c;
203 return NULL;
204 }
205
206 char*
207 WINAPI
TECkit_GetTECkitName(UInt32 usv)208 TECkit_GetTECkitName(UInt32 usv)
209 {
210 static char buffer[256];
211 const char* name = TECkit_GetUnicodeName(usv);
212 if (name == NULL)
213 sprintf(buffer, "U+%04X", usv);
214 else {
215 char* cp = &buffer[0];
216 while (*name && (cp - buffer < 255)) {
217 if ((*name < '0') || (*name > '9' && *name < 'A') || (*name > 'Z'))
218 *cp++ = '_';
219 else
220 *cp++ = *name | 0x20;
221 ++name;
222 }
223 *cp = 0;
224 }
225 return buffer;
226 }
227
228 static int
unicodeNameCompare(const char * uniName,const char * idStr,UInt32 len)229 unicodeNameCompare(const char* uniName, const char* idStr, UInt32 len)
230 { // idStr could be either a "real" unicode name or a teckit identifier
231 // when this is used by the TECkit_GetUnicodeValue API
232 while (*uniName || len != 0) {
233 if (len == 0)
234 return 1;
235 char u = *uniName++;
236 char i = *idStr++;
237 --len;
238 if ((i >= 'a') && (i <= 'z'))
239 i &= ~0x20;
240 if (u == i)
241 continue;
242 if ((u < '0') || (u > '9' && u < 'A') || (u > 'Z'))
243 u = '_';
244 if (u == i)
245 continue;
246 return u < i ? -1 : 1;
247 }
248 return 0;
249 }
250
251 int
252 WINAPI
TECkit_GetUnicodeValue(char * name)253 TECkit_GetUnicodeValue(char* name)
254 {
255 const CharName *c = &gUnicodeNames[0];
256 int len = strlen(name);
257 while (c->name != 0)
258 if (unicodeNameCompare(c->name, name, len) == 0)
259 return c->usv;
260 else
261 ++c;
262 return -1;
263 }
264
265 inline UInt8
READ(const UInt8 p)266 READ(const UInt8 p)
267 {
268 return p;
269 }
270
271 inline UInt16
READ(const UInt16 p)272 READ(const UInt16 p)
273 {
274 #ifdef WORDS_BIGENDIAN
275 return p;
276 #else
277 return (p >> 8) + (p << 8);
278 #endif
279 }
280
281 inline UInt32
READ(const UInt32 p)282 READ(const UInt32 p)
283 {
284 #ifdef WORDS_BIGENDIAN
285 return p;
286 #else
287 return (p >> 24) + ((p >> 8) & 0x0000ff00) + ((p << 8) & 0x00ff0000) + (p << 24);
288 #endif
289 }
290
291 template<class T>
292 inline void
WRITE(T & t,UInt32 v)293 WRITE(T& t, UInt32 v)
294 {
295 t = READ(T(v));
296 }
297
298 void
appendToTable(string & s,const char * ptr,UInt32 len)299 Compiler::appendToTable(string& s, const char* ptr, UInt32 len)
300 {
301 #ifdef WORDS_BIGENDIAN
302 s.append(ptr, len);
303 #else
304 ptr += len;
305 while (len-- > 0)
306 s.append(1, *--ptr);
307 #endif
308 }
309
310 static inline bool
isIDstart(char c)311 isIDstart(char c)
312 {
313 return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_';
314 }
315
316 static inline bool
isIDcont(char c)317 isIDcont(char c)
318 {
319 return isIDstart(c) || (c >= '0' && c <= '9');
320 }
321
322 static bool
strmatch(const char * str,const char * txt,UInt32 len)323 strmatch(const char* str, const char* txt, UInt32 len)
324 {
325 while (*str || len != 0) {
326 if (len == 0)
327 return false;
328 if ((*str++ | 0x20) != (*txt++ | 0x20))
329 return false;
330 --len;
331 }
332 return true;
333 }
334
335 static const char*
getClassName(const map<string,UInt32> & nameMap,UInt32 index)336 getClassName(const map<string,UInt32>& nameMap, UInt32 index)
337 {
338 for (map<string,UInt32>::const_iterator i = nameMap.begin(); i != nameMap.end(); ++i) {
339 if (i->second == index) {
340 return i->first.c_str();
341 }
342 }
343 return "[UNKNOWN]";
344 }
345
346 static const char*
asHex(UInt32 val,short digits)347 asHex(UInt32 val, short digits)
348 {
349 static char str[16];
350 sprintf(str, "%0*X", digits, val);
351 return str;
352 }
353
354 static const char*
asDec(UInt32 val)355 asDec(UInt32 val)
356 {
357 static char str[16];
358 sprintf(str, "%d", val);
359 return str;
360 }
361
362 void
xmlOut(const char * s)363 Compiler::xmlOut(const char* s)
364 {
365 xmlRepresentation += s;
366 }
367
368 void
xmlOut(const string & s)369 Compiler::xmlOut(const string& s)
370 {
371 xmlRepresentation += s;
372 }
373
374 void
xmlOut(char c)375 Compiler::xmlOut(char c)
376 {
377 xmlRepresentation += c;
378 }
379
380 string
getContextID(const vector<Item> & ctx,bool isUnicode)381 Compiler::getContextID(const vector<Item>& ctx, bool isUnicode)
382 {
383 string contextString = xmlString(ctx.begin(), ctx.end(), isUnicode);
384 string contextID = currentPass.xmlContexts[contextString];
385 if (contextID.length() == 0) {
386 contextID = isUnicode ? "uctx_" : "bctx_";
387 contextID += asDec(currentPass.xmlContexts.size());
388 currentPass.xmlContexts[contextString] = contextID;
389 }
390 return contextID;
391 }
392
393 string
xmlString(vector<Item>::const_iterator b,vector<Item>::const_iterator e,bool isUnicode)394 Compiler::xmlString(vector<Item>::const_iterator b, vector<Item>::const_iterator e, bool isUnicode)
395 {
396 string rval;
397 if (b == e)
398 return rval;
399 for (vector<Item>::const_iterator i = b; i != e; ++i) {
400 switch (i->type) {
401 case 0:
402 rval += "<ch n=\"";
403 rval += asHex(i->val, isUnicode ? 4 : 2);
404 rval += "\"";
405 break;
406 case kMatchElem_Type_EOS:
407 rval += "<eot";
408 break;
409 case kMatchElem_Type_ANY:
410 rval += "<any";
411 break;
412 case kMatchElem_Type_BGroup:
413 {
414 vector<Item>::const_iterator j = i;
415 int nesting = 0;
416 bool alt = false;
417 string groupStr;
418 ++i;
419 while (++j != e) {
420 if (j->type == kMatchElem_Type_BGroup)
421 ++nesting;
422 else if (j->type == kMatchElem_Type_EGroup) {
423 if (nesting == 0) {
424 if (alt && i < j - 1)
425 groupStr += "<group>\n";
426 groupStr += xmlString(i, j, isUnicode);
427 if (alt && i < j - 1)
428 groupStr += "</group>\n";
429 break;
430 }
431 else
432 --nesting;
433 }
434 else if (j->type == kMatchElem_Type_OR && nesting == 0) {
435 if (i < j - 1)
436 groupStr += "<group>\n";
437 groupStr += xmlString(i, j, isUnicode);
438 if (i < j - 1)
439 groupStr += "</group>\n";
440 i = j + 1;
441 alt = true;
442 }
443 }
444 i = j;
445 rval += "<group";
446 if (alt)
447 rval += " alt=\"1\"";
448 if ((i->repeatMin != 1) && (i->repeatMin != 255)) {
449 rval += " min=\"";
450 rval += asDec(i->repeatMin);
451 rval += "\"";
452 }
453 if ((i->repeatMax != 1) && (i->repeatMax != 255)) {
454 rval += " max=\"";
455 rval += asDec(i->repeatMax);
456 rval += "\"";
457 }
458 if (i->tag.length() > 0) {
459 if (i->type != kMatchElem_Type_Copy) {
460 rval += " id=\"";
461 rval += i->tag;
462 rval += "\"";
463 }
464 }
465 rval += ">\n";
466 rval += groupStr;
467 rval += "</group>\n";
468 continue;
469 }
470 break;
471 case kMatchElem_Type_OR:
472 rval += "<OR/>\n";
473 continue;
474 break;
475 case kMatchElem_Type_EGroup:
476 rval += "<END-GROUP/>\n";
477 continue;
478 break;
479 case kMatchElem_Type_Class:
480 {
481 rval += "<class-ref name=\"";
482 const map<string,UInt32>& classes = isUnicode ? currentPass.uniClassNames : currentPass.byteClassNames;
483 rval += isUnicode ? "u_" : "b_";
484 rval += getClassName(classes, i->val);
485 rval += "\"";
486 }
487 break;
488 case kMatchElem_Type_Copy:
489 rval += "<copy-ref id=\"";
490 rval += i->tag;
491 rval += "\"";
492 break;
493 default:
494 rval += "<UNKNOWN type=\"";
495 rval += asHex(i->type, 1);
496 break;
497 }
498 if (i->negate)
499 rval += " neg=\"1\"";
500 if ((i->repeatMin != 1) && (i->repeatMin != 255)) {
501 rval += " min=\"";
502 rval += asDec(i->repeatMin);
503 rval += "\"";
504 }
505 if ((i->repeatMax != 1) && (i->repeatMax != 255)) {
506 rval += " max=\"";
507 rval += asDec(i->repeatMax);
508 rval += "\"";
509 }
510 if (i->tag.length() > 0) {
511 if (i->type != kMatchElem_Type_Copy) {
512 rval += " id=\"";
513 rval += i->tag;
514 rval += "\"";
515 }
516 }
517 rval += "/>";
518 }
519 return rval;
520 }
521
Compiler(const char * txt,UInt32 len,char inForm,bool cmp,bool genXML,TECkit_ErrorFn errFunc,void * userData)522 Compiler::Compiler(const char* txt, UInt32 len, char inForm, bool cmp, bool genXML, TECkit_ErrorFn errFunc, void* userData)
523 {
524 compiledTable = 0;
525 compiledSize = 0;
526 usedExtStringRules = false;
527
528 textPtr = (const unsigned char*)txt;
529 textEnd = textPtr + len;
530
531 ungotten = kInvalidChar;
532 inputForm = inForm;
533
534 generateXML = genXML;
535
536 lineNumber = 1;
537 errorState = false;
538 errorCount = 0;
539
540 errorFunction = errFunc;
541 errFuncUserData = userData;
542
543 names.clear();
544 fwdTables.clear();
545 revTables.clear();
546 currentPass.clear();
547 buildVars.clear();
548 currentRule.clear();
549
550 lhsFlags = 0;
551 rhsFlags = 0;
552
553 char classType;
554
555 ruleState = notInRule;
556 int nestingLevel = 0;
557
558 defIter = defEnd;
559
560 while (inputForm == kForm_Unspecified) {
561 // attempt to determine input encoding
562 if (len >= 4) {
563 // check for UTF32 BOM or 3 nulls in first 4 bytes
564 if (strncmp(txt, "\0\0\376\377", 4) == 0) {
565 inputForm = kForm_UTF32BE;
566 break;
567 }
568 if (strncmp(txt, "\377\376\0\0", 4) == 0) {
569 inputForm = kForm_UTF32LE;
570 break;
571 }
572 if (strncmp(txt, "\0\0\0", 3) == 0) {
573 inputForm = kForm_UTF32BE;
574 break;
575 }
576 if (strncmp(txt+1, "\0\0\0", 3) == 0) {
577 inputForm = kForm_UTF32LE;
578 break;
579 }
580 }
581 if (len >= 3) {
582 // check for UTF8 signature
583 if (strncmp(txt, "\357\273\277", 3) == 0) {
584 inputForm = kForm_UTF8;
585 break;
586 }
587 }
588 if (len >= 2) {
589 // check for UTF16 BOM or null byte
590 if (strncmp(txt, "\376\377", 2) == 0) {
591 inputForm = kForm_UTF16BE;
592 break;
593 }
594 if (strncmp(txt, "\377\376", 2) == 0) {
595 inputForm = kForm_UTF16LE;
596 break;
597 }
598 if (txt[0] == '\0') {
599 inputForm = kForm_UTF16BE;
600 break;
601 }
602 if (txt[1] == '\0') {
603 inputForm = kForm_UTF16LE;
604 break;
605 }
606 }
607 inputForm = kForm_Bytes;
608 };
609
610 if (inputForm != kForm_Bytes) {
611 // discard initial BOM if present
612 UInt32 currCh = getChar();
613 if (currCh != 0xfeff)
614 ungetChar(currCh);
615 }
616
617 while (GetNextToken()) {
618 // on error, skip to next newline
619 GOT_TOKEN:
620 if (errorState) {
621 if (ruleState != notInRule) {
622 ruleState = notInRule;
623 nestingLevel = 0;
624 currentRule.clear();
625 }
626 if (tok.type != tok_Newline)
627 continue;
628 }
629 errorState = false;
630
631 string32::const_iterator i;
632 switch ((int)tok.type) {
633 default:
634 Error("this can't happen!");
635 break;
636
637 case tok_Unknown:
638 Error("unexpected character", string(1, tok.val).c_str());
639 break;
640
641 case tok_Identifier:
642 Error("unexpected identifier", asUTF8(tok.strval).c_str());
643 break;
644
645 case tok_Define:
646 if (!ExpectToken(tok_Identifier, "expected identifier after Define"))
647 break;
648 {
649 string defName(asUTF8(tok.strval));
650 tokListT defToks;
651 while (GetNextToken()) {
652 if (tok.type == tok_Newline)
653 break;
654 if (tok.type == tok_Unknown)
655 Error("unexpected character in Define text", string(1, tok.val).c_str());
656 else
657 defToks.push_back(tok);
658 }
659 defines[defName] = defToks;
660 }
661 break;
662
663 case tok_Newline:
664 switch (ruleState) {
665 default:
666 break;
667
668 case inLHSString:
669 case inLHSPreContext:
670 case inLHSPostContext:
671 Error("no mapping operator found");
672 goto GOT_TOKEN;
673
674 case inRHSString:
675 case inRHSPreContext:
676 case inRHSPostContext:
677 if (nestingLevel > 0) {
678 Error("unmatched opening parenthesis");
679 break;
680 }
681 if (ruleType == 0 || ruleType == '>') {
682 currentPass.fwdRules.push_back(Rule(currentRule.lhsString,
683 reverseContext(currentRule.lhsPreContext), currentRule.lhsPostContext,
684 currentRule.rhsString, currentRule.startingLine));
685 }
686 if (ruleType == 0 || ruleType == '<') {
687 currentPass.revRules.push_back(Rule(currentRule.rhsString,
688 reverseContext(currentRule.rhsPreContext), currentRule.rhsPostContext,
689 currentRule.lhsString, currentRule.startingLine));
690 }
691 if (generateXML) {
692 // create an XML representation of the rule and append to currentPass.xmlRules/xmlContexts
693 bool sourceUni = (currentPass.passType == kCode_UB) || (currentPass.passType == kCode_Unic);
694 bool targetUni = (currentPass.passType == kCode_BU) || (currentPass.passType == kCode_Unic);
695
696 string xmlRule;
697 xmlRule += "<a";
698 xmlRule += " line=\"";
699 xmlRule += asDec(currentRule.startingLine);
700 xmlRule += "\"";
701 if (ruleType == '>')
702 xmlRule += " dir=\"fwd\"";
703 else if (ruleType == '<')
704 xmlRule += " dir=\"rev\"";
705 xmlRule += ">\n";
706
707 string contextID;
708 xmlRule += "<l";
709 if (currentRule.lhsPreContext.size() != 0) {
710 contextID = getContextID(currentRule.lhsPreContext, sourceUni);
711 xmlRule += " preCtx=\"";
712 xmlRule + contextID;
713 xmlRule += "\"";
714 }
715 if (currentRule.lhsPostContext.size() != 0) {
716 contextID = getContextID(currentRule.lhsPostContext, sourceUni);
717 xmlRule += " postCtx=\"";
718 xmlRule += contextID;
719 xmlRule += "\"";
720 }
721 xmlRule += ">";
722 xmlRule += xmlString(currentRule.lhsString.begin(), currentRule.lhsString.end(), sourceUni);
723 xmlRule += "</l>\n";
724
725 xmlRule += "<r";
726 if (currentRule.rhsPreContext.size() != 0) {
727 contextID = getContextID(currentRule.rhsPreContext, targetUni);
728 xmlRule += " preCtx=\"";
729 xmlRule += contextID;
730 xmlRule += "\"";
731 }
732 if (currentRule.rhsPostContext.size() != 0) {
733 contextID = getContextID(currentRule.rhsPostContext, targetUni);
734 xmlRule += " postCtx=\"";
735 xmlRule += contextID;
736 xmlRule += "\"";
737 }
738 xmlRule += ">";
739 xmlRule += xmlString(currentRule.rhsString.begin(), currentRule.rhsString.end(), targetUni);
740 xmlRule += "</r>\n";
741
742 xmlRule += "</a>\n";
743 currentPass.xmlRules.push_back(xmlRule);
744 }
745 currentRule.clear();
746 ruleState = notInRule;
747 break;
748 }
749 break;
750
751 case tok_Number:
752 AppendLiteral(tok.val);
753 break;
754
755 case tok_USV:
756 AppendUSV(tok.val);
757 break;
758
759 case tok_String:
760 if (inputForm == kForm_Bytes && charLimit() != 0xff) {
761 Error("can't use quoted string for Unicodes in 8-bit source text");
762 break;
763 }
764 if (inputForm != kForm_Bytes && charLimit() == 0xff) {
765 Error("can't use quoted string for Bytes in Unicode source text");
766 break;
767 }
768 for (i = tok.strval.begin(); i != tok.strval.end(); ++i)
769 AppendLiteral(*i);
770 break;
771
772 case '^':
773 // negation can only apply to a few things:
774 GetNextToken();
775 switch ((int)tok.type) {
776 case tok_Number:
777 AppendLiteral(tok.val, true);
778 break;
779 case tok_USV:
780 AppendUSV(tok.val, true);
781 break;
782 case '[':
783 if (!ExpectToken(tok_Identifier, "expected CLASS-NAME after opening bracket"))
784 break;
785 AppendClass(asUTF8(tok.strval), true);
786 if (!ExpectToken(']', "expected closing bracket after CLASS-NAME"))
787 break;
788 break;
789 case '#':
790 AppendSpecial(kMatchElem_Type_EOS, true);
791 break;
792 case '.':
793 AppendSpecial(kMatchElem_Type_ANY, true);
794 break;
795 default:
796 Error("invalid use of negation");
797 break;
798 }
799 break;
800
801 case '#':
802 AppendSpecial(kMatchElem_Type_EOS);
803 break;
804
805 case '.':
806 AppendSpecial(kMatchElem_Type_ANY);
807 break;
808
809 case '(':
810 AppendSpecial(kMatchElem_Type_BGroup);
811 ++nestingLevel;
812 break;
813
814 case ')':
815 if (nestingLevel == 0) {
816 Error("unmatched closing parenthesis");
817 break;
818 }
819 --nestingLevel;
820 AppendSpecial(kMatchElem_Type_EGroup);
821 break;
822
823 case '|':
824 if (nestingLevel == 0) {
825 Error("alternation only permitted within parentheses");
826 break;
827 }
828 AppendSpecial(kMatchElem_Type_OR);
829 break;
830
831 case tok_Map:
832 case '>':
833 case '<':
834 if (nestingLevel > 0) {
835 Error("unmatched opening parenthesis");
836 break;
837 }
838 switch (ruleState) {
839 default:
840 Error("not within a mapping rule");
841 break;
842 case inLHSString:
843 case inLHSPostContext:
844 ruleState = inRHSString;
845 ruleType = (tok.type == tok_Map ? 0 : (tok.type == '>' ? '>' : '<'));
846 break;
847 case inLHSPreContext:
848 Error("no underscore found in context");
849 break;
850 case inRHSString:
851 case inRHSPreContext:
852 case inRHSPostContext:
853 Error("extra mapping operator in rule");
854 break;
855 }
856 break;
857
858 case '/':
859 if (nestingLevel > 0) {
860 Error("unmatched opening parenthesis");
861 break;
862 }
863 switch (ruleState) {
864 default:
865 Error("not within a mapping rule");
866 break;
867 case inLHSString:
868 ruleState = inLHSPreContext;
869 break;
870 case inRHSString:
871 ruleState = inRHSPreContext;
872 break;
873 case inLHSPreContext:
874 case inLHSPostContext:
875 case inRHSPreContext:
876 case inRHSPostContext:
877 Error("extra slash in rule");
878 break;
879 }
880 break;
881
882 case '_':
883 if (nestingLevel > 0) {
884 Error("unmatched opening parenthesis");
885 break;
886 }
887 switch (ruleState) {
888 default:
889 Error("not within a mapping rule");
890 break;
891 case inLHSPreContext:
892 ruleState = inLHSPostContext;
893 break;
894 case inRHSPreContext:
895 ruleState = inRHSPostContext;
896 break;
897 case inLHSString:
898 case inRHSString:
899 Error("underscore only allowed in context");
900 break;
901 case inLHSPostContext:
902 case inRHSPostContext:
903 Error("extra underscore in context");
904 break;
905 }
906 break;
907
908 case '[':
909 if (!ExpectToken(tok_Identifier, "expected CLASS-NAME after opening bracket"))
910 break;
911 AppendClass(asUTF8(tok.strval));
912 if (!ExpectToken(']', "expected closing bracket after CLASS-NAME"))
913 break;
914 break;
915
916 case ']':
917 Error("unmatched closing bracket");
918 break;
919
920 case '=':
921 if (!ExpectToken(tok_Identifier, "expected tag name after '='"))
922 break;
923 AssignTag(asUTF8(tok.strval));
924 break;
925
926 case '@':
927 if (!ExpectToken(tok_Identifier, "expected tag name after '@'"))
928 break;
929 AppendSpecial(kMatchElem_Type_Copy);
930 AssignTag(asUTF8(tok.strval));
931 break;
932
933 case '?':
934 SetMinMax(0, 1);
935 break;
936
937 case '*':
938 SetMinMax(0, 15);
939 break;
940
941 case '+':
942 SetMinMax(1, 15);
943 break;
944
945 case '{':
946 {
947 int repeatMin = 0;
948 int repeatMax = 15;
949 GetNextToken();
950 if (tok.type == tok_Number) {
951 repeatMin = repeatMax = tok.val;
952 GetNextToken();
953 if (tok.type == ',') {
954 GetNextToken();
955 if (tok.type == tok_Number) {
956 repeatMax = tok.val;
957 if (!ExpectToken('}', "expected closing brace after repeat counts"))
958 break;
959 }
960 else if (tok.type == '}')
961 repeatMax = 15;
962 else {
963 Error("expected repeat count or closing brace after comma");
964 break;
965 }
966 }
967 else if (tok.type != '}') {
968 Error("expected comma or closing brace after repeat count");
969 break;
970 }
971 }
972 else if (tok.type == ',') {
973 GetNextToken();
974 if (tok.type == tok_Number)
975 repeatMax = tok.val;
976 else {
977 Error("expected repeat count");
978 break;
979 }
980 if (!ExpectToken('}', "expected closing brace after repeat count"))
981 break;
982 }
983 else {
984 Error("expected repeat counts within braces");
985 break;
986 }
987 SetMinMax(repeatMin, repeatMax);
988 }
989 break;
990
991 case '}':
992 Error("unmatched closing brace");
993 break;
994
995 case tok_Name:
996 if (tok.val == 0xffffffff) {
997 if (!ExpectToken('(', "expected (NUMBER) STRING after Name"))
998 break;
999 if (!ExpectToken(tok_Number, "expected (NUMBER) STRING after Name"))
1000 break;
1001 int nameID = tok.val;
1002 if (!ExpectToken(')', "expected (NUMBER) STRING after Name"))
1003 break;
1004 ReadNameString(nameID);
1005 }
1006 else
1007 ReadNameString(tok.val);
1008 goto GOT_TOKEN; // ReadNameString has already read the newline
1009
1010 case tok_Flags:
1011 {
1012 if (!ExpectToken('(', "expected (FLAG-LIST) after SourceFlags/TargetFlags"))
1013 break;
1014 UInt32 flagValue = 0;
1015 char whichFlags = tok.val;
1016 while (1) {
1017 GetNextToken();
1018 if (tok.type == tok_FlagValue)
1019 flagValue |= tok.val;
1020 else
1021 break;
1022 }
1023 if (tok.type != ')') {
1024 Error("expected (FLAG-LIST) after SourceFlags/TargetFlags");
1025 break;
1026 }
1027 if (whichFlags == 'S')
1028 lhsFlags = flagValue;
1029 else
1030 rhsFlags = flagValue;
1031 }
1032 ExpectToken(tok_Newline, "junk at end of line");
1033 break;
1034
1035 case tok_Pass:
1036 FinishPass();
1037 currentPass.setLineNo(lineNumber);
1038 if (!ExpectToken('(', "expected (PASS-TYPE) after Pass"))
1039 break;
1040 GetNextToken();
1041 if (tok.type == tok_PassType)
1042 currentPass.passType = tok.val;
1043 else
1044 Error("unrecognized pass type");
1045 if (!ExpectToken(')', "expected (PASS-TYPE) after Pass"))
1046 break;
1047 ExpectToken(tok_Newline, "junk at end of line");
1048 goto GOT_TOKEN;
1049
1050 case tok_Default:
1051 StartDefaultPass();
1052 if (currentPass.passType != kCode_BU && currentPass.passType != kCode_UB) {
1053 Error("defaults are only used in Byte_Unicode and Unicode_Byte passes");
1054 break;
1055 }
1056 {
1057 char whichDefault = tok.val;
1058 GetNextToken();
1059 switch (tok.type) {
1060 case tok_String:
1061 if (tok.strval.length() != 1)
1062 Error("default can only be a single character, not a multi-character string");
1063 else if (whichDefault == 'U') {
1064 if (inputForm == kForm_Bytes)
1065 Error("UniDefault cannot use quoted character in 8-bit source text");
1066 else
1067 currentPass.uniDefault = tok.strval[0];
1068 }
1069 else {
1070 if (inputForm != kForm_Bytes)
1071 Error("ByteDefault cannot use quoted character in Unicode source text");
1072 else
1073 currentPass.byteDefault = tok.strval[0];
1074 }
1075 break;
1076 case tok_Number:
1077 if (whichDefault == 'U')
1078 currentPass.uniDefault = tok.val;
1079 else
1080 currentPass.byteDefault = tok.val;
1081 break;
1082 case tok_USV:
1083 if (whichDefault == 'U')
1084 currentPass.uniDefault = tok.val;
1085 else
1086 Error("can't use Unicode value in byte encoding");
1087 break;
1088 default:
1089 Error("expected character code after ByteDefault/UniDefault");
1090 break;
1091 }
1092 }
1093 break;
1094
1095 case tok_Class:
1096 StartDefaultPass();
1097 classLine = lineNumber;
1098 if (tok.val == 0) {
1099 if (currentPass.passType == kCode_Byte)
1100 classType = 'B';
1101 else if (currentPass.passType == kCode_Unic)
1102 classType = 'U';
1103 else {
1104 Error("must use ByteClass or UniClass to define classes in this pass");
1105 break;
1106 }
1107 }
1108 else {
1109 classType = tok.val;
1110 if (classType == 'B' && currentPass.passType == kCode_Unic)
1111 Error("can't use ByteClass in this pass");
1112 else if (classType == 'U' && currentPass.passType == kCode_Byte)
1113 Error("can't use UniClass in this pass");
1114 }
1115 {
1116 UInt32 classLimit = (classType == 'U' ? 0x10ffff : 0xff);
1117 if (!ExpectToken('[', "expected [CLASS-NAME] after Class/ByteClass/UniClass"))
1118 break;
1119 if (!ExpectToken(tok_Identifier, "expected [CLASS-NAME] after Class/ByteClass/UniClass"))
1120 break;
1121 string className(asUTF8(tok.strval));
1122 if (!ExpectToken(']', "expected [CLASS-NAME] after Class/ByteClass/UniClass"))
1123 break;
1124 if (!ExpectToken('=', "expected =(CHARACTER-CODE-LIST) after Class/ByteClass/UniClass[CLASS-NAME]"))
1125 break;
1126 if (!ExpectToken('(', "expected =(CHARACTER-CODE-LIST) after Class/ByteClass/UniClass[CLASS-NAME]"))
1127 break;
1128 vector<UInt32> classMembers;
1129 bool ellipsis = false;
1130 bool ellipsisOK = false;
1131 while (tok.type != ')' && tok.type != tok_Newline) {
1132 GetNextToken();
1133 switch ((int)tok.type) {
1134 case tok_USV:
1135 if (classType == 'B') {
1136 Error("can't use Unicode value in byte encoding");
1137 break;
1138 }
1139 // fall through
1140 case tok_Number:
1141 if (tok.val > classLimit) {
1142 Error("class element outside valid range");
1143 break;
1144 }
1145 if (ellipsis) {
1146 ellipsis = false;
1147 ellipsisOK = false;
1148 UInt32 lastVal = classMembers.back();
1149 if (tok.val < lastVal) {
1150 Error("range out of order");
1151 break;
1152 }
1153 while (++lastVal <= tok.val)
1154 classMembers.push_back(lastVal);
1155 }
1156 else {
1157 classMembers.push_back(tok.val);
1158 ellipsisOK = true;
1159 }
1160 if (classMembers.back() > 0x0000ffff)
1161 currentPass.supplementaryChars = true;
1162 break;
1163
1164 case tok_String:
1165 if (classType == 'U' && inputForm == kForm_Bytes) {
1166 Error("can't use quoted string for Unicode class in 8-bit source text");
1167 break;
1168 }
1169 if (classType == 'B' && inputForm != kForm_Bytes) {
1170 Error("can't use quoted string for Byte class in Unicode source text");
1171 break;
1172 }
1173 if (ellipsis) {
1174 ellipsis = false;
1175 ellipsisOK = false;
1176 if (tok.strval.length() != 1) {
1177 Error("can only use single-character string with ..");
1178 break;
1179 }
1180 UInt32 lastVal = classMembers.back();
1181 if (tok.strval[0] < lastVal) {
1182 Error("range out of order");
1183 break;
1184 }
1185 while (++lastVal <= tok.strval[0])
1186 classMembers.push_back(lastVal);
1187 break;
1188 }
1189 ellipsisOK = (tok.strval.length() == 1);
1190 for (i = tok.strval.begin(); i < tok.strval.end(); ++i)
1191 classMembers.push_back(*i);
1192 break;
1193
1194 case tok_Ellipsis:
1195 if (ellipsisOK) {
1196 ellipsisOK = false;
1197 ellipsis = true;
1198 }
1199 else
1200 Error("illegal .. in class");
1201 break;
1202
1203 case '[':
1204 {
1205 if (ellipsis) {
1206 Error("can't use [CLASS-NAME] after ..");
1207 break;
1208 }
1209 ellipsis = false;
1210 ellipsisOK = false;
1211 // get the referenced class and copy in its members
1212 if (ExpectToken(tok_Identifier, "expected [CLASS-NAME]")) {
1213 string refName(asUTF8(tok.strval));
1214 if (classType == 'U') {
1215 map<string,UInt32>::const_iterator c = currentPass.uniClassNames.find(refName);
1216 if (c == currentPass.uniClassNames.end()) {
1217 Error("undefined class used", refName.c_str());
1218 break;
1219 }
1220 Class uc = currentPass.uniClassMembers[c->second];
1221 for (Class::const_iterator i = uc.begin(); i != uc.end(); ++i)
1222 classMembers.push_back(*i);
1223 }
1224 else {
1225 map<string,UInt32>::const_iterator c = currentPass.byteClassNames.find(refName);
1226 if (c == currentPass.byteClassNames.end()) {
1227 Error("undefined class used", refName.c_str());
1228 break;
1229 }
1230 Class bc = currentPass.byteClassMembers[c->second];
1231 for (Class::const_iterator i = bc.begin(); i != bc.end(); ++i)
1232 classMembers.push_back(*i);
1233 }
1234 if (!ExpectToken(']', "expected closing bracket after CLASS-NAME"))
1235 break;
1236 }
1237 }
1238 break;
1239
1240 case ')':
1241 if (ellipsis)
1242 Error("trailing .. in class");
1243 break;
1244
1245 case tok_Newline:
1246 Error("unexpected end of line within class");
1247 break;
1248
1249 case tok_Identifier:
1250 Error("unexpected identifier within class", asUTF8(tok.strval).c_str());
1251 break;
1252
1253 default:
1254 Error("unexpected token within class", string((const char*)tokStart, (const char*)textPtr - (const char*)tokStart).c_str());
1255 break;
1256 }
1257 }
1258 if (tok.type != tok_Newline)
1259 if (!ExpectToken(tok_Newline, "junk at end of line"))
1260 break;
1261 // ok, we've got the class name and members; save it
1262 if (classType == 'U') {
1263 if (currentPass.uniClassNames.find(className) != currentPass.uniClassNames.end()) {
1264 Error("class already defined", className.c_str());
1265 break;
1266 }
1267 currentPass.uniClassNames[className] = currentPass.uniClassMembers.size();
1268 currentPass.uniClassMembers.push_back(classMembers);
1269 currentPass.uniClassLines.push_back(classLine);
1270 }
1271 else {
1272 if (currentPass.byteClassNames.find(className) != currentPass.byteClassNames.end()) {
1273 Error("class already defined", className.c_str());
1274 break;
1275 }
1276 currentPass.byteClassNames[className] = currentPass.byteClassMembers.size();
1277 currentPass.byteClassMembers.push_back(classMembers);
1278 currentPass.byteClassLines.push_back(classLine);
1279 }
1280 goto GOT_TOKEN;
1281 }
1282 break;
1283 }
1284 }
1285 FinishPass();
1286
1287 // Do we have names for both LHS and RHS? If not, is LHS legacy and RHS Unicode?
1288 if (names.find(kNameID_LHS_Name) == names.end()) {
1289 Error("EncodingName or LHSName must be specified");
1290 }
1291 const string& lhs = names[kNameID_LHS_Name];
1292 if (lhs.find("(REG_ID)") != lhs.npos) {
1293 Error("Draft mappings generated by Encore2Unicode MUST be reviewed before use");
1294 }
1295 if (names.find(kNameID_RHS_Name) == names.end()) {
1296 if ((lhsFlags & kFlags_Unicode) == 0 || (rhsFlags & kFlags_Unicode) != 0) {
1297 names[kNameID_RHS_Name] = "UNICODE";
1298 }
1299 else {
1300 Error("RHSName must be specified for non-Legacy/Unicode mapping table");
1301 }
1302 }
1303
1304 if (errorCount == 0) {
1305 if (generateXML) {
1306 string header;
1307 header += "<?xml version=\"1.0\"?>\n";
1308 header += "<teckitMapping\n";
1309
1310 #define doName(att,name_id) \
1311 if (names.find(name_id) != names.end()) { \
1312 header += " "; \
1313 header += att; \
1314 header += "=\""; \
1315 header += names[name_id]; \
1316 header += "\"\n"; \
1317 }
1318
1319 doName("lhsName", kNameID_LHS_Name);
1320 doName("rhsName", kNameID_RHS_Name);
1321 doName("lhsDescription", kNameID_LHS_Description);
1322 doName("rhsDescription", kNameID_RHS_Description);
1323 doName("version", kNameID_Version);
1324 doName("contact", kNameID_Contact);
1325 doName("registrationAuthority", kNameID_RegAuthority);
1326 doName("registrationName", kNameID_RegName);
1327 doName("copyright", kNameID_Copyright);
1328
1329 if (lhsFlags & kFlags_ExpectsNFC)
1330 header += " lhsExpects=\"NFC\"\n";
1331 else if (lhsFlags & kFlags_ExpectsNFD)
1332 header += " lhsExpects=\"NFD\"\n";
1333 if (rhsFlags & kFlags_ExpectsNFC)
1334 header += " rhsExpects=\"NFC\"\n";
1335 else if (rhsFlags & kFlags_ExpectsNFD)
1336 header += " rhsExpects=\"NFD\"\n";
1337
1338 header += ">\n";
1339
1340 string trailer("</teckitMapping>\n");
1341
1342 compiledSize = header.length() + xmlRepresentation.length() + trailer.length();
1343 compiledTable = (Byte*)malloc(compiledSize + 1);
1344 if (compiledTable == NULL)
1345 throw bad_alloc();
1346
1347 memcpy(compiledTable, header.data(), header.length());
1348 memcpy(compiledTable + header.length(), xmlRepresentation.data(), xmlRepresentation.length());
1349 memcpy(compiledTable + header.length() + xmlRepresentation.length(), trailer.data(), trailer.length());
1350 compiledTable[compiledSize] = 0;
1351
1352 xmlRepresentation.erase(xmlRepresentation.begin(), xmlRepresentation.end());
1353 }
1354 else {
1355 // assemble the complete compiled file
1356 FileHeader fh;
1357 WRITE(fh.type, kMagicNumber);
1358 WRITE(fh.version, usedExtStringRules ? kCurrentFileVersion : kFileVersion2_1);
1359 WRITE(fh.headerLength, 0); // to be filled in later, once names and table counts are known
1360 WRITE(fh.formFlagsLHS, lhsFlags);
1361 WRITE(fh.formFlagsRHS, rhsFlags);
1362
1363 WRITE(fh.numFwdTables, fwdTables.size());
1364 WRITE(fh.numRevTables, revTables.size());
1365 WRITE(fh.numNames, names.size());
1366
1367 string offsets;
1368 UInt32 offset = sizeof(FileHeader) + (names.size() + fwdTables.size() + revTables.size()) * sizeof(UInt32);
1369 UInt32 prevLength = 0;
1370
1371 // sort the name IDs into ascending order
1372 vector<UInt16> nameIDs;
1373 nameIDs.reserve(names.size());
1374 for (map<UInt16,string>::const_iterator n = names.begin(); n != names.end(); ++n) {
1375 nameIDs.push_back(n->first);
1376 }
1377 sort(nameIDs.begin(), nameIDs.end());
1378
1379 // pack all the name records
1380 string namesData;
1381 for (vector<UInt16>::const_iterator i = nameIDs.begin(); i != nameIDs.end(); ++i) {
1382 appendToTable(offsets, (const char*)&offset, sizeof(offset));
1383 NameRec r;
1384 WRITE(r.nameID, *i);
1385 WRITE(r.nameLength, names[*i].length());
1386 namesData.append((const char*)&r, sizeof(r));
1387 namesData.append(names[*i]);
1388 if ((namesData.length() & 1) != 0)
1389 namesData.append(1, (char)0);
1390 offset += namesData.length() - prevLength;
1391 prevLength = namesData.length();
1392 }
1393 if ((namesData.length() & 2) != 0)
1394 namesData.append(2, (char)0);
1395 offset += namesData.length() - prevLength;
1396
1397 // pack the offsets to the actual mapping tables
1398 vector<string>::const_iterator t;
1399 for (t = fwdTables.begin(); t != fwdTables.end(); ++t) {
1400 appendToTable(offsets, (const char*)&offset, sizeof(offset));
1401 offset += t->size();
1402 }
1403 for (t = revTables.end(); t != revTables.begin(); ) {
1404 --t;
1405 appendToTable(offsets, (const char*)&offset, sizeof(offset));
1406 offset += t->size();
1407 }
1408
1409 WRITE(fh.headerLength, sizeof(fh) + offsets.length() + namesData.length());
1410
1411 if (errorCount == 0) {
1412 // calculate total size of compiled table, malloc() it, and copy everything into it
1413 compiledSize = sizeof(fh)
1414 + offsets.length()
1415 + namesData.length();
1416 for (t = fwdTables.begin(); t != fwdTables.end(); ++t)
1417 compiledSize += t->length();
1418 for (t = revTables.begin(); t != revTables.end(); ++t)
1419 compiledSize += t->length();
1420
1421 compiledTable = (Byte*)malloc(compiledSize);
1422 if (compiledTable != 0) {
1423 char* cp = (char*)compiledTable;
1424 memcpy(cp, &fh, sizeof(fh));
1425 cp += sizeof(fh);
1426 memcpy(cp, offsets.data(), offsets.length());
1427 cp += offsets.length();
1428 memcpy(cp, namesData.data(), namesData.length());
1429 cp += namesData.length();
1430 for (t = fwdTables.begin(); t != fwdTables.end(); ++t) {
1431 memcpy(cp, t->data(), t->length());
1432 cp += t->length();
1433 }
1434 for (t = revTables.end(); t != revTables.begin(); ) {
1435 --t;
1436 memcpy(cp, t->data(), t->length());
1437 cp += t->length();
1438 }
1439 if ((char*)compiledTable + compiledSize != cp)
1440 cerr << "error!" << endl;
1441 }
1442 else
1443 throw bad_alloc();
1444 }
1445
1446 if (errorCount == 0 && cmp) {
1447 // do the compression...
1448 unsigned long destLen = compiledSize * 11 / 10 + 20;
1449 Byte* dest = (Byte*)malloc(destLen + 8);
1450 if (dest != 0) {
1451 int result = compress2(dest + 8, &destLen, compiledTable, compiledSize, Z_BEST_COMPRESSION);
1452 if (result == Z_OK) {
1453 destLen += 8;
1454 dest = (Byte*)realloc(dest, destLen); // shrink dest to fit
1455 WRITE(((FileHeader*)dest)->type, kMagicNumberCmp);
1456 WRITE(((FileHeader*)dest)->version, compiledSize);
1457 free(compiledTable);
1458 compiledTable = dest;
1459 compiledSize = destLen;
1460 }
1461 else
1462 free(dest);
1463 }
1464 }
1465 }
1466 }
1467 }
1468
~Compiler()1469 Compiler::~Compiler()
1470 {
1471 if (compiledTable != 0)
1472 free(compiledTable);
1473 }
1474
1475 void
GetCompiledTable(Byte * & table,UInt32 & len) const1476 Compiler::GetCompiledTable(Byte*& table, UInt32& len) const
1477 {
1478 table = compiledTable;
1479 len = compiledSize;
1480 }
1481
1482 void
DetachCompiledTable()1483 Compiler::DetachCompiledTable()
1484 {
1485 compiledTable = 0;
1486 compiledSize = 0;
1487 }
1488
1489 string
asUTF8(const string32 s)1490 Compiler::asUTF8(const string32 s)
1491 {
1492 string rval;
1493 string32::const_iterator i;
1494 for (i = s.begin(); i != s.end(); ++i) {
1495 UInt32 c = *i;
1496 int bytesToWrite;
1497 if (c < 0x80) { bytesToWrite = 1;
1498 } else if (c < 0x800) { bytesToWrite = 2;
1499 } else if (c < 0x10000) { bytesToWrite = 3;
1500 } else if (c < 0x200000) { bytesToWrite = 4;
1501 } else { bytesToWrite = 2;
1502 c = 0x0000fffd;
1503 };
1504 rval.append((size_t)bytesToWrite, 0);
1505 int index = rval.length();
1506 switch (bytesToWrite) { /* note: code falls through cases! */
1507 case 4: rval[--index] = (c | byteMark) & byteMask; c >>= 6;
1508 case 3: rval[--index] = (c | byteMark) & byteMask; c >>= 6;
1509 case 2: rval[--index] = (c | byteMark) & byteMask; c >>= 6;
1510 case 1: rval[--index] = c | firstByteMark[bytesToWrite];
1511 };
1512 }
1513 return rval;
1514 }
1515
1516 void
ReadNameString(UInt16 nameID)1517 Compiler::ReadNameString(UInt16 nameID)
1518 {
1519 if (ExpectToken(tok_String, "expected STRING after name keyword")) {
1520 if (inputForm == kForm_Bytes) {
1521 names[nameID].erase(names[nameID].begin(), names[nameID].end());
1522 for (string32::const_iterator i = tok.strval.begin(); i != tok.strval.end(); ++i)
1523 names[nameID].append(1, *i);
1524 }
1525 else
1526 names[nameID] = asUTF8(tok.strval);
1527 ExpectToken(tok_Newline, "junk at end of line");
1528 }
1529 }
1530
1531 void
FinishPass()1532 Compiler::FinishPass()
1533 {
1534 if (currentPass.passType == 0)
1535 return;
1536
1537 if ((currentPass.passType & 0xFFFF0000) == (FOUR_CHAR_CODE('N','F','_','_') & 0xFFFF0000)) {
1538 while (errorCount == 0) {
1539 if (fwdTables.size() == 0)
1540 lhsFlags |= kFlags_Unicode;
1541 else {
1542 if ((rhsFlags & kFlags_Unicode) == 0) {
1543 Error("normalization only supported in Unicode space");
1544 break;
1545 }
1546 }
1547 rhsFlags |= kFlags_Unicode;
1548 string normTable((currentPass.passType & 0x0000FF00) == (FOUR_CHAR_CODE('_','_','C','_') & 0x0000FF00)
1549 ? "NFC " : "NFD ");
1550 if ((currentPass.passType & 0x000000FF) != 'r')
1551 fwdTables.push_back(normTable);
1552 if ((currentPass.passType & 0x000000FF) != 'f')
1553 revTables.push_back(normTable);
1554 if (generateXML) {
1555 xmlOut("<pass lhs=\"unicode\" rhs=\"unicode\" line=\"");
1556 xmlOut(asDec(currentPass.startingLine));
1557 xmlOut("\">\n");
1558 xmlOut("<normalize form=\"");
1559 xmlOut(normTable[2]);
1560 if ((currentPass.passType & 0x000000FF) == 'f')
1561 xmlOut(" dir=\"fwd\"");
1562 else if ((currentPass.passType & 0x000000FF) == 'r')
1563 xmlOut(" dir=\"rev\"");
1564 xmlOut("\">\n");
1565 xmlOut("</pass>\n");
1566 }
1567 break;
1568 }
1569 }
1570 else {
1571 while (errorCount == 0) {
1572 // not really a loop; just so we can use 'break' to exit early
1573 bool sourceUni = (currentPass.passType == kCode_UB) || (currentPass.passType == kCode_Unic);
1574 bool targetUni = (currentPass.passType == kCode_BU) || (currentPass.passType == kCode_Unic);
1575
1576 if (generateXML) {
1577 // pass header
1578 xmlOut("<pass lhs=\"");
1579 xmlOut(sourceUni ? "unicode" : "bytes");
1580 xmlOut("\" rhs=\"");
1581 xmlOut(targetUni ? "unicode" : "bytes");
1582 if (sourceUni != targetUni) {
1583 xmlOut("\" lhsDefault=\"");
1584 xmlOut(sourceUni ? asHex(currentPass.uniDefault, 4) : asHex(currentPass.byteDefault, 2));
1585 xmlOut("\" rhsDefault=\"");
1586 xmlOut(targetUni ? asHex(currentPass.uniDefault, 4) : asHex(currentPass.byteDefault, 2));
1587 }
1588 xmlOut("\" line=\"");
1589 xmlOut(asDec(currentPass.startingLine));
1590 xmlOut("\">\n");
1591
1592 // class definitions
1593 if (currentPass.byteClassMembers.size() > 0 || currentPass.uniClassMembers.size() > 0) {
1594 xmlOut("<classes>\n");
1595 unsigned int i;
1596 for (i = 0; i < currentPass.byteClassMembers.size(); ++i) {
1597 xmlOut("<class size=\"bytes\" name=\"b_");
1598 xmlOut(getClassName(currentPass.byteClassNames, i));
1599 xmlOut("\" line=\"");
1600 xmlOut(asDec(currentPass.byteClassLines[i]));
1601 xmlOut("\">");
1602 for (Class::const_iterator ci = currentPass.byteClassMembers[i].begin(); ci != currentPass.byteClassMembers[i].end(); ++ci) {
1603 xmlOut(ci == currentPass.byteClassMembers[i].begin() ? "\n" : " ");
1604 xmlOut(asHex(*ci, 2));
1605 }
1606 xmlOut("\n</class>\n");
1607 }
1608 for (i = 0; i < currentPass.uniClassMembers.size(); ++i) {
1609 xmlOut("<class size=\"unicode\" name=\"u_");
1610 xmlOut(getClassName(currentPass.uniClassNames, i));
1611 xmlOut("\" line=\"");
1612 xmlOut(asDec(currentPass.uniClassLines[i]));
1613 xmlOut("\">");
1614 for (Class::const_iterator ci = currentPass.uniClassMembers[i].begin(); ci != currentPass.uniClassMembers[i].end(); ++ci) {
1615 xmlOut(ci == currentPass.uniClassMembers[i].begin() ? "\n" : " ");
1616 xmlOut(asHex(*ci, 4));
1617 }
1618 xmlOut("\n</class>\n");
1619 }
1620 xmlOut("</classes>\n");
1621 }
1622
1623 if (currentPass.xmlContexts.size() > 0) {
1624 xmlOut("<contexts>\n");
1625 for (map<string,string>::const_iterator i = currentPass.xmlContexts.begin();
1626 i != currentPass.xmlContexts.end(); ++i) {
1627 xmlOut("<context id=\"");
1628 xmlOut(i->second);
1629 xmlOut("\">");
1630 xmlOut(i->first);
1631 xmlOut("</context>\n");
1632 }
1633 xmlOut("</contexts>\n");
1634 }
1635
1636 xmlOut("<assignments>\n");
1637 for (vector<string>::const_iterator i = currentPass.xmlRules.begin();
1638 i != currentPass.xmlRules.end(); ++i) {
1639 xmlOut(*i);
1640 }
1641 xmlOut("</assignments>\n");
1642
1643 // end pass
1644 xmlOut("</pass>\n");
1645 }
1646
1647 if (fwdTables.size() == 0) {
1648 if (sourceUni)
1649 lhsFlags |= kFlags_Unicode;
1650 }
1651 else {
1652 if (sourceUni != ((rhsFlags & kFlags_Unicode) != 0)) {
1653 Error("code space mismatch");
1654 break;
1655 }
1656 }
1657 rhsFlags &= ~kFlags_Unicode;
1658 if (targetUni)
1659 rhsFlags |= kFlags_Unicode;
1660
1661 // deal with COPY on LHS, and set up class/copy replacement index fields
1662 associateItems(currentPass.fwdRules, sourceUni, targetUni);
1663 if (errorCount > 0)
1664 break;
1665
1666 setGroupPointers(currentPass.fwdRules);
1667
1668 // sort rules by length (also propagates repeat counts from EGroup back to BGroup items)
1669 sortRules(currentPass.fwdRules);
1670 if (errorCount > 0)
1671 break;
1672
1673 // build the forward table
1674 fwdTables.push_back(string());
1675 buildTable(currentPass.fwdRules, sourceUni, targetUni, fwdTables.back());
1676 buildVars.clear();
1677 if (errorCount > 0)
1678 break;
1679
1680 // build the reverse table
1681 associateItems(currentPass.revRules, targetUni, sourceUni);
1682 if (errorCount > 0)
1683 break;
1684 setGroupPointers(currentPass.revRules);
1685 sortRules(currentPass.revRules);
1686 if (errorCount > 0)
1687 break;
1688 revTables.push_back(string());
1689 buildTable(currentPass.revRules, targetUni, sourceUni, revTables.back());
1690 buildVars.clear();
1691 break;
1692 }
1693 }
1694 currentPass.clear();
1695 currentPass.setLineNo(lineNumber);
1696 }
1697
1698 void
SkipSpaces(void)1699 Compiler::SkipSpaces(void)
1700 {
1701 while (textPtr < textEnd) {
1702 UInt32 currCh = getChar();
1703 if (currCh != ' ' && currCh != '\t') {
1704 ungetChar(currCh);
1705 break;
1706 }
1707 }
1708 }
1709
1710 Compiler::tokenType
IDlookup(const char * str,UInt32 len)1711 Compiler::IDlookup(const char* str, UInt32 len)
1712 {
1713 const Keyword *k = &keywords[0];
1714 while (k->keyword != 0)
1715 if (strmatch(k->keyword, str, len)) {
1716 tok.val = k->refCon;
1717 return k->token;
1718 }
1719 else
1720 ++k;
1721
1722 // try for a macro
1723 map<string,tokListT>::const_iterator i = defines.find(string(str, len));
1724 if (i != defines.end()) {
1725 defIter = i->second.begin();
1726 defEnd = i->second.end();
1727 tok = *defIter;
1728 defIter++;
1729 return tok.type;
1730 }
1731
1732 // didn't find the identifier as a keyword; try as a Unicode char name
1733 // NOTE: the names are now sorted (by Unicode name), so we could use a binary
1734 // search here if anyone complains about compilation time when using names :)
1735 const CharName *c = &gUnicodeNames[0];
1736 while (c->name != 0)
1737 if (unicodeNameCompare(c->name, str, len) == 0) {
1738 tok.val = c->usv;
1739 return tok_USV;
1740 }
1741 else
1742 ++c;
1743
1744 #ifdef __MWERKS__
1745 tok.strval.clear();
1746 #else
1747 tok.strval.erase(tok.strval.begin(), tok.strval.end());
1748 #endif
1749 while (len-- > 0)
1750 tok.strval.append(1, *str++);
1751 return tok_Identifier;
1752 }
1753
1754 UInt32
getChar()1755 Compiler::getChar()
1756 {
1757 UInt32 rval = 0;
1758
1759 if (ungotten != kInvalidChar) {
1760 rval = ungotten;
1761 ungotten = kInvalidChar;
1762 return rval;
1763 }
1764
1765 #define CHECK_AVAIL(x) \
1766 if (textPtr + (x) > textEnd) { \
1767 textPtr = textEnd; \
1768 return kInvalidChar; \
1769 }
1770
1771 switch (inputForm) {
1772 case kForm_Bytes:
1773 rval = *textPtr++;
1774 break;
1775
1776 case kForm_UTF8:
1777 {
1778 UInt16 extraBytes = bytesFromUTF8[*textPtr];
1779 CHECK_AVAIL(extraBytes + 1);
1780 switch (extraBytes) { // note: code falls through cases!
1781 case 5: rval += *textPtr++; rval <<= 6;
1782 case 4: rval += *textPtr++; rval <<= 6;
1783 case 3: rval += *textPtr++; rval <<= 6;
1784 case 2: rval += *textPtr++; rval <<= 6;
1785 case 1: rval += *textPtr++; rval <<= 6;
1786 case 0: rval += *textPtr++;
1787 };
1788 rval -= offsetsFromUTF8[extraBytes];
1789 }
1790 break;
1791
1792 case kForm_UTF16BE:
1793 CHECK_AVAIL(2);
1794 rval = *textPtr++ << 8;
1795 rval += *textPtr++;
1796 if (rval >= kSurrogateHighStart && rval <= kSurrogateHighEnd) {
1797 // check that 2 more bytes are available
1798 CHECK_AVAIL(2);
1799 UInt32 low = *textPtr++ << 8;
1800 low += *textPtr++;
1801 rval = ((rval - kSurrogateHighStart) << halfShift) + (low - kSurrogateLowStart) + halfBase;
1802 }
1803 break;
1804
1805 case kForm_UTF16LE:
1806 CHECK_AVAIL(2);
1807 rval = *textPtr++;
1808 rval += *textPtr++ << 8;
1809 if (rval >= kSurrogateHighStart && rval <= kSurrogateHighEnd) {
1810 CHECK_AVAIL(2);
1811 UInt32 low = *textPtr++;
1812 low += *textPtr++ << 8;
1813 rval = ((rval - kSurrogateHighStart) << halfShift) + (low - kSurrogateLowStart) + halfBase;
1814 }
1815 break;
1816
1817 case kForm_UTF32BE:
1818 CHECK_AVAIL(4);
1819 rval = *textPtr++ << 24;
1820 rval += *textPtr++ << 16;
1821 rval += *textPtr++ << 8;
1822 rval += *textPtr++;
1823 break;
1824
1825 case kForm_UTF32LE:
1826 CHECK_AVAIL(4);
1827 rval = *textPtr++;
1828 rval += *textPtr++ << 8;
1829 rval += *textPtr++ << 16;
1830 rval += *textPtr++ << 24;
1831 break;
1832 }
1833
1834 return rval;
1835 }
1836
1837 void
ungetChar(UInt32 c)1838 Compiler::ungetChar(UInt32 c)
1839 {
1840 ungotten = c;
1841 }
1842
1843 bool
GetNextToken()1844 Compiler::GetNextToken()
1845 {
1846 UInt32 currCh;
1847
1848 if (defIter != defEnd) {
1849 tok = *defIter;
1850 defIter++;
1851 return true;
1852 }
1853
1854 if (textPtr == textEnd) {
1855 ++textPtr;
1856 tok.type = tok_Newline;
1857 return true;
1858 }
1859
1860 if (textPtr >= textEnd)
1861 return false;
1862
1863 while (true) {
1864 SkipSpaces();
1865
1866 tokStart = textPtr;
1867
1868 if (textPtr == textEnd) {
1869 ++textPtr;
1870 tok.type = tok_Newline;
1871 ++lineNumber;
1872 return true;
1873 }
1874
1875 if (textPtr > textEnd)
1876 return false;
1877
1878 currCh = getChar();
1879 switch (currCh) {
1880 case '\r':
1881 if (textPtr < textEnd) {
1882 currCh = getChar();
1883 if (currCh != '\n')
1884 ungetChar(currCh);
1885 }
1886 tok.type = tok_Newline;
1887 ++lineNumber;
1888 return true;
1889
1890 case '\n':
1891 if (textPtr < textEnd) {
1892 currCh = getChar();
1893 if (currCh != '\r')
1894 ungetChar(currCh);
1895 }
1896 tok.type = tok_Newline;
1897 ++lineNumber;
1898 return true;
1899
1900 case '\\':
1901 if (textPtr < textEnd) {
1902 currCh = getChar();
1903 if (currCh == '\r' || currCh == '\n') {
1904 if (textPtr < textEnd) {
1905 UInt32 nextCh = getChar();
1906 if (!((currCh == '\r' && nextCh == '\n') || (currCh == '\n' && nextCh == '\r')))
1907 ungetChar(nextCh);
1908 }
1909 ++lineNumber;
1910 continue;
1911 }
1912 ungetChar(currCh);
1913 }
1914 goto DEFAULT;
1915
1916 case '"':
1917 case '\'':
1918 {
1919 UInt32 delimiter = currCh;
1920 #ifdef __MWERKS__
1921 tok.strval.clear();
1922 #else
1923 tok.strval.erase(tok.strval.begin(), tok.strval.end());
1924 #endif
1925 while ((textPtr < textEnd) && ((currCh = getChar()) != delimiter) && (currCh != '\r') && (currCh != '\n'))
1926 tok.strval.append(1, currCh);
1927 tok.type = tok_String;
1928 if (currCh == '\r' || currCh == '\n')
1929 ungetChar(currCh);
1930 }
1931 return true;
1932
1933 case '^':
1934 case '(':
1935 case ')':
1936 case '[':
1937 case ']':
1938 case '{':
1939 case '}':
1940 case ',':
1941 case '+':
1942 case '*':
1943 case '?':
1944 case '>':
1945 case '#':
1946 case '|':
1947 case '/':
1948 case '=':
1949 case '@':
1950 tok.type = (tokenType)currCh;
1951 return true;
1952
1953 case '<':
1954 tok.type = (tokenType)'<';
1955 if (textPtr < textEnd) {
1956 if ((currCh = getChar()) == '>')
1957 tok.type = tok_Map;
1958 else
1959 ungetChar(currCh);
1960 }
1961 return true;
1962
1963 case '.':
1964 tok.type = (tokenType)'.';
1965 if (textPtr < textEnd) {
1966 if ((currCh = getChar()) == '.')
1967 tok.type = tok_Ellipsis;
1968 else
1969 ungetChar(currCh);
1970 }
1971 return true;
1972
1973 case '_':
1974 if (textPtr < textEnd) {
1975 currCh = getChar();
1976 ungetChar(currCh);
1977 if (isIDcont(currCh)) {
1978 currCh = '_';
1979 goto DEFAULT;
1980 }
1981 }
1982 tok.type = (tokenType)'_';
1983 return true;
1984
1985 case '0':
1986 if (textPtr < textEnd) {
1987 currCh = getChar();
1988 if (currCh == 'x' || currCh == 'X') {
1989 tok.type = tok_Number;
1990 tok.val = 0;
1991 while (textPtr < textEnd) {
1992 currCh = getChar();
1993 if (currCh >= '0' && currCh <= '9')
1994 tok.val = tok.val * 16 + currCh - '0';
1995 else if (currCh >= 'a' && currCh <= 'f')
1996 tok.val = tok.val * 16 + currCh - 'a' + 10;
1997 else if (currCh >= 'A' && currCh <= 'F')
1998 tok.val = tok.val * 16 + currCh - 'A' + 10;
1999 else {
2000 ungetChar(currCh);
2001 break;
2002 }
2003 }
2004 return true;
2005 }
2006 ungetChar(currCh);
2007 currCh = '0';
2008 }
2009 // else fall through
2010
2011 case '1':
2012 case '2':
2013 case '3':
2014 case '4':
2015 case '5':
2016 case '6':
2017 case '7':
2018 case '8':
2019 case '9':
2020 tok.type = tok_Number;
2021 tok.val = currCh - '0';
2022 while (textPtr < textEnd) {
2023 currCh = getChar();
2024 if (currCh >= '0' && currCh <= '9')
2025 tok.val = tok.val * 10 + currCh - '0';
2026 else {
2027 ungetChar(currCh);
2028 break;
2029 }
2030 }
2031 return true;
2032
2033 case ';':
2034 {
2035 bool continuation = false;
2036 while (textPtr < textEnd) {
2037 continuation = (currCh == '\\');
2038 currCh = getChar();
2039 if (currCh == '\r' || currCh == '\n')
2040 break;
2041 }
2042 if (textPtr < textEnd) {
2043 UInt32 nextCh = getChar();
2044 if (!((currCh == '\r' && nextCh == '\n') || (currCh == '\n' && nextCh == '\r')))
2045 ungetChar(nextCh);
2046 }
2047 ++lineNumber;
2048 if (continuation)
2049 continue;
2050 else {
2051 tok.type = tok_Newline;
2052 return true;
2053 }
2054 }
2055
2056 case 'U':
2057 // check for U+xxxx USV
2058 if (textPtr < textEnd) {
2059 currCh = getChar();
2060 if (currCh == '+') {
2061 tok.type = tok_USV;
2062 tok.val = 0;
2063 int digitCount = 0;
2064 while (textPtr < textEnd) {
2065 currCh = getChar();
2066 if (currCh >= '0' && currCh <= '9')
2067 tok.val = tok.val * 16 + currCh - '0';
2068 else if (currCh >= 'a' && currCh <= 'f')
2069 tok.val = tok.val * 16 + currCh - 'a' + 10;
2070 else if (currCh >= 'A' && currCh <= 'F')
2071 tok.val = tok.val * 16 + currCh - 'A' + 10;
2072 else {
2073 ungetChar(currCh);
2074 break;
2075 }
2076 ++digitCount;
2077 }
2078 if (digitCount < 4 || digitCount > 6) {
2079 Error("Unicode value (U+xxxx) must have 4-6 hex digits");
2080 tok.val = 0;
2081 }
2082 return true;
2083 }
2084 else
2085 ungetChar(currCh);
2086 }
2087 currCh = 'U';
2088 goto DEFAULT;
2089
2090 // read an identifier or some other 'unknown' character
2091 default:
2092 DEFAULT:
2093 if (isIDstart(currCh)) {
2094 idBuffer[0] = currCh;
2095 tok.val = 1;
2096 while (textPtr < textEnd) {
2097 currCh = getChar();
2098 if (!isIDcont(currCh)) {
2099 ungetChar(currCh);
2100 break;
2101 }
2102 if (tok.val < 256)
2103 idBuffer[tok.val++] = currCh;
2104 }
2105 tok.type = IDlookup(&idBuffer[0], tok.val);
2106 return true;
2107 }
2108 tok.type = tok_Unknown;
2109 tok.val = currCh;
2110 return true;
2111 }
2112 }
2113 }
2114
2115 bool
ExpectToken(tokenType type,const char * errMsg)2116 Compiler::ExpectToken(tokenType type, const char* errMsg)
2117 {
2118 if (!GetNextToken() || tok.type != type) {
2119 Error(errMsg);
2120 return false;
2121 }
2122 return true;
2123 }
2124
2125 void
Error(const char * msg,const char * s,UInt32 line)2126 Compiler::Error(const char* msg, const char* s, UInt32 line)
2127 {
2128 if (line == 0xffffffff)
2129 line = lineNumber;
2130 if (errorFunction == 0) {
2131 cout << "Error: " << msg;
2132 if (s != 0)
2133 cout << ": \"" << s << '"';
2134 cout << " at line " << line << endl;
2135 }
2136 else
2137 (*errorFunction)(errFuncUserData, (char*)msg, (char*)s, line);
2138 errorState = true;
2139 ++errorCount;
2140 }
2141
2142 void
StartDefaultPass()2143 Compiler::StartDefaultPass()
2144 {
2145 if ((currentPass.passType & 0xFFFF0000) == (FOUR_CHAR_CODE('N','F','_','_') & 0xFFFF0000)) {
2146 Error("normalization pass cannot contain any other rules");
2147 currentPass.passType = kCode_Unic;
2148 }
2149 if (currentPass.passType == 0) {
2150 currentPass.clear(); // should already be clear!
2151 currentPass.passType = kCode_BU;
2152 currentPass.setLineNo(lineNumber);
2153 }
2154 }
2155
2156 void
AppendToRule(const Item & item)2157 Compiler::AppendToRule(const Item& item)
2158 {
2159 StartDefaultPass();
2160 switch (ruleState) {
2161 case notInRule:
2162 ruleState = inLHSString;
2163 currentRule.setLineNo(lineNumber);
2164 case inLHSString:
2165 currentRule.lhsString.push_back(item);
2166 break;
2167 case inLHSPreContext:
2168 currentRule.lhsPreContext.push_back(item);
2169 break;
2170 case inLHSPostContext:
2171 currentRule.lhsPostContext.push_back(item);
2172 break;
2173 case inRHSString:
2174 currentRule.rhsString.push_back(item);
2175 break;
2176 case inRHSPreContext:
2177 currentRule.rhsPreContext.push_back(item);
2178 break;
2179 case inRHSPostContext:
2180 currentRule.rhsPostContext.push_back(item);
2181 break;
2182 }
2183 }
2184
2185 UInt32
charLimit()2186 Compiler::charLimit()
2187 {
2188 UInt32 limit;
2189 switch (ruleState) {
2190 case inRHSString:
2191 case inRHSPreContext:
2192 case inRHSPostContext:
2193 limit = (currentPass.passType == kCode_BU || currentPass.passType == kCode_Unic ? 0x10ffff : 0xff);
2194 break;
2195 default:
2196 limit = (currentPass.passType == kCode_UB || currentPass.passType == kCode_Unic ? 0x10ffff : 0xff);
2197 break;
2198 }
2199 return limit;
2200 }
2201
2202 void
AppendLiteral(UInt32 val,bool negate)2203 Compiler::AppendLiteral(UInt32 val, bool negate)
2204 {
2205 StartDefaultPass();
2206 if (val > charLimit()) {
2207 Error("literal value out of range");
2208 return;
2209 }
2210 Item item;
2211 item.type = 0;
2212 item.negate = negate ? 1 : 0;
2213 item.repeatMin = 0xff;
2214 item.repeatMax = 0xff;
2215 item.val = val;
2216 AppendToRule(item);
2217 }
2218
2219 void
AppendUSV(UInt32 val,bool negate)2220 Compiler::AppendUSV(UInt32 val, bool negate)
2221 {
2222 StartDefaultPass();
2223 if (charLimit() == 0xff) {
2224 Error("can't use Unicode character in byte encoding");
2225 return;
2226 }
2227 AppendLiteral(val, negate);
2228 }
2229
2230 void
AppendSpecial(UInt8 type,bool negate)2231 Compiler::AppendSpecial(UInt8 type, bool negate)
2232 {
2233 Item item;
2234 item.type = type;
2235 item.negate = negate ? 1 : 0;
2236 item.repeatMin = 0xff;
2237 item.repeatMax = 0xff;
2238 item.val = 0;
2239 item.start = item.next = item.after = item.index = 0xff;
2240 AppendToRule(item);
2241 }
2242
2243 void
AppendClass(const string & className,bool negate)2244 Compiler::AppendClass(const string& className, bool negate)
2245 {
2246 StartDefaultPass();
2247 Item item;
2248 item.type = kMatchElem_Type_Class;
2249 item.negate = negate ? 1 : 0;
2250 item.repeatMin = 0xff;
2251 item.repeatMax = 0xff;
2252 item.val = 0;
2253 const map<string,UInt32>* classNames;
2254 switch (ruleState) {
2255 case inRHSString:
2256 case inRHSPreContext:
2257 case inRHSPostContext:
2258 classNames = (currentPass.passType == kCode_Byte || currentPass.passType == kCode_UB)
2259 ? ¤tPass.byteClassNames : ¤tPass.uniClassNames;
2260 break;
2261 default:
2262 classNames = (currentPass.passType == kCode_Byte || currentPass.passType == kCode_BU)
2263 ? ¤tPass.byteClassNames : ¤tPass.uniClassNames;
2264 break;
2265 }
2266 map<string,UInt32>::const_iterator i;
2267 i = classNames->find(className);
2268 if (i == classNames->end())
2269 Error("undefined class", className.c_str());
2270 else
2271 item.val = i->second;
2272 AppendToRule(item);
2273 }
2274
2275 bool
tagExists(bool rhs,const string & tag)2276 Compiler::tagExists(bool rhs, const string& tag)
2277 {
2278 if (rhs) {
2279 if ( (findTag(tag, currentRule.rhsString) != -1)
2280 || (findTag(tag, currentRule.rhsPreContext) != -1)
2281 || (findTag(tag, currentRule.rhsPostContext) != -1))
2282 return true;
2283 }
2284 else {
2285 if ( (findTag(tag, currentRule.lhsString) != -1)
2286 || (findTag(tag, currentRule.lhsPreContext) != -1)
2287 || (findTag(tag, currentRule.lhsPostContext) != -1))
2288 return true;
2289 }
2290 return false;
2291 }
2292
2293 void
AssignTag(const string & tag)2294 Compiler::AssignTag(const string& tag)
2295 {
2296 if (currentPass.passType == 0 || ruleState == notInRule) {
2297 Error("item tag doesn't seem to be attached to a rule item", tag.c_str());
2298 return;
2299 }
2300 Item* item = NULL;
2301 switch (ruleState) {
2302 default:
2303 Error("this can't happen (AssignTag)");
2304 return;
2305 case inLHSString:
2306 if (tagExists(false, tag))
2307 break;
2308 item = ¤tRule.lhsString.back();
2309 break;
2310 case inLHSPreContext:
2311 if (tagExists(false, tag))
2312 break;
2313 item = ¤tRule.lhsPreContext.back();
2314 break;
2315 case inLHSPostContext:
2316 if (tagExists(false, tag))
2317 break;
2318 item = ¤tRule.lhsPostContext.back();
2319 break;
2320 case inRHSString:
2321 if (tagExists(true, tag))
2322 break;
2323 item = ¤tRule.rhsString.back();
2324 break;
2325 case inRHSPreContext:
2326 if (tagExists(true, tag))
2327 break;
2328 item = ¤tRule.rhsPreContext.back();
2329 break;
2330 case inRHSPostContext:
2331 if (tagExists(true, tag))
2332 break;
2333 item = ¤tRule.rhsPostContext.back();
2334 break;
2335 }
2336 if (item == NULL) {
2337 Error("duplicate tag (ignored)", tag.c_str());
2338 return;
2339 }
2340 if (item->tag.length() > 0) {
2341 Error("rule item already has a tag", tag.c_str());
2342 return;
2343 }
2344 switch (item->type) {
2345 case 0:
2346 case kMatchElem_Type_Class:
2347 case kMatchElem_Type_EGroup:
2348 case kMatchElem_Type_ANY:
2349 case kMatchElem_Type_Copy:
2350 item->tag = tag;
2351 break;
2352
2353 default:
2354 Error("invalid use of item tag", tag.c_str());
2355 break;
2356 }
2357 }
2358
2359 void
SetMinMax(int repeatMin,int repeatMax)2360 Compiler::SetMinMax(int repeatMin, int repeatMax)
2361 {
2362 Item* item = 0;
2363 switch (ruleState) {
2364 default:
2365 Error("invalid use of repeat count");
2366 break;
2367 case inLHSString:
2368 item = ¤tRule.lhsString.back();
2369 break;
2370 case inLHSPreContext:
2371 item = ¤tRule.lhsPreContext.back();
2372 break;
2373 case inLHSPostContext:
2374 item = ¤tRule.lhsPostContext.back();
2375 break;
2376 case inRHSString:
2377 item = ¤tRule.rhsString.back();
2378 break;
2379 case inRHSPreContext:
2380 item = ¤tRule.rhsPreContext.back();
2381 break;
2382 case inRHSPostContext:
2383 item = ¤tRule.rhsPostContext.back();
2384 break;
2385 }
2386 if (item) {
2387 switch (item->type) {
2388 case 0:
2389 case kMatchElem_Type_Class:
2390 case kMatchElem_Type_ANY:
2391 case kMatchElem_Type_EGroup:
2392 if (repeatMin > repeatMax || repeatMax < 1 || repeatMax > 15)
2393 Error("invalid repeat counts (0-15 allowed)");
2394 else if (item->repeatMin != 0xff)
2395 Error("multiple repeat counts on item");
2396 else {
2397 item->repeatMin = repeatMin;
2398 item->repeatMax = repeatMax;
2399 }
2400 break;
2401 default:
2402 Error("invalid use of repeat count");
2403 break;
2404 }
2405 }
2406 }
2407
2408 void
setGroupPointers(vector<Item>::iterator b,vector<Item>::iterator e,int startIndex,bool isReversed)2409 Compiler::setGroupPointers(vector<Item>::iterator b, vector<Item>::iterator e, int startIndex, bool isReversed)
2410 {
2411 // set up the fwd and back pointers on bgroup/or/egroup
2412 // and propagate repeat counts from egroup to bgroup
2413 vector<Item>::iterator base = b;
2414 vector<Item>::iterator altStart = startIndex > 0 ? base - 1 : e;
2415 bool altSeen = false;
2416 while (b != e) {
2417 if (b->repeatMin == 0xff)
2418 b->repeatMin = 1;
2419 if (b->repeatMax == 0xff)
2420 b->repeatMax = 1;
2421 switch (b->type) {
2422 case 0: // literal
2423 case kMatchElem_Type_Class:
2424 case kMatchElem_Type_ANY:
2425 case kMatchElem_Type_EOS:
2426 break;
2427
2428 case kMatchElem_Type_OR:
2429 // if startIndex > 0, then initial altStart will be valid
2430 if ((startIndex > 0 || altSeen) && (altStart->type == kMatchElem_Type_OR || altStart->type == kMatchElem_Type_BGroup))
2431 altStart->next = startIndex + (b - base);
2432 else {
2433 Error("this can't happen (setGroupPointers 1)");
2434 return;
2435 }
2436 altStart = b;
2437 altStart->start = startIndex - 1;
2438 altSeen = true;
2439 break;
2440
2441 case kMatchElem_Type_EGroup:
2442 Error("this can't happen (setGroupPointers 2)");
2443 return;
2444
2445 case kMatchElem_Type_BGroup:
2446 {
2447 // need to find corresponding EGroup and copy repeat counts from there
2448 // (or vice versa if this is reversed context)
2449 vector<Item>::iterator subGroupStart = b++;
2450 subGroupStart->next = 0;
2451 int nestingLevel = 0;
2452 while (b->type != kMatchElem_Type_EGroup || nestingLevel > 0) {
2453 if (b->type == kMatchElem_Type_BGroup)
2454 ++nestingLevel;
2455 else if (b->type == kMatchElem_Type_EGroup)
2456 --nestingLevel;
2457 ++b;
2458 }
2459 if (isReversed) {
2460 b->repeatMin = subGroupStart->repeatMin;
2461 b->repeatMax = subGroupStart->repeatMax;
2462 }
2463 else {
2464 if (b->repeatMin == 0xff)
2465 b->repeatMin = 1;
2466 if (b->repeatMax == 0xff)
2467 b->repeatMax = 1;
2468 subGroupStart->repeatMin = b->repeatMin;
2469 subGroupStart->repeatMax = b->repeatMax;
2470 }
2471 setGroupPointers(subGroupStart + 1, b, startIndex + (subGroupStart - base + 1), isReversed);
2472 subGroupStart->after = startIndex + (b - base + 1);
2473 b->start = startIndex + (subGroupStart - base);
2474 }
2475 break;
2476 }
2477 ++b;
2478 }
2479 if (altSeen)
2480 altStart->next = startIndex + (b - base); // set NEXT pointer of last OR
2481 if (startIndex > 0) { // we were handling a group, so set pointers of EGroup
2482 if (b->type == kMatchElem_Type_EGroup)
2483 b->start = startIndex - 1;
2484 else {
2485 Error("this can't happen (setGroupPointers 3)");
2486 return;
2487 }
2488 }
2489 }
2490
2491 void
setGroupPointers(vector<Rule> & rules)2492 Compiler::setGroupPointers(vector<Rule>& rules)
2493 {
2494 for (vector<Rule>::iterator i = rules.begin(); i != rules.end(); ++i) {
2495 setGroupPointers(i->matchStr.begin(), i->matchStr.end(), 0);
2496 setGroupPointers(i->preContext.begin(), i->preContext.end(), 0, true);
2497 setGroupPointers(i->postContext.begin(), i->postContext.end(), 0);
2498 }
2499 }
2500
2501 int
calcMaxLen(vector<Item>::iterator b,vector<Item>::iterator e)2502 Compiler::calcMaxLen(vector<Item>::iterator b, vector<Item>::iterator e)
2503 {
2504 int len = 0;
2505 int maxLen = 0;
2506 while (b != e) {
2507 switch (b->type) {
2508 case 0: // literal
2509 case kMatchElem_Type_Class:
2510 case kMatchElem_Type_ANY:
2511 case kMatchElem_Type_EOS:
2512 len += b->repeatMax;
2513 break;
2514
2515 case kMatchElem_Type_OR:
2516 if (len > maxLen)
2517 maxLen = len;
2518 len = 0;
2519 break;
2520
2521 case kMatchElem_Type_EGroup:
2522 Error("this can't happen (calcMaxLen)");
2523 return 0;
2524
2525 case kMatchElem_Type_BGroup:
2526 {
2527 // need to find corresponding EGroup
2528 vector<Item>::iterator subGroupStart = b++;
2529 int nestingLevel = 0;
2530 while (b->type != kMatchElem_Type_EGroup || nestingLevel > 0) {
2531 if (b->type == kMatchElem_Type_BGroup)
2532 ++nestingLevel;
2533 else if (b->type == kMatchElem_Type_EGroup)
2534 --nestingLevel;
2535 ++b;
2536 }
2537 len += subGroupStart->repeatMax * calcMaxLen(subGroupStart + 1, b);
2538 }
2539 break;
2540 }
2541 if (b == e)
2542 break; // this can happen with nested groups that end together
2543 ++b;
2544 }
2545 if (len > maxLen)
2546 maxLen = len;
2547 return maxLen;
2548 }
2549
2550 int
calcMaxOutLen(Rule & rule)2551 Compiler::calcMaxOutLen(Rule& rule)
2552 {
2553 int len = 0;
2554 for (vector<Item>::const_iterator i = rule.replaceStr.begin(); i != rule.replaceStr.end(); ++i) {
2555 switch (i->type) {
2556 case 0:
2557 ++len;
2558 break;
2559
2560 case kMatchElem_Type_Class:
2561 ++len;
2562 break;
2563
2564 case kMatchElem_Type_Copy:
2565 {
2566 vector<Item>::iterator b = rule.matchStr.begin() + i->index;
2567 if (b->type == kMatchElem_Type_BGroup)
2568 // 2002-10-01 bugfix for "this can't happen (calcMaxLen)" reported by Bob Eaton
2569 // Note that b->after is absolute index, not relative to b
2570 // len += calcMaxLen(b + 1, b + b->after - 2);
2571 // 2002-03-19 bugfix: added multiplication by b->repeatMax, as engine copies entire repeated group
2572 // and corrected ending pointer (-1, not -2)
2573 len += b->repeatMax * calcMaxLen(b + 1, rule.matchStr.begin() + b->after - 1);
2574 // b->next - i->index; // FIXME: this is actually an overestimate of the possible length
2575 else
2576 len += b->repeatMax;
2577 }
2578 continue;
2579
2580 default:
2581 cerr << "bad rep elem type: " << (int)i->type << endl;
2582 break;
2583 }
2584 }
2585 return len;
2586 }
2587
2588 void
associateItems(vector<Rule> & rules,bool fromUni,bool toUni)2589 Compiler::associateItems(vector<Rule>& rules, bool fromUni, bool toUni)
2590 {
2591 /*
2592 handle associations between match and replacement class/copy items:
2593 1. any COPY items on LHS need to be exchanged with the corresponding sequence from RHS
2594 2. any CLASS and COPY items on RHS need their /index/ field set according to the
2595 corresponding LHS item:
2596 a. if /tag/ is non-null, look for matching tag
2597 b. else use position, checking that corresponding LHS item is valid
2598 - note that if LHS item is EGroup, we actually need to point at the corresponding BGroup
2599 */
2600
2601 for (vector<Rule>::iterator i = rules.begin(); i != rules.end(); ++i) {
2602 // Deal with COPY items on LHS
2603 vector<Item>& m = i->matchStr;
2604 vector<Item>& r = i->replaceStr;
2605 for (UInt32 j = 0; j < m.size(); ++j) {
2606 if (m[j].type == kMatchElem_Type_Copy) {
2607 if (m[j].tag.length() == 0) {
2608 Error("COPY item must have association tag", 0, i->lineNumber);
2609 continue;
2610 }
2611 for (UInt32 k = 0; k < r.size(); ++k) {
2612 if (r[k].tag == m[j].tag) {
2613 Item t = m[j];
2614 switch (r[k].type) {
2615 case kMatchElem_Type_EGroup:
2616 {
2617 // find corresponding BGroup, and replace m[j] with the entire group
2618 UInt32 b = k;
2619 int nestingLevel = 0;
2620 while (b != 0) {
2621 --b;
2622 if (r[b].type == kMatchElem_Type_EGroup)
2623 ++nestingLevel;
2624 else if (r[b].type == kMatchElem_Type_BGroup) {
2625 if (nestingLevel > 0)
2626 --nestingLevel;
2627 else
2628 break;
2629 }
2630 }
2631 if (r[b].type == kMatchElem_Type_BGroup && nestingLevel == 0) {
2632 m.erase(m.begin() + j);
2633 m.insert(m.begin() + j, r.begin() + b, r.begin() + k + 1);
2634 r.erase(r.begin() + b, r.begin() + k + 1);
2635 r.insert(r.begin() + b, t);
2636 }
2637 else {
2638 Error("can't find complete tagged group for COPY item", r[k].tag.c_str(), i->lineNumber);
2639 break;
2640 }
2641 }
2642 break;
2643
2644 case 0:
2645 case kMatchElem_Type_Class:
2646 case kMatchElem_Type_ANY:
2647 m[j] = r[k];
2648 r[k] = t;
2649 break;
2650
2651 default:
2652 Error("invalid COPY item in match", r[k].tag.c_str(), i->lineNumber);
2653 break;
2654 }
2655 break;
2656 }
2657 }
2658 }
2659 }
2660
2661 // Set up associations
2662 for (vector<Item>::iterator ir = r.begin(); ir != r.end(); ++ir) {
2663 int matchIndex = ir->tag.length() > 0 ? findTag(ir->tag, m) : ir - r.begin();
2664 if (matchIndex == -1) {
2665 Error("tag not found", ir->tag.c_str(), i->lineNumber);
2666 continue;
2667 }
2668 if (errorCount > 0)
2669 break;
2670 ir->index = matchIndex;
2671 vector<Item>::const_iterator im;
2672 if (ir->index < m.end() - m.begin())
2673 im = m.begin() + ir->index;
2674 else
2675 im = m.end();
2676 switch (ir->type) {
2677 case kMatchElem_Type_Class:
2678 if (im == m.end()) {
2679 Error("class in replacement does not have corresponding match item", 0, i->lineNumber);
2680 break;
2681 }
2682 switch (im->type) {
2683 case kMatchElem_Type_Class:
2684 {
2685 // check that class sizes correspond
2686 Class& mc = fromUni ? currentPass.uniClassMembers[im->val] : currentPass.byteClassMembers[im->val];
2687 Class& rc = toUni ? currentPass.uniClassMembers[ir->val] : currentPass.byteClassMembers[ir->val];
2688 if (mc.size() != rc.size())
2689 Error("class size mismatch", 0, i->lineNumber);
2690 }
2691 break;
2692 default:
2693 Error("type mismatch for replacement class item", 0, i->lineNumber);
2694 break;
2695 }
2696 break;
2697
2698 case kMatchElem_Type_Copy:
2699 // COPY can correspond to anything except COPY on LHS
2700 if (im == m.end()) {
2701 Error("COPY in replacement does not have corresponding match item", 0, i->lineNumber);
2702 break;
2703 }
2704 switch (im->type) {
2705 case kMatchElem_Type_Copy:
2706 Error("can't associate COPY elements", 0, i->lineNumber);
2707 break;
2708 case kMatchElem_Type_EGroup:
2709 // change replacement item to point to the BGroup instead
2710 {
2711 int nestingLevel = 0;
2712 while (1) {
2713 if (im == m.begin()) {
2714 Error("this can't happen (associate)", 0, i->lineNumber);
2715 break;
2716 }
2717 --im;
2718 if (im->type == kMatchElem_Type_EGroup)
2719 ++nestingLevel;
2720 else if (im->type == kMatchElem_Type_BGroup) {
2721 if (nestingLevel > 0)
2722 --nestingLevel;
2723 else {
2724 ir->index = im - m.begin();
2725 break;
2726 }
2727 }
2728 }
2729 }
2730 break;
2731 }
2732 break;
2733
2734 case kMatchElem_Type_ANY:
2735 // ANY on RHS can only correspond with LITERAL or ANY on LHS
2736 if (im == m.end()) {
2737 Error("ANY in replacement does not have corresponding match item", 0, i->lineNumber);
2738 break;
2739 }
2740 switch (im->type) {
2741 case 0:
2742 case kMatchElem_Type_ANY:
2743 break;
2744 default:
2745 Error("invalid ANY element in replacement", 0, i->lineNumber);
2746 break;
2747 }
2748 break;
2749
2750 case 0:
2751 break;
2752 }
2753 }
2754
2755 // filter out stuff on the RHS that doesn't belong
2756 for (UInt32 f = r.size(); f > 0; ) {
2757 --f;
2758 switch (r[f].type) {
2759 case kMatchElem_Type_EOS:
2760 Error("can't use EOS in replacement", 0, i->lineNumber);
2761 break;
2762 case kMatchElem_Type_BGroup:
2763 case kMatchElem_Type_EGroup:
2764 case kMatchElem_Type_OR:
2765 r.erase(r.begin() + f);
2766 break;
2767 }
2768 }
2769
2770 if (errorCount > 0)
2771 break;
2772 }
2773 }
2774
2775 int
findTag(const string & tag,const vector<Item> & str)2776 Compiler::findTag(const string& tag, const vector<Item>& str)
2777 {
2778 for (vector<Item>::const_iterator i = str.begin(); i != str.end(); ++i)
2779 if (i->tag == tag)
2780 return i - str.begin();
2781 return -1;
2782 }
2783
2784 int
ruleKeyComp(const Rule & a,const Rule & b)2785 Compiler::ruleKeyComp(const Rule& a, const Rule& b)
2786 {
2787 if (a.sortKey > b.sortKey) // higher sortKey comes first
2788 return -1;
2789 else if (a.sortKey < b.sortKey)
2790 return 1;
2791 else if (a.lineNumber < b.lineNumber) // lower line number comes first
2792 return -1;
2793 else if (a.lineNumber > b.lineNumber)
2794 return 1;
2795 else
2796 return 0;
2797 }
2798
2799 void
sortRules(vector<Rule> & rules)2800 Compiler::sortRules(vector<Rule>& rules)
2801 {
2802 // calculate sort keys based on match string length and context length
2803 for (vector<Rule>::iterator i = rules.begin(); i != rules.end(); ++i) {
2804 int maxMatch = calcMaxLen(i->matchStr.begin(), i->matchStr.end());
2805 int maxPre = calcMaxLen(i->preContext.begin(), i->preContext.end());
2806 int maxPost = calcMaxLen(i->postContext.begin(), i->postContext.end());
2807 if (maxMatch + maxPre + maxPost > 255)
2808 Error("rule too long", 0, i->lineNumber);
2809 i->sortKey = (maxMatch << 8) + maxPre + maxPost;
2810 if (maxMatch > buildVars.maxMatch)
2811 buildVars.maxMatch = maxMatch;
2812 if (maxPre > buildVars.maxPre)
2813 buildVars.maxPre = maxPre;
2814 if (maxPost > buildVars.maxPost)
2815 buildVars.maxPost = maxPost;
2816
2817 // calculate maxOutput and track highest value
2818 int maxOutput = calcMaxOutLen(*i);
2819 if (maxOutput > 255)
2820 Error("output too long", 0, i->lineNumber);
2821 if (maxOutput > buildVars.maxOutput)
2822 buildVars.maxOutput = maxOutput;
2823 }
2824 #if 0 // __MWERKS__
2825 sort(rules.begin(), rules.end(), ruleKeyComp);
2826 #else
2827 // std library /sort/ crashes in Project Builder version, so do a bubble sort ourselves instead
2828 /*
2829 for (vector<Rule>::iterator a = rules.begin(); a != rules.end(); ++a)
2830 for (vector<Rule>::iterator b = rules.end() - 1; b != a; --b)
2831 if (ruleKeyComp(*(b - 1), *b) > 0)
2832 swap(*b, *(b - 1));
2833 */
2834 // bubble-sorting the actual rules is really expensive; create an index and sort that instead
2835 vector<UInt32> ruleIndex;
2836 for (UInt32 i = 0; i < rules.size(); ++i)
2837 ruleIndex.push_back(i);
2838 for (vector<UInt32>::iterator a = ruleIndex.begin(); a != ruleIndex.end(); ++a)
2839 for (vector<UInt32>::iterator b = ruleIndex.end() - 1; b != a; --b)
2840 if (ruleKeyComp(rules[*(b - 1)], rules[*b]) > 0)
2841 swap(*b, *(b - 1));
2842 vector<Rule> sortedRules;
2843 for (vector<UInt32>::iterator s = ruleIndex.begin(); s != ruleIndex.end(); ++s)
2844 sortedRules.push_back(rules[*s]);
2845 rules = sortedRules;
2846 #endif
2847 }
2848
2849 bool
findInitialItems(const Rule & rule,vector<Item>::const_iterator b,vector<Item>::const_iterator e,vector<Item> & initialItems)2850 Compiler::findInitialItems(const Rule& rule, vector<Item>::const_iterator b, vector<Item>::const_iterator e, vector<Item>& initialItems)
2851 {
2852 // return true if we find a non-optional item, false if we could match null string
2853 while (b != e) {
2854 switch (b->type) {
2855 case 0:
2856 case kMatchElem_Type_Class:
2857 case kMatchElem_Type_ANY:
2858 case kMatchElem_Type_EOS:
2859 initialItems.push_back(*b);
2860 if (b->repeatMin > 0)
2861 return true;
2862 ++b;
2863 break;
2864
2865 case kMatchElem_Type_BGroup:
2866 {
2867 vector<Item>::const_iterator groupStart = b;
2868 vector<Item>::const_iterator altStart = b + 1;
2869 int nestingLevel = 0;
2870 bool optional = false;
2871 while (++b != e) {
2872 switch (b->type) {
2873 case kMatchElem_Type_BGroup:
2874 ++nestingLevel;
2875 break;
2876 case kMatchElem_Type_OR:
2877 if (nestingLevel == 0) {
2878 if (!findInitialItems(rule, altStart, b, initialItems))
2879 optional = true;
2880 altStart = b + 1;
2881 }
2882 break;
2883 case kMatchElem_Type_EGroup:
2884 if (nestingLevel == 0) {
2885 if (!findInitialItems(rule, altStart, b, initialItems))
2886 optional = true;
2887 }
2888 --nestingLevel;
2889 break;
2890 }
2891 if (nestingLevel < 0)
2892 break;
2893 }
2894 if (!optional && groupStart->repeatMin > 0)
2895 return true;
2896 ++b;
2897 }
2898 break;
2899
2900 case kMatchElem_Type_Copy:
2901 Error("can't use copy item (@tag) on match side of rule", 0, rule.lineNumber);
2902 ++b;
2903 break;
2904
2905 default:
2906 Error("this can't happen (findInitialItems)", 0, rule.lineNumber);
2907 ++b;
2908 break;
2909 }
2910 }
2911 return false;
2912 }
2913
2914 void
findInitialItems(const Rule & rule,vector<Item> & initialItems)2915 Compiler::findInitialItems(const Rule& rule, vector<Item>& initialItems)
2916 {
2917 bool foundNonOpt = false;
2918 if (rule.matchStr.size() > 0)
2919 foundNonOpt = findInitialItems(rule, rule.matchStr.begin(), rule.matchStr.end(), initialItems);
2920 if (!foundNonOpt && rule.postContext.size() > 0)
2921 foundNonOpt = findInitialItems(rule, rule.postContext.begin(), rule.postContext.end(), initialItems);
2922 if (!foundNonOpt)
2923 Error("rule must have non-null match string or post-context", 0, rule.lineNumber);
2924 }
2925
2926 long
classIndex(UInt32 charCode,const Class & classMembers)2927 Compiler::classIndex(UInt32 charCode, const Class& classMembers)
2928 {
2929 for (Class::const_iterator i = classMembers.begin(); i != classMembers.end(); ++i)
2930 if (*i == charCode)
2931 return i - classMembers.begin();
2932 return -1;
2933 }
2934
2935 bool
isSingleCharRule(const Rule & rule)2936 Compiler::isSingleCharRule(const Rule& rule)
2937 {
2938 if (rule.preContext.size() == 0 && rule.postContext.size() == 0 && rule.matchStr.size() == 1) {
2939 const Item& item = rule.matchStr.front();
2940 if (item.repeatMin == 1 && item.repeatMax == 1)
2941 if (item.type == 0 || item.type == kMatchElem_Type_Class || item.type == kMatchElem_Type_ANY)
2942 return true;
2943 }
2944 return false;
2945 }
2946
2947 void
appendMatchElem(string & packedRule,Item & item,int index,vector<MatClass> & matchClasses)2948 Compiler::appendMatchElem(string& packedRule, Item& item, int index,
2949 vector<MatClass>& matchClasses)
2950 {
2951 MatchElem m;
2952 WRITE(m.value.usv.data, 0);
2953 WRITE(m.flags.repeat, (item.repeatMin << 4) + item.repeatMax);
2954 if (item.negate)
2955 WRITE(m.flags.type, kMatchElem_Negate);
2956 else
2957 WRITE(m.flags.type, 0);
2958 switch (item.type) {
2959 case 0:
2960 WRITE(m.value.usv.data, READ(m.value.usv.data) | item.val);
2961 break;
2962
2963 case kMatchElem_Type_Class:
2964 {
2965 WRITE(m.flags.type, READ(m.flags.type) | (kMatchElem_NonLit + kMatchElem_Type_Class));
2966 UInt32 i;
2967 for (i = 0; i < matchClasses.size(); ++i)
2968 if (matchClasses[i].membersClass == item.val)
2969 break;
2970 if (i == matchClasses.size())
2971 matchClasses.push_back(MatClass(item.val));
2972 WRITE(m.value.cls.index, i);
2973 }
2974 break;
2975
2976 case kMatchElem_Type_BGroup:
2977 WRITE(m.flags.type, READ(m.flags.type) | (kMatchElem_NonLit + kMatchElem_Type_BGroup));
2978 WRITE(m.value.bgroup.dNext, item.next - index);
2979 WRITE(m.value.bgroup.dAfter, item.after - index);
2980 break;
2981
2982 case kMatchElem_Type_EGroup:
2983 WRITE(m.flags.type, READ(m.flags.type) | (kMatchElem_NonLit + kMatchElem_Type_EGroup));
2984 WRITE(m.value.egroup.dStart, index - item.start);
2985 break;
2986
2987 case kMatchElem_Type_OR:
2988 WRITE(m.flags.type, READ(m.flags.type) | (kMatchElem_NonLit + kMatchElem_Type_OR));
2989 WRITE(m.value.egroup.dNext, item.next - index);
2990 WRITE(m.value.egroup.dStart, index - item.start);
2991 break;
2992
2993 case kMatchElem_Type_ANY:
2994 WRITE(m.flags.type, READ(m.flags.type) | (kMatchElem_NonLit + kMatchElem_Type_ANY));
2995 break;
2996
2997 case kMatchElem_Type_EOS:
2998 WRITE(m.flags.type, READ(m.flags.type) | (kMatchElem_NonLit + kMatchElem_Type_EOS));
2999 break;
3000 }
3001 const char* p = (const char*)&m;
3002 packedRule.append(p, sizeof(m));
3003 }
3004
3005 void
appendReplaceElem(string & packedRule,Item & item,vector<Item> & matchStr,vector<RepClass> & repClasses)3006 Compiler::appendReplaceElem(string& packedRule, Item& item, vector<Item>& matchStr, vector<RepClass>& repClasses)
3007 {
3008 RepElem r;
3009 WRITE(r.value, 0);
3010 switch (item.type) {
3011 case 0:
3012 WRITE(r.value, item.val);
3013 break;
3014
3015 case kRepElem_Class:
3016 {
3017 WRITE(r.flags.type, kRepElem_Class);
3018 WRITE(r.flags.matchIndex, item.index);
3019 Item& mItem = matchStr[item.index];
3020 if (mItem.type != kMatchElem_Type_Class) {
3021 cerr << "this can't happen (appendReplaceElem)\n";
3022 exit(1);
3023 }
3024 UInt32 i;
3025 for (i = 0; i < repClasses.size(); ++i)
3026 if (repClasses[i].membersClass == item.val && repClasses[i].sortLikeClass == mItem.val)
3027 break;
3028 if (i == repClasses.size())
3029 repClasses.push_back(RepClass(item.val, mItem.val));
3030 WRITE(r.flags.repClass, i);
3031 }
3032 break;
3033
3034 case kRepElem_Copy:
3035 WRITE(r.flags.type, kRepElem_Copy);
3036 WRITE(r.flags.matchIndex, item.index);
3037 break;
3038
3039 case kRepElem_Unmapped:
3040 WRITE(r.flags.type, kRepElem_Unmapped);
3041 break;
3042 }
3043 const char* p = (const char*)&r;
3044 packedRule.append(p, sizeof(r));
3045 }
3046
3047 vector<Compiler::Item>
reverseContext(const vector<Item> & ctx)3048 Compiler::reverseContext(const vector<Item>& ctx)
3049 {
3050 vector<Item> rval;
3051 for (vector<Item>::const_iterator i = ctx.begin(); i != ctx.end(); ++i) {
3052 rval.insert(rval.begin(), *i);
3053 switch (i->type) {
3054 case kMatchElem_Type_BGroup:
3055 rval.front().type = kMatchElem_Type_EGroup;
3056 break;
3057
3058 case kMatchElem_Type_EGroup:
3059 rval.front().type = kMatchElem_Type_BGroup;
3060 break;
3061 }
3062 }
3063 return rval;
3064 }
3065
3066 void
addToCharMap(UInt32 ch,UInt16 index)3067 Compiler::addToCharMap(UInt32 ch, UInt16 index)
3068 {
3069 UInt8 plane = ch >> 16;
3070 UInt8 page = (ch & 0x00ffff) >> 8;
3071 if (buildVars.planeMap.size() <= plane)
3072 buildVars.planeMap.resize(plane + 1, 0xff);
3073 if ((UInt8)buildVars.planeMap[plane] == (UInt8)0xff) {
3074 buildVars.planeMap[plane] = buildVars.pageMaps.size();
3075 buildVars.pageMaps.resize(buildVars.pageMaps.size() + 1);
3076 buildVars.pageMaps.back().resize(256, 0xff);
3077 }
3078 UInt8 planeIndex = buildVars.planeMap[plane];
3079 string& pageMap = buildVars.pageMaps[planeIndex];
3080 if ((UInt8)pageMap[page] == (UInt8)0xff) {
3081 pageMap[page] = buildVars.charMaps.size();
3082 buildVars.charMaps.resize(buildVars.charMaps.size() + 1);
3083 buildVars.charMaps.back().resize(256);
3084 }
3085 vector<UInt16>& charMap = buildVars.charMaps[(UInt8)pageMap[page]];
3086 charMap[ch & 0x0000ff] = index;
3087 }
3088
3089 void
align(string & table,int alignment)3090 Compiler::align(string& table, int alignment)
3091 {
3092 int remainder = table.size() % alignment;
3093 if (remainder != 0)
3094 table.resize(table.size() + alignment - remainder);
3095 }
3096
3097 class Member {
3098 public:
Member(UInt32 v,UInt32 k)3099 Member(UInt32 v, UInt32 k)
3100 : value(v), key(k)
3101 { }
3102 UInt32 value;
3103 UInt32 key;
operator <(const Member & rhs) const3104 bool operator<(const Member& rhs) const
3105 { return key < rhs.key; }
3106 };
3107
3108 void
buildTable(vector<Rule> & rules,bool fromUni,bool toUni,string & table)3109 Compiler::buildTable(vector<Rule>& rules, bool fromUni, bool toUni, string& table)
3110 {
3111 TableHeader th;
3112 WRITE(th.type, fromUni ? (toUni ? kTableType_UU : kTableType_UB) : (toUni ? kTableType_BU : kTableType_BB));
3113 WRITE(th.version, kCurrentTableVersion);
3114 WRITE(th.length, 0);
3115 WRITE(th.flags, 0);
3116 WRITE(th.pageBase, 0);
3117 WRITE(th.lookupBase, 0);
3118 WRITE(th.matchClassBase, 0);
3119 WRITE(th.repClassBase, 0);
3120 WRITE(th.stringListBase, 0);
3121 WRITE(th.stringRuleData, 0);
3122 WRITE(th.maxMatch, buildVars.maxMatch);
3123 WRITE(th.maxPre, buildVars.maxPre);
3124 WRITE(th.maxPost, buildVars.maxPost);
3125 WRITE(th.maxOutput, buildVars.maxOutput);
3126 WRITE(th.replacementChar, toUni ? currentPass.uniDefault : currentPass.byteDefault);
3127
3128 map<UInt32,UInt16> charToIndex;
3129 map<UInt16,UInt32> indexToChar;
3130 vector< vector<UInt32> > rulesForIndex;
3131 vector<UInt32> stringRuleLists;
3132 string stringRuleData;
3133 vector<MatClass> matchClasses;
3134 vector<RepClass> repClasses;
3135 #if 0
3136 UInt32 terminatorRule = 0;
3137 bool terminatorCreated = false;
3138 #endif
3139
3140 if (fromUni) {
3141 rulesForIndex.resize(1); // index 0 will be unmapped char
3142 for (UInt32 i = 0; i != rules.size(); ++i) {
3143 vector<Item> initialItems;
3144 findInitialItems(rules[i], initialItems);
3145 for (vector<Item>::iterator j = initialItems.begin(); j != initialItems.end(); ++j) {
3146 UInt32 index;
3147 switch (j->type) {
3148 case 0: // literal
3149 if (j->negate) {
3150 Error("can't start with negated Unicode literal", 0, rules[i].lineNumber);
3151 break;
3152 }
3153 if (charToIndex.find(j->val) == charToIndex.end()) {
3154 // add char to charToIndex mapping
3155 charToIndex[j->val] = rulesForIndex.size();
3156 indexToChar[rulesForIndex.size()] = j->val;
3157 rulesForIndex.resize(rulesForIndex.size() + 1);
3158 addToCharMap(j->val, charToIndex[j->val]);
3159 }
3160 index = charToIndex[j->val];
3161 if (rulesForIndex[index].size() == 0 || rulesForIndex[index].back() != i) {
3162 rulesForIndex[index].push_back(i);
3163 if (rulesForIndex[index].size() == 0x3ff) {
3164 Error("too many matches with same initial character", 0, rules[i].lineNumber);
3165 }
3166 }
3167 break;
3168
3169 case kMatchElem_Type_Class:
3170 {
3171 if (j->negate) {
3172 Error("can't start with negated Unicode class", 0, rules[i].lineNumber);
3173 break;
3174 }
3175 Class& uc = currentPass.uniClassMembers[j->val];
3176 Class::const_iterator u;
3177 for (u = uc.begin(); u != uc.end(); ++u) {
3178 if (charToIndex.find(*u) == charToIndex.end()) {
3179 charToIndex[*u] = rulesForIndex.size();
3180 indexToChar[rulesForIndex.size()] = *u;
3181 rulesForIndex.resize(rulesForIndex.size() + 1);
3182 addToCharMap(*u, charToIndex[*u]);
3183 }
3184 index = charToIndex[*u];
3185 if (rulesForIndex[index].size() == 0 || rulesForIndex[index].back() != i)
3186 rulesForIndex[index].push_back(i);
3187 }
3188 }
3189 break;
3190
3191 case kMatchElem_Type_ANY:
3192 case kMatchElem_Type_EOS:
3193 Error("can't start with ANY or EOS in Unicode match", 0, rules[i].lineNumber);
3194 break;
3195
3196 default:
3197 Error("this can't happen (buildTable 1)");
3198 break;
3199 }
3200 }
3201 }
3202 }
3203 else {
3204 rulesForIndex.resize(256);
3205 for (UInt32 i = 0; i != rules.size(); ++i) {
3206 vector<Item> initialItems;
3207 findInitialItems(rules[i], initialItems);
3208 for (vector<Item>::iterator j = initialItems.begin(); j != initialItems.end(); ++j) {
3209 unsigned int c;
3210 switch (j->type) {
3211 case 0: // literal
3212 if (j->negate) {
3213 // add rule to every char except the literal!
3214 for (c = 0; c < 256; ++c)
3215 if (c != j->val)
3216 if (rulesForIndex[c].size() == 0 || rulesForIndex[c].back() != i)
3217 rulesForIndex[c].push_back(i);
3218 }
3219 else
3220 if (rulesForIndex[j->val].size() == 0 || rulesForIndex[j->val].back() != i)
3221 rulesForIndex[j->val].push_back(i);
3222 break;
3223
3224 case kMatchElem_Type_Class:
3225 {
3226 Class& bc = currentPass.byteClassMembers[j->val];
3227 for (c = 0; c < 256; ++c)
3228 if ((classIndex(c, bc) != -1) != j->negate)
3229 if (rulesForIndex[c].size() == 0 || rulesForIndex[c].back() != i)
3230 rulesForIndex[c].push_back(i);
3231 }
3232 break;
3233
3234 case kMatchElem_Type_ANY:
3235 case kMatchElem_Type_EOS:
3236 if (j->negate) {
3237 Error("rule can't start with negated ANY or EOS", 0, rules[i].lineNumber);
3238 break;
3239 }
3240 for (c = 0; c < 256; ++c)
3241 if (rulesForIndex[c].size() == 0 || rulesForIndex[c].back() != i)
3242 rulesForIndex[c].push_back(i);
3243 break;
3244
3245 default:
3246 Error("this can't happen (buildTable 2)");
3247 break;
3248 }
3249 }
3250 }
3251 }
3252
3253 vector<Lookup> lookup;
3254 lookup.resize(rulesForIndex.size());
3255
3256 UInt32 i;
3257 for (i = 0; errorCount == 0 && i < rulesForIndex.size(); ++i) {
3258 WRITE(lookup[i].usv, 0); // initialize the lookup to all zero bits
3259
3260 if (rulesForIndex[i].size() == 0) {
3261 WRITE(lookup[i].rules.type, kLookupType_Unmapped);
3262 continue;
3263 }
3264
3265 if (rulesForIndex[i].size() == 1) {
3266 const Rule& rule = rules[rulesForIndex[i].front()];
3267 if (isSingleCharRule(rule)) {
3268 if (toUni) {
3269 // mapping to Unicode: direct lookup can only support one Unicode character
3270 if (rule.replaceStr.size() == 1) {
3271 const Item& rep = rule.replaceStr.front();
3272 int t = rep.tag.length() > 0 ? findTag(rep.tag, rule.matchStr) : 0;
3273 switch (rep.type) {
3274 case 0:
3275 WRITE(lookup[i].usv, rep.val);
3276 continue;
3277
3278 case kMatchElem_Type_Class:
3279 {
3280 if (t == -1) {
3281 Error("tag not found", rep.tag.c_str(), rule.lineNumber);
3282 continue;
3283 }
3284 const Item& mat = rule.matchStr[t];
3285 if (mat.type != kMatchElem_Type_Class) {
3286 Error("improper use of class as target of mapping", 0, rule.lineNumber);
3287 continue;
3288 }
3289 Class& rc = currentPass.uniClassMembers[rep.val];
3290 if (fromUni) {
3291 Class& mc = currentPass.uniClassMembers[mat.val];
3292 if (mc.size() != rc.size()) {
3293 Error("class size mismatch", 0, rule.lineNumber);
3294 continue;
3295 }
3296 WRITE(lookup[i].usv, rc[classIndex(indexToChar[i], mc)]);
3297 }
3298 else {
3299 Class& mc = currentPass.byteClassMembers[mat.val];
3300 if (mc.size() != rc.size()) {
3301 Error("class size mismatch", 0, rule.lineNumber);
3302 continue;
3303 }
3304 WRITE(lookup[i].usv, rc[classIndex(i, mc)]);
3305 }
3306 }
3307 continue;
3308
3309 case kMatchElem_Type_Copy:
3310 // should only occur in UU table
3311 if (t > (int)rule.matchStr.size()) {
3312 Error("no corresponding item for copy", 0, rule.lineNumber);
3313 goto ERR_FOUND;
3314 }
3315 WRITE(lookup[i].usv, indexToChar[i]);
3316 continue;
3317
3318 default:
3319 goto STRING_RULE_NEEDED;
3320 }
3321 }
3322 }
3323 else {
3324 // mapping to bytes: we can put up to three bytes into the direct lookup
3325 if (rule.replaceStr.size() <= 3) {
3326 WRITE(lookup[i].bytes.count, rule.replaceStr.size());
3327 // this will get overwritten by lookup[i].rules.type if string rules turn out to be needed
3328 UInt32 j;
3329 for (j = 0; j < rule.replaceStr.size(); ++j) {
3330 const Item& rep = rule.replaceStr[j];
3331 int t = rep.tag.length() > 0 ? findTag(rep.tag, rule.matchStr) : j;
3332 if (t == -1) {
3333 Error("tag not found", rep.tag.c_str(), rule.lineNumber);
3334 goto ERR_FOUND;
3335 }
3336 switch (rep.type) {
3337 case 0: // literal
3338 WRITE(lookup[i].bytes.data[j], rep.val);
3339 break;
3340
3341 case kMatchElem_Type_Class:
3342 if (t > (int)rule.matchStr.size()) {
3343 Error("no corresponding item for class replacement", 0, rule.lineNumber);
3344 goto ERR_FOUND;
3345 }
3346 else {
3347 const Item& mat = rule.matchStr[t];
3348 if (mat.type != kMatchElem_Type_Class) {
3349 Error("improper use of class as target of mapping", 0, rule.lineNumber);
3350 goto ERR_FOUND;
3351 }
3352 Class& rc = currentPass.byteClassMembers[rep.val];
3353 if (fromUni) {
3354 Class& mc = currentPass.uniClassMembers[mat.val];
3355 if (mc.size() != rc.size()) {
3356 Error("class size mismatch", 0, rule.lineNumber);
3357 goto ERR_FOUND;
3358 }
3359 WRITE(lookup[i].bytes.data[j], rc[classIndex(indexToChar[i], mc)]);
3360 }
3361 else {
3362 Class& mc = currentPass.byteClassMembers[mat.val];
3363 if (mc.size() != rc.size()) {
3364 Error("class size mismatch", 0, rule.lineNumber);
3365 goto ERR_FOUND;
3366 }
3367 WRITE(lookup[i].bytes.data[j], rc[classIndex(i, mc)]);
3368 }
3369 }
3370 break;
3371
3372 case kMatchElem_Type_Copy:
3373 // should only occur in BB table
3374 if (t > (int)rule.matchStr.size()) {
3375 Error("no corresponding item for copy", 0, rule.lineNumber);
3376 goto ERR_FOUND;
3377 }
3378 WRITE(lookup[i].bytes.data[j], i);
3379 break;
3380
3381 default:
3382 goto STRING_RULE_NEEDED;
3383 }
3384 }
3385 ERR_FOUND:
3386 continue;
3387 }
3388 }
3389 }
3390 }
3391
3392 STRING_RULE_NEEDED:
3393 #if 0
3394 // decide whether we need to add a default terminating rule
3395 const Rule& finalRule = rules[rulesForIndex[i].back()];
3396 if (!isSingleCharRule(finalRule)) {
3397 if (!terminatorCreated) {
3398 currentRule.clear();
3399 Item item;
3400 item.type = kMatchElem_Type_ANY;
3401 item.negate = 0;
3402 item.repeatMin = 1;
3403 item.repeatMax = 1;
3404 item.val = 0;
3405 currentRule.lhsString.push_back(item);
3406 item.type = kRepElem_Unmapped;
3407 currentRule.rhsString.push_back(item);
3408 terminatorRule = rules.size();
3409 rules.push_back(Rule(currentRule.lhsString,
3410 currentRule.lhsPreContext, currentRule.lhsPostContext,
3411 currentRule.rhsString, finalRule.lineNumber));
3412 terminatorCreated = true;
3413 currentRule.clear();
3414 }
3415 rulesForIndex[i].push_back(terminatorRule);
3416 }
3417 #endif
3418 // set the Lookup fields
3419 if (rulesForIndex[i].size() > 255) {
3420 WRITE(lookup[i].rules.type, kLookupType_ExtStringRules + (rulesForIndex[i].size() >> 8));
3421 WRITE(lookup[i].rules.ruleCount, rulesForIndex[i].size() & 0xff);
3422 usedExtStringRules = true;
3423 }
3424 else {
3425 WRITE(lookup[i].rules.type, kLookupType_StringRules);
3426 WRITE(lookup[i].rules.ruleCount, rulesForIndex[i].size());
3427 }
3428 WRITE(lookup[i].rules.ruleIndex, stringRuleLists.size());
3429
3430 // construct the rule list
3431 if (errorCount == 0) {
3432 for (unsigned int j = 0; j < rulesForIndex[i].size(); ++j) {
3433 Rule& r = rules[rulesForIndex[i][j]];
3434 if (r.offset == kInvalidRuleOffset) {
3435 r.offset = stringRuleData.size();
3436 stringRuleData.append(1, r.matchStr.size());
3437 stringRuleData.append(1, r.postContext.size());
3438 stringRuleData.append(1, r.preContext.size());
3439 stringRuleData.append(1, r.replaceStr.size());
3440 unsigned int k;
3441 for (k = 0; k < r.matchStr.size(); ++k)
3442 appendMatchElem(stringRuleData, r.matchStr[k], k, matchClasses);
3443 for (k = 0; k < r.postContext.size(); ++k)
3444 appendMatchElem(stringRuleData, r.postContext[k], k, matchClasses);
3445 for (k = 0; k < r.preContext.size(); ++k)
3446 appendMatchElem(stringRuleData, r.preContext[k], k, matchClasses);
3447 for (k = 0; k < r.replaceStr.size(); ++k)
3448 appendReplaceElem(stringRuleData, r.replaceStr[k], r.matchStr, repClasses);
3449 }
3450 stringRuleLists.push_back(r.offset);
3451 }
3452 }
3453 }
3454
3455 if (fromUni && buildVars.planeMap.size() > 1)
3456 currentPass.supplementaryChars = true;
3457
3458 UInt32 headerOffset = table.size();
3459 table.append((const char*)&th, sizeof(th));
3460
3461 if (fromUni) {
3462 WRITE(th.pageBase, table.size());
3463 UInt32 i, j;
3464 if (currentPass.supplementaryChars) {
3465 buildVars.planeMap.resize(17, 0xff);
3466 for (i = 0; i < buildVars.planeMap.size(); ++i)
3467 appendToTable(table, buildVars.planeMap[i]);
3468 appendToTable(table, (UInt8)buildVars.pageMaps.size());
3469 align(table, 4);
3470 }
3471
3472 for (i = 0; i < buildVars.pageMaps.size(); ++i)
3473 for (j = 0; j < buildVars.pageMaps[i].size(); ++j)
3474 appendToTable(table, (UInt8)buildVars.pageMaps[i][j]);
3475 align(table, 4);
3476
3477 for (i = 0; i < buildVars.charMaps.size(); ++i)
3478 for (j = 0; j < buildVars.charMaps[i].size(); ++j)
3479 appendToTable(table, buildVars.charMaps[i][j]);
3480 align(table, 4);
3481 }
3482
3483 WRITE(th.lookupBase, table.size());
3484 for (i = 0; i < lookup.size(); ++i)
3485 appendToTable(table, READ(lookup[i].usv));
3486 align(table, 4);
3487
3488 WRITE(th.stringListBase, table.size());
3489 for (i = 0; i < stringRuleLists.size(); ++i)
3490 appendToTable(table, stringRuleLists[i]);
3491 align(table, 4);
3492
3493 WRITE(th.stringRuleData, table.size());
3494 for (i = 0; i < stringRuleData.size(); ++i)
3495 appendToTable(table, stringRuleData[i]);
3496 align(table, 4);
3497
3498 // sort and output the match classes
3499 {
3500 WRITE(th.matchClassBase, table.size() - headerOffset);
3501 vector<UInt32> classOffsets;
3502 classOffsets.resize(matchClasses.size());
3503 UInt32 classOffset = matchClasses.size() * sizeof(UInt32);
3504 string classes;
3505
3506 for (i = 0; i < matchClasses.size(); ++i) {
3507 classOffsets[i] = classOffset + classes.size();
3508 Class sortedClass = fromUni
3509 ? currentPass.uniClassMembers[matchClasses[i].membersClass]
3510 : currentPass.byteClassMembers[matchClasses[i].membersClass];
3511
3512 if (sortedClass.size() > 0) {
3513 sort(sortedClass.begin(), sortedClass.end());
3514 for (UInt32 j = sortedClass.size() - 1; j > 0; --j)
3515 if (sortedClass[j] == sortedClass[j - 1])
3516 sortedClass.erase(sortedClass.begin() + j);
3517 }
3518
3519 appendToTable(classes, (UInt32)sortedClass.size());
3520 if (fromUni)
3521 if (currentPass.supplementaryChars)
3522 for (Class::iterator x = sortedClass.begin(); x != sortedClass.end(); ++x)
3523 appendToTable(classes, *x);
3524 else
3525 for (Class::iterator x = sortedClass.begin(); x != sortedClass.end(); ++x)
3526 appendToTable(classes, (UInt16)*x);
3527 else
3528 for (Class::iterator x = sortedClass.begin(); x != sortedClass.end(); ++x)
3529 appendToTable(classes, (UInt8)*x);
3530 align(classes, 4);
3531 }
3532 // copy the real classOffsets into the table
3533 for (i = 0; i < classOffsets.size(); ++i)
3534 appendToTable(table, classOffsets[i]);
3535
3536 // now append the actual classes
3537 table.insert(table.end(), classes.begin(), classes.end());
3538 }
3539
3540 // sort and output the replacement classes
3541 {
3542 WRITE(th.repClassBase, table.size());
3543 vector<UInt32> classOffsets;
3544 classOffsets.resize(repClasses.size());
3545 UInt32 classOffset = repClasses.size() * sizeof(UInt32);
3546 string classes;
3547
3548 for (i = 0; i < repClasses.size(); ++i) {
3549 classOffsets[i] = classOffset + classes.size();
3550 vector<Member> sortedClass;
3551 const Class& values = toUni
3552 ? currentPass.uniClassMembers[repClasses[i].membersClass]
3553 : currentPass.byteClassMembers[repClasses[i].membersClass];
3554 const Class& keys = fromUni
3555 ? currentPass.uniClassMembers[repClasses[i].sortLikeClass]
3556 : currentPass.byteClassMembers[repClasses[i].sortLikeClass];
3557 for (UInt32 j = 0; j < values.size(); ++j)
3558 sortedClass.push_back(Member(values[j], keys[j]));
3559
3560 if (sortedClass.size() > 0) {
3561 sort(sortedClass.begin(), sortedClass.end());
3562 for (UInt32 j = sortedClass.size() - 1; j > 0; --j)
3563 if (sortedClass[j].key == sortedClass[j - 1].key)
3564 sortedClass.erase(sortedClass.begin() + j);
3565 }
3566
3567 appendToTable(classes, (UInt32)sortedClass.size());
3568 if (toUni)
3569 if (currentPass.supplementaryChars)
3570 for (vector<Member>::iterator x = sortedClass.begin(); x != sortedClass.end(); ++x)
3571 appendToTable(classes, x->value);
3572 else
3573 for (vector<Member>::iterator x = sortedClass.begin(); x != sortedClass.end(); ++x)
3574 appendToTable(classes, (UInt16)x->value);
3575 else
3576 for (vector<Member>::iterator x = sortedClass.begin(); x != sortedClass.end(); ++x)
3577 appendToTable(classes, (UInt8)(x->value));
3578 align(classes, 4);
3579 }
3580 // copy the real classOffsets into the table
3581 for (i = 0; i < classOffsets.size(); ++i)
3582 appendToTable(table, classOffsets[i]);
3583
3584 // now append the actual classes
3585 table.insert(table.end(), classes.begin(), classes.end());
3586 }
3587
3588 // stuff the real header values into the beginning of the table
3589 if (currentPass.supplementaryChars)
3590 WRITE(th.flags, READ(th.flags) | kTableFlags_Supplementary);
3591 WRITE(th.length, table.size());
3592 table.replace(0, sizeof(th), (const char*)&th, sizeof(th));
3593 }
3594
3595 void
clear()3596 Compiler::Pass::clear()
3597 {
3598 fwdRules.clear();
3599 revRules.clear();
3600 xmlRules.clear();
3601 xmlContexts.clear();
3602
3603 byteClassNames.clear();
3604 uniClassNames.clear();
3605
3606 byteClassMembers.clear();
3607 uniClassMembers.clear();
3608
3609 uniDefault = 0xfffd; // REPLACEMENT CHARACTER
3610 byteDefault = '?';
3611 passType = 0;
3612 supplementaryChars = false;
3613 startingLine = 0;
3614 }
3615
3616 void
setLineNo(UInt32 lineNo)3617 Compiler::Pass::setLineNo(UInt32 lineNo)
3618 {
3619 if (startingLine == 0)
3620 startingLine = lineNo;
3621 }
3622
3623 void
clear()3624 Compiler::BuildVars::clear()
3625 {
3626 planeMap.erase(planeMap.begin(), planeMap.end());
3627 pageMaps.clear();
3628 charMaps.clear();
3629 maxMatch = 1;
3630 maxPre = 0;
3631 maxPost = 0;
3632 maxOutput = 0;
3633 }
3634
3635 void
clear()3636 Compiler::CurrRule::clear()
3637 {
3638 lhsString.clear();
3639 lhsPreContext.clear();
3640 lhsPostContext.clear();
3641 rhsString.clear();
3642 rhsPreContext.clear();
3643 rhsPostContext.clear();
3644 startingLine = 0;
3645 }
3646
3647 void
setLineNo(UInt32 lineNo)3648 Compiler::CurrRule::setLineNo(UInt32 lineNo)
3649 {
3650 if (startingLine == 0)
3651 startingLine = lineNo;
3652 }
3653