1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 * Copyright (C) 2013-2015, International Business Machines
6 * Corporation and others.  All Rights Reserved.
7 *******************************************************************************
8 * collationruleparser.cpp
9 *
10 * (replaced the former ucol_tok.cpp)
11 *
12 * created on: 2013apr10
13 * created by: Markus W. Scherer
14 */
15 
16 #include "unicode/utypes.h"
17 
18 #if !UCONFIG_NO_COLLATION
19 
20 #include "unicode/normalizer2.h"
21 #include "unicode/parseerr.h"
22 #include "unicode/uchar.h"
23 #include "unicode/ucol.h"
24 #include "unicode/uloc.h"
25 #include "unicode/unistr.h"
26 #include "unicode/utf16.h"
27 #include "charstr.h"
28 #include "cmemory.h"
29 #include "collation.h"
30 #include "collationdata.h"
31 #include "collationruleparser.h"
32 #include "collationsettings.h"
33 #include "collationtailoring.h"
34 #include "cstring.h"
35 #include "patternprops.h"
36 #include "uassert.h"
37 #include "uvectr32.h"
38 
39 U_NAMESPACE_BEGIN
40 
41 namespace {
42 
43 static const UChar BEFORE[] = { 0x5b, 0x62, 0x65, 0x66, 0x6f, 0x72, 0x65, 0 };  // "[before"
44 const int32_t BEFORE_LENGTH = 7;
45 
46 }  // namespace
47 
~Sink()48 CollationRuleParser::Sink::~Sink() {}
49 
50 void
suppressContractions(const UnicodeSet &,const char * &,UErrorCode &)51 CollationRuleParser::Sink::suppressContractions(const UnicodeSet &, const char *&, UErrorCode &) {}
52 
53 void
optimize(const UnicodeSet &,const char * &,UErrorCode &)54 CollationRuleParser::Sink::optimize(const UnicodeSet &, const char *&, UErrorCode &) {}
55 
~Importer()56 CollationRuleParser::Importer::~Importer() {}
57 
CollationRuleParser(const CollationData * base,UErrorCode & errorCode)58 CollationRuleParser::CollationRuleParser(const CollationData *base, UErrorCode &errorCode)
59         : nfd(*Normalizer2::getNFDInstance(errorCode)),
60           nfc(*Normalizer2::getNFCInstance(errorCode)),
61           rules(NULL), baseData(base), settings(NULL),
62           parseError(NULL), errorReason(NULL),
63           sink(NULL), importer(NULL),
64           ruleIndex(0) {
65 }
66 
~CollationRuleParser()67 CollationRuleParser::~CollationRuleParser() {
68 }
69 
70 void
parse(const UnicodeString & ruleString,CollationSettings & outSettings,UParseError * outParseError,UErrorCode & errorCode)71 CollationRuleParser::parse(const UnicodeString &ruleString,
72                            CollationSettings &outSettings,
73                            UParseError *outParseError,
74                            UErrorCode &errorCode) {
75     if(U_FAILURE(errorCode)) { return; }
76     settings = &outSettings;
77     parseError = outParseError;
78     if(parseError != NULL) {
79         parseError->line = 0;
80         parseError->offset = -1;
81         parseError->preContext[0] = 0;
82         parseError->postContext[0] = 0;
83     }
84     errorReason = NULL;
85     parse(ruleString, errorCode);
86 }
87 
88 void
parse(const UnicodeString & ruleString,UErrorCode & errorCode)89 CollationRuleParser::parse(const UnicodeString &ruleString, UErrorCode &errorCode) {
90     if(U_FAILURE(errorCode)) { return; }
91     rules = &ruleString;
92     ruleIndex = 0;
93 
94     while(ruleIndex < rules->length()) {
95         UChar c = rules->charAt(ruleIndex);
96         if(PatternProps::isWhiteSpace(c)) {
97             ++ruleIndex;
98             continue;
99         }
100         switch(c) {
101         case 0x26:  // '&'
102             parseRuleChain(errorCode);
103             break;
104         case 0x5b:  // '['
105             parseSetting(errorCode);
106             break;
107         case 0x23:  // '#' starts a comment, until the end of the line
108             ruleIndex = skipComment(ruleIndex + 1);
109             break;
110         case 0x40:  // '@' is equivalent to [backwards 2]
111             settings->setFlag(CollationSettings::BACKWARD_SECONDARY,
112                               UCOL_ON, 0, errorCode);
113             ++ruleIndex;
114             break;
115         case 0x21:  // '!' used to turn on Thai/Lao character reversal
116             // Accept but ignore. The root collator has contractions
117             // that are equivalent to the character reversal, where appropriate.
118             ++ruleIndex;
119             break;
120         default:
121             setParseError("expected a reset or setting or comment", errorCode);
122             break;
123         }
124         if(U_FAILURE(errorCode)) { return; }
125     }
126 }
127 
128 void
parseRuleChain(UErrorCode & errorCode)129 CollationRuleParser::parseRuleChain(UErrorCode &errorCode) {
130     int32_t resetStrength = parseResetAndPosition(errorCode);
131     UBool isFirstRelation = TRUE;
132     for(;;) {
133         int32_t result = parseRelationOperator(errorCode);
134         if(U_FAILURE(errorCode)) { return; }
135         if(result < 0) {
136             if(ruleIndex < rules->length() && rules->charAt(ruleIndex) == 0x23) {
137                 // '#' starts a comment, until the end of the line
138                 ruleIndex = skipComment(ruleIndex + 1);
139                 continue;
140             }
141             if(isFirstRelation) {
142                 setParseError("reset not followed by a relation", errorCode);
143             }
144             return;
145         }
146         int32_t strength = result & STRENGTH_MASK;
147         if(resetStrength < UCOL_IDENTICAL) {
148             // reset-before rule chain
149             if(isFirstRelation) {
150                 if(strength != resetStrength) {
151                     setParseError("reset-before strength differs from its first relation", errorCode);
152                     return;
153                 }
154             } else {
155                 if(strength < resetStrength) {
156                     setParseError("reset-before strength followed by a stronger relation", errorCode);
157                     return;
158                 }
159             }
160         }
161         int32_t i = ruleIndex + (result >> OFFSET_SHIFT);  // skip over the relation operator
162         if((result & STARRED_FLAG) == 0) {
163             parseRelationStrings(strength, i, errorCode);
164         } else {
165             parseStarredCharacters(strength, i, errorCode);
166         }
167         if(U_FAILURE(errorCode)) { return; }
168         isFirstRelation = FALSE;
169     }
170 }
171 
172 int32_t
parseResetAndPosition(UErrorCode & errorCode)173 CollationRuleParser::parseResetAndPosition(UErrorCode &errorCode) {
174     if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; }
175     int32_t i = skipWhiteSpace(ruleIndex + 1);
176     int32_t j;
177     UChar c;
178     int32_t resetStrength;
179     if(rules->compare(i, BEFORE_LENGTH, BEFORE, 0, BEFORE_LENGTH) == 0 &&
180             (j = i + BEFORE_LENGTH) < rules->length() &&
181             PatternProps::isWhiteSpace(rules->charAt(j)) &&
182             ((j = skipWhiteSpace(j + 1)) + 1) < rules->length() &&
183             0x31 <= (c = rules->charAt(j)) && c <= 0x33 &&
184             rules->charAt(j + 1) == 0x5d) {
185         // &[before n] with n=1 or 2 or 3
186         resetStrength = UCOL_PRIMARY + (c - 0x31);
187         i = skipWhiteSpace(j + 2);
188     } else {
189         resetStrength = UCOL_IDENTICAL;
190     }
191     if(i >= rules->length()) {
192         setParseError("reset without position", errorCode);
193         return UCOL_DEFAULT;
194     }
195     UnicodeString str;
196     if(rules->charAt(i) == 0x5b) {  // '['
197         i = parseSpecialPosition(i, str, errorCode);
198     } else {
199         i = parseTailoringString(i, str, errorCode);
200     }
201     sink->addReset(resetStrength, str, errorReason, errorCode);
202     if(U_FAILURE(errorCode)) { setErrorContext(); }
203     ruleIndex = i;
204     return resetStrength;
205 }
206 
207 int32_t
parseRelationOperator(UErrorCode & errorCode)208 CollationRuleParser::parseRelationOperator(UErrorCode &errorCode) {
209     if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; }
210     ruleIndex = skipWhiteSpace(ruleIndex);
211     if(ruleIndex >= rules->length()) { return UCOL_DEFAULT; }
212     int32_t strength;
213     int32_t i = ruleIndex;
214     UChar c = rules->charAt(i++);
215     switch(c) {
216     case 0x3c:  // '<'
217         if(i < rules->length() && rules->charAt(i) == 0x3c) {  // <<
218             ++i;
219             if(i < rules->length() && rules->charAt(i) == 0x3c) {  // <<<
220                 ++i;
221                 if(i < rules->length() && rules->charAt(i) == 0x3c) {  // <<<<
222                     ++i;
223                     strength = UCOL_QUATERNARY;
224                 } else {
225                     strength = UCOL_TERTIARY;
226                 }
227             } else {
228                 strength = UCOL_SECONDARY;
229             }
230         } else {
231             strength = UCOL_PRIMARY;
232         }
233         if(i < rules->length() && rules->charAt(i) == 0x2a) {  // '*'
234             ++i;
235             strength |= STARRED_FLAG;
236         }
237         break;
238     case 0x3b:  // ';' same as <<
239         strength = UCOL_SECONDARY;
240         break;
241     case 0x2c:  // ',' same as <<<
242         strength = UCOL_TERTIARY;
243         break;
244     case 0x3d:  // '='
245         strength = UCOL_IDENTICAL;
246         if(i < rules->length() && rules->charAt(i) == 0x2a) {  // '*'
247             ++i;
248             strength |= STARRED_FLAG;
249         }
250         break;
251     default:
252         return UCOL_DEFAULT;
253     }
254     return ((i - ruleIndex) << OFFSET_SHIFT) | strength;
255 }
256 
257 void
parseRelationStrings(int32_t strength,int32_t i,UErrorCode & errorCode)258 CollationRuleParser::parseRelationStrings(int32_t strength, int32_t i, UErrorCode &errorCode) {
259     // Parse
260     //     prefix | str / extension
261     // where prefix and extension are optional.
262     UnicodeString prefix, str, extension;
263     i = parseTailoringString(i, str, errorCode);
264     if(U_FAILURE(errorCode)) { return; }
265     UChar next = (i < rules->length()) ? rules->charAt(i) : 0;
266     if(next == 0x7c) {  // '|' separates the context prefix from the string.
267         prefix = str;
268         i = parseTailoringString(i + 1, str, errorCode);
269         if(U_FAILURE(errorCode)) { return; }
270         next = (i < rules->length()) ? rules->charAt(i) : 0;
271     }
272     if(next == 0x2f) {  // '/' separates the string from the extension.
273         i = parseTailoringString(i + 1, extension, errorCode);
274     }
275     if(!prefix.isEmpty()) {
276         UChar32 prefix0 = prefix.char32At(0);
277         UChar32 c = str.char32At(0);
278         if(!nfc.hasBoundaryBefore(prefix0) || !nfc.hasBoundaryBefore(c)) {
279             setParseError("in 'prefix|str', prefix and str must each start with an NFC boundary",
280                           errorCode);
281             return;
282         }
283     }
284     sink->addRelation(strength, prefix, str, extension, errorReason, errorCode);
285     if(U_FAILURE(errorCode)) { setErrorContext(); }
286     ruleIndex = i;
287 }
288 
289 void
parseStarredCharacters(int32_t strength,int32_t i,UErrorCode & errorCode)290 CollationRuleParser::parseStarredCharacters(int32_t strength, int32_t i, UErrorCode &errorCode) {
291     UnicodeString empty, raw;
292     i = parseString(skipWhiteSpace(i), raw, errorCode);
293     if(U_FAILURE(errorCode)) { return; }
294     if(raw.isEmpty()) {
295         setParseError("missing starred-relation string", errorCode);
296         return;
297     }
298     UChar32 prev = -1;
299     int32_t j = 0;
300     for(;;) {
301         while(j < raw.length()) {
302             UChar32 c = raw.char32At(j);
303             if(!nfd.isInert(c)) {
304                 setParseError("starred-relation string is not all NFD-inert", errorCode);
305                 return;
306             }
307             sink->addRelation(strength, empty, UnicodeString(c), empty, errorReason, errorCode);
308             if(U_FAILURE(errorCode)) {
309                 setErrorContext();
310                 return;
311             }
312             j += U16_LENGTH(c);
313             prev = c;
314         }
315         if(i >= rules->length() || rules->charAt(i) != 0x2d) {  // '-'
316             break;
317         }
318         if(prev < 0) {
319             setParseError("range without start in starred-relation string", errorCode);
320             return;
321         }
322         i = parseString(i + 1, raw, errorCode);
323         if(U_FAILURE(errorCode)) { return; }
324         if(raw.isEmpty()) {
325             setParseError("range without end in starred-relation string", errorCode);
326             return;
327         }
328         UChar32 c = raw.char32At(0);
329         if(c < prev) {
330             setParseError("range start greater than end in starred-relation string", errorCode);
331             return;
332         }
333         // range prev-c
334         UnicodeString s;
335         while(++prev <= c) {
336             if(!nfd.isInert(prev)) {
337                 setParseError("starred-relation string range is not all NFD-inert", errorCode);
338                 return;
339             }
340             if(U_IS_SURROGATE(prev)) {
341                 setParseError("starred-relation string range contains a surrogate", errorCode);
342                 return;
343             }
344             if(0xfffd <= prev && prev <= 0xffff) {
345                 setParseError("starred-relation string range contains U+FFFD, U+FFFE or U+FFFF", errorCode);
346                 return;
347             }
348             s.setTo(prev);
349             sink->addRelation(strength, empty, s, empty, errorReason, errorCode);
350             if(U_FAILURE(errorCode)) {
351                 setErrorContext();
352                 return;
353             }
354         }
355         prev = -1;
356         j = U16_LENGTH(c);
357     }
358     ruleIndex = skipWhiteSpace(i);
359 }
360 
361 int32_t
parseTailoringString(int32_t i,UnicodeString & raw,UErrorCode & errorCode)362 CollationRuleParser::parseTailoringString(int32_t i, UnicodeString &raw, UErrorCode &errorCode) {
363     i = parseString(skipWhiteSpace(i), raw, errorCode);
364     if(U_SUCCESS(errorCode) && raw.isEmpty()) {
365         setParseError("missing relation string", errorCode);
366     }
367     return skipWhiteSpace(i);
368 }
369 
370 int32_t
parseString(int32_t i,UnicodeString & raw,UErrorCode & errorCode)371 CollationRuleParser::parseString(int32_t i, UnicodeString &raw, UErrorCode &errorCode) {
372     if(U_FAILURE(errorCode)) { return i; }
373     raw.remove();
374     while(i < rules->length()) {
375         UChar32 c = rules->charAt(i++);
376         if(isSyntaxChar(c)) {
377             if(c == 0x27) {  // apostrophe
378                 if(i < rules->length() && rules->charAt(i) == 0x27) {
379                     // Double apostrophe, encodes a single one.
380                     raw.append((UChar)0x27);
381                     ++i;
382                     continue;
383                 }
384                 // Quote literal text until the next single apostrophe.
385                 for(;;) {
386                     if(i == rules->length()) {
387                         setParseError("quoted literal text missing terminating apostrophe", errorCode);
388                         return i;
389                     }
390                     c = rules->charAt(i++);
391                     if(c == 0x27) {
392                         if(i < rules->length() && rules->charAt(i) == 0x27) {
393                             // Double apostrophe inside quoted literal text,
394                             // still encodes a single apostrophe.
395                             ++i;
396                         } else {
397                             break;
398                         }
399                     }
400                     raw.append((UChar)c);
401                 }
402             } else if(c == 0x5c) {  // backslash
403                 if(i == rules->length()) {
404                     setParseError("backslash escape at the end of the rule string", errorCode);
405                     return i;
406                 }
407                 c = rules->char32At(i);
408                 raw.append(c);
409                 i += U16_LENGTH(c);
410             } else {
411                 // Any other syntax character terminates a string.
412                 --i;
413                 break;
414             }
415         } else if(PatternProps::isWhiteSpace(c)) {
416             // Unquoted white space terminates a string.
417             --i;
418             break;
419         } else {
420             raw.append((UChar)c);
421         }
422     }
423     for(int32_t j = 0; j < raw.length();) {
424         UChar32 c = raw.char32At(j);
425         if(U_IS_SURROGATE(c)) {
426             setParseError("string contains an unpaired surrogate", errorCode);
427             return i;
428         }
429         if(0xfffd <= c && c <= 0xffff) {
430             setParseError("string contains U+FFFD, U+FFFE or U+FFFF", errorCode);
431             return i;
432         }
433         j += U16_LENGTH(c);
434     }
435     return i;
436 }
437 
namespace {

// Names of the special reset positions, as written between [brackets].
// The array index of a name is the offset that parseSpecialPosition()
// adds to POS_BASE, so the order of the entries must not change.
static const char *const positions[] = {
    "first tertiary ignorable",  "last tertiary ignorable",
    "first secondary ignorable", "last secondary ignorable",
    "first primary ignorable",   "last primary ignorable",
    "first variable",            "last variable",
    "first regular",             "last regular",
    "first implicit",            "last implicit",
    "first trailing",            "last trailing"
};

}  // namespace
458 
459 int32_t
parseSpecialPosition(int32_t i,UnicodeString & str,UErrorCode & errorCode)460 CollationRuleParser::parseSpecialPosition(int32_t i, UnicodeString &str, UErrorCode &errorCode) {
461     if(U_FAILURE(errorCode)) { return 0; }
462     UnicodeString raw;
463     int32_t j = readWords(i + 1, raw);
464     if(j > i && rules->charAt(j) == 0x5d && !raw.isEmpty()) {  // words end with ]
465         ++j;
466         for(int32_t pos = 0; pos < UPRV_LENGTHOF(positions); ++pos) {
467             if(raw == UnicodeString(positions[pos], -1, US_INV)) {
468                 str.setTo((UChar)POS_LEAD).append((UChar)(POS_BASE + pos));
469                 return j;
470             }
471         }
472         if(raw == UNICODE_STRING_SIMPLE("top")) {
473             str.setTo((UChar)POS_LEAD).append((UChar)(POS_BASE + LAST_REGULAR));
474             return j;
475         }
476         if(raw == UNICODE_STRING_SIMPLE("variable top")) {
477             str.setTo((UChar)POS_LEAD).append((UChar)(POS_BASE + LAST_VARIABLE));
478             return j;
479         }
480     }
481     setParseError("not a valid special reset position", errorCode);
482     return i;
483 }
484 
485 void
parseSetting(UErrorCode & errorCode)486 CollationRuleParser::parseSetting(UErrorCode &errorCode) {
487     if(U_FAILURE(errorCode)) { return; }
488     UnicodeString raw;
489     int32_t i = ruleIndex + 1;
490     int32_t j = readWords(i, raw);
491     if(j <= i || raw.isEmpty()) {
492         setParseError("expected a setting/option at '['", errorCode);
493     }
494     if(rules->charAt(j) == 0x5d) {  // words end with ]
495         ++j;
496         if(raw.startsWith(UNICODE_STRING_SIMPLE("reorder")) &&
497                 (raw.length() == 7 || raw.charAt(7) == 0x20)) {
498             parseReordering(raw, errorCode);
499             ruleIndex = j;
500             return;
501         }
502         if(raw == UNICODE_STRING_SIMPLE("backwards 2")) {
503             settings->setFlag(CollationSettings::BACKWARD_SECONDARY,
504                               UCOL_ON, 0, errorCode);
505             ruleIndex = j;
506             return;
507         }
508         UnicodeString v;
509         int32_t valueIndex = raw.lastIndexOf((UChar)0x20);
510         if(valueIndex >= 0) {
511             v.setTo(raw, valueIndex + 1);
512             raw.truncate(valueIndex);
513         }
514         if(raw == UNICODE_STRING_SIMPLE("strength") && v.length() == 1) {
515             int32_t value = UCOL_DEFAULT;
516             UChar c = v.charAt(0);
517             if(0x31 <= c && c <= 0x34) {  // 1..4
518                 value = UCOL_PRIMARY + (c - 0x31);
519             } else if(c == 0x49) {  // 'I'
520                 value = UCOL_IDENTICAL;
521             }
522             if(value != UCOL_DEFAULT) {
523                 settings->setStrength(value, 0, errorCode);
524                 ruleIndex = j;
525                 return;
526             }
527         } else if(raw == UNICODE_STRING_SIMPLE("alternate")) {
528             UColAttributeValue value = UCOL_DEFAULT;
529             if(v == UNICODE_STRING_SIMPLE("non-ignorable")) {
530                 value = UCOL_NON_IGNORABLE;
531             } else if(v == UNICODE_STRING_SIMPLE("shifted")) {
532                 value = UCOL_SHIFTED;
533             }
534             if(value != UCOL_DEFAULT) {
535                 settings->setAlternateHandling(value, 0, errorCode);
536                 ruleIndex = j;
537                 return;
538             }
539         } else if(raw == UNICODE_STRING_SIMPLE("maxVariable")) {
540             int32_t value = UCOL_DEFAULT;
541             if(v == UNICODE_STRING_SIMPLE("space")) {
542                 value = CollationSettings::MAX_VAR_SPACE;
543             } else if(v == UNICODE_STRING_SIMPLE("punct")) {
544                 value = CollationSettings::MAX_VAR_PUNCT;
545             } else if(v == UNICODE_STRING_SIMPLE("symbol")) {
546                 value = CollationSettings::MAX_VAR_SYMBOL;
547             } else if(v == UNICODE_STRING_SIMPLE("currency")) {
548                 value = CollationSettings::MAX_VAR_CURRENCY;
549             }
550             if(value != UCOL_DEFAULT) {
551                 settings->setMaxVariable(value, 0, errorCode);
552                 settings->variableTop = baseData->getLastPrimaryForGroup(
553                     UCOL_REORDER_CODE_FIRST + value);
554                 U_ASSERT(settings->variableTop != 0);
555                 ruleIndex = j;
556                 return;
557             }
558         } else if(raw == UNICODE_STRING_SIMPLE("caseFirst")) {
559             UColAttributeValue value = UCOL_DEFAULT;
560             if(v == UNICODE_STRING_SIMPLE("off")) {
561                 value = UCOL_OFF;
562             } else if(v == UNICODE_STRING_SIMPLE("lower")) {
563                 value = UCOL_LOWER_FIRST;
564             } else if(v == UNICODE_STRING_SIMPLE("upper")) {
565                 value = UCOL_UPPER_FIRST;
566             }
567             if(value != UCOL_DEFAULT) {
568                 settings->setCaseFirst(value, 0, errorCode);
569                 ruleIndex = j;
570                 return;
571             }
572         } else if(raw == UNICODE_STRING_SIMPLE("caseLevel")) {
573             UColAttributeValue value = getOnOffValue(v);
574             if(value != UCOL_DEFAULT) {
575                 settings->setFlag(CollationSettings::CASE_LEVEL, value, 0, errorCode);
576                 ruleIndex = j;
577                 return;
578             }
579         } else if(raw == UNICODE_STRING_SIMPLE("normalization")) {
580             UColAttributeValue value = getOnOffValue(v);
581             if(value != UCOL_DEFAULT) {
582                 settings->setFlag(CollationSettings::CHECK_FCD, value, 0, errorCode);
583                 ruleIndex = j;
584                 return;
585             }
586         } else if(raw == UNICODE_STRING_SIMPLE("numericOrdering")) {
587             UColAttributeValue value = getOnOffValue(v);
588             if(value != UCOL_DEFAULT) {
589                 settings->setFlag(CollationSettings::NUMERIC, value, 0, errorCode);
590                 ruleIndex = j;
591                 return;
592             }
593         } else if(raw == UNICODE_STRING_SIMPLE("hiraganaQ")) {
594             UColAttributeValue value = getOnOffValue(v);
595             if(value != UCOL_DEFAULT) {
596                 if(value == UCOL_ON) {
597                     setParseError("[hiraganaQ on] is not supported", errorCode);
598                 }
599                 ruleIndex = j;
600                 return;
601             }
602         } else if(raw == UNICODE_STRING_SIMPLE("import")) {
603             CharString lang;
604             lang.appendInvariantChars(v, errorCode);
605             if(errorCode == U_MEMORY_ALLOCATION_ERROR) { return; }
606             // BCP 47 language tag -> ICU locale ID
607             char localeID[ULOC_FULLNAME_CAPACITY];
608             int32_t parsedLength;
609             int32_t length = uloc_forLanguageTag(lang.data(), localeID, ULOC_FULLNAME_CAPACITY,
610                                                  &parsedLength, &errorCode);
611             if(U_FAILURE(errorCode) ||
612                     parsedLength != lang.length() || length >= ULOC_FULLNAME_CAPACITY) {
613                 errorCode = U_ZERO_ERROR;
614                 setParseError("expected language tag in [import langTag]", errorCode);
615                 return;
616             }
617             // localeID minus all keywords
618             char baseID[ULOC_FULLNAME_CAPACITY];
619             length = uloc_getBaseName(localeID, baseID, ULOC_FULLNAME_CAPACITY, &errorCode);
620             if(U_FAILURE(errorCode) || length >= ULOC_KEYWORDS_CAPACITY) {
621                 errorCode = U_ZERO_ERROR;
622                 setParseError("expected language tag in [import langTag]", errorCode);
623                 return;
624             }
625             if(length == 0) {
626                 uprv_strcpy(baseID, "root");
627             } else if(*baseID == '_') {
628                 uprv_memmove(baseID + 3, baseID, length + 1);
629                 uprv_memcpy(baseID, "und", 3);
630             }
631             // @collation=type, or length=0 if not specified
632             char collationType[ULOC_KEYWORDS_CAPACITY];
633             length = uloc_getKeywordValue(localeID, "collation",
634                                           collationType, ULOC_KEYWORDS_CAPACITY,
635                                           &errorCode);
636             if(U_FAILURE(errorCode) || length >= ULOC_KEYWORDS_CAPACITY) {
637                 errorCode = U_ZERO_ERROR;
638                 setParseError("expected language tag in [import langTag]", errorCode);
639                 return;
640             }
641             if(importer == NULL) {
642                 setParseError("[import langTag] is not supported", errorCode);
643             } else {
644                 UnicodeString importedRules;
645                 importer->getRules(baseID, length > 0 ? collationType : "standard",
646                                    importedRules, errorReason, errorCode);
647                 if(U_FAILURE(errorCode)) {
648                     if(errorReason == NULL) {
649                         errorReason = "[import langTag] failed";
650                     }
651                     setErrorContext();
652                     return;
653                 }
654                 const UnicodeString *outerRules = rules;
655                 int32_t outerRuleIndex = ruleIndex;
656                 parse(importedRules, errorCode);
657                 if(U_FAILURE(errorCode)) {
658                     if(parseError != NULL) {
659                         parseError->offset = outerRuleIndex;
660                     }
661                 }
662                 rules = outerRules;
663                 ruleIndex = j;
664             }
665             return;
666         }
667     } else if(rules->charAt(j) == 0x5b) {  // words end with [
668         UnicodeSet set;
669         j = parseUnicodeSet(j, set, errorCode);
670         if(U_FAILURE(errorCode)) { return; }
671         if(raw == UNICODE_STRING_SIMPLE("optimize")) {
672             sink->optimize(set, errorReason, errorCode);
673             if(U_FAILURE(errorCode)) { setErrorContext(); }
674             ruleIndex = j;
675             return;
676         } else if(raw == UNICODE_STRING_SIMPLE("suppressContractions")) {
677             sink->suppressContractions(set, errorReason, errorCode);
678             if(U_FAILURE(errorCode)) { setErrorContext(); }
679             ruleIndex = j;
680             return;
681         }
682     }
683     setParseError("not a valid setting/option", errorCode);
684 }
685 
686 void
parseReordering(const UnicodeString & raw,UErrorCode & errorCode)687 CollationRuleParser::parseReordering(const UnicodeString &raw, UErrorCode &errorCode) {
688     if(U_FAILURE(errorCode)) { return; }
689     int32_t i = 7;  // after "reorder"
690     if(i == raw.length()) {
691         // empty [reorder] with no codes
692         settings->resetReordering();
693         return;
694     }
695     // Parse the codes in [reorder aa bb cc].
696     UVector32 reorderCodes(errorCode);
697     if(U_FAILURE(errorCode)) { return; }
698     CharString word;
699     while(i < raw.length()) {
700         ++i;  // skip the word-separating space
701         int32_t limit = raw.indexOf((UChar)0x20, i);
702         if(limit < 0) { limit = raw.length(); }
703         word.clear().appendInvariantChars(raw.tempSubStringBetween(i, limit), errorCode);
704         if(U_FAILURE(errorCode)) { return; }
705         int32_t code = getReorderCode(word.data());
706         if(code < 0) {
707             setParseError("unknown script or reorder code", errorCode);
708             return;
709         }
710         reorderCodes.addElement(code, errorCode);
711         if(U_FAILURE(errorCode)) { return; }
712         i = limit;
713     }
714     settings->setReordering(*baseData, reorderCodes.getBuffer(), reorderCodes.size(), errorCode);
715 }
716 
// Names of the special reorder groups.
// getReorderCode() maps the name at index i to UCOL_REORDER_CODE_FIRST + i,
// so the order of the entries must not change.
static const char *const gSpecialReorderCodes[] = {
    "space",
    "punct",
    "symbol",
    "currency",
    "digit"
};
720 
721 int32_t
getReorderCode(const char * word)722 CollationRuleParser::getReorderCode(const char *word) {
723     for(int32_t i = 0; i < UPRV_LENGTHOF(gSpecialReorderCodes); ++i) {
724         if(uprv_stricmp(word, gSpecialReorderCodes[i]) == 0) {
725             return UCOL_REORDER_CODE_FIRST + i;
726         }
727     }
728     int32_t script = u_getPropertyValueEnum(UCHAR_SCRIPT, word);
729     if(script >= 0) {
730         return script;
731     }
732     if(uprv_stricmp(word, "others") == 0) {
733         return UCOL_REORDER_CODE_OTHERS;  // same as Zzzz = USCRIPT_UNKNOWN
734     }
735     return -1;
736 }
737 
738 UColAttributeValue
getOnOffValue(const UnicodeString & s)739 CollationRuleParser::getOnOffValue(const UnicodeString &s) {
740     if(s == UNICODE_STRING_SIMPLE("on")) {
741         return UCOL_ON;
742     } else if(s == UNICODE_STRING_SIMPLE("off")) {
743         return UCOL_OFF;
744     } else {
745         return UCOL_DEFAULT;
746     }
747 }
748 
749 int32_t
parseUnicodeSet(int32_t i,UnicodeSet & set,UErrorCode & errorCode)750 CollationRuleParser::parseUnicodeSet(int32_t i, UnicodeSet &set, UErrorCode &errorCode) {
751     // Collect a UnicodeSet pattern between a balanced pair of [brackets].
752     int32_t level = 0;
753     int32_t j = i;
754     for(;;) {
755         if(j == rules->length()) {
756             setParseError("unbalanced UnicodeSet pattern brackets", errorCode);
757             return j;
758         }
759         UChar c = rules->charAt(j++);
760         if(c == 0x5b) {  // '['
761             ++level;
762         } else if(c == 0x5d) {  // ']'
763             if(--level == 0) { break; }
764         }
765     }
766     set.applyPattern(rules->tempSubStringBetween(i, j), errorCode);
767     if(U_FAILURE(errorCode)) {
768         errorCode = U_ZERO_ERROR;
769         setParseError("not a valid UnicodeSet pattern", errorCode);
770         return j;
771     }
772     j = skipWhiteSpace(j);
773     if(j == rules->length() || rules->charAt(j) != 0x5d) {
774         setParseError("missing option-terminating ']' after UnicodeSet pattern", errorCode);
775         return j;
776     }
777     return ++j;
778 }
779 
780 int32_t
readWords(int32_t i,UnicodeString & raw) const781 CollationRuleParser::readWords(int32_t i, UnicodeString &raw) const {
782     static const UChar sp = 0x20;
783     raw.remove();
784     i = skipWhiteSpace(i);
785     for(;;) {
786         if(i >= rules->length()) { return 0; }
787         UChar c = rules->charAt(i);
788         if(isSyntaxChar(c) && c != 0x2d && c != 0x5f) {  // syntax except -_
789             if(raw.isEmpty()) { return i; }
790             if(raw.endsWith(&sp, 1)) {  // remove trailing space
791                 raw.truncate(raw.length() - 1);
792             }
793             return i;
794         }
795         if(PatternProps::isWhiteSpace(c)) {
796             raw.append(sp);
797             i = skipWhiteSpace(i + 1);
798         } else {
799             raw.append(c);
800             ++i;
801         }
802     }
803 }
804 
805 int32_t
skipComment(int32_t i) const806 CollationRuleParser::skipComment(int32_t i) const {
807     // skip to past the newline
808     while(i < rules->length()) {
809         UChar c = rules->charAt(i++);
810         // LF or FF or CR or NEL or LS or PS
811         if(c == 0xa || c == 0xc || c == 0xd || c == 0x85 || c == 0x2028 || c == 0x2029) {
812             // Unicode Newline Guidelines: "A readline function should stop at NLF, LS, FF, or PS."
813             // NLF (new line function) = CR or LF or CR+LF or NEL.
814             // No need to collect all of CR+LF because a following LF will be ignored anyway.
815             break;
816         }
817     }
818     return i;
819 }
820 
821 void
setParseError(const char * reason,UErrorCode & errorCode)822 CollationRuleParser::setParseError(const char *reason, UErrorCode &errorCode) {
823     if(U_FAILURE(errorCode)) { return; }
824     // Error code consistent with the old parser (from ca. 2001),
825     // rather than U_PARSE_ERROR;
826     errorCode = U_INVALID_FORMAT_ERROR;
827     errorReason = reason;
828     if(parseError != NULL) { setErrorContext(); }
829 }
830 
831 void
setErrorContext()832 CollationRuleParser::setErrorContext() {
833     if(parseError == NULL) { return; }
834 
835     // Note: This relies on the calling code maintaining the ruleIndex
836     // at a position that is useful for debugging.
837     // For example, at the beginning of a reset or relation etc.
838     parseError->offset = ruleIndex;
839     parseError->line = 0;  // We are not counting line numbers.
840 
841     // before ruleIndex
842     int32_t start = ruleIndex - (U_PARSE_CONTEXT_LEN - 1);
843     if(start < 0) {
844         start = 0;
845     } else if(start > 0 && U16_IS_TRAIL(rules->charAt(start))) {
846         ++start;
847     }
848     int32_t length = ruleIndex - start;
849     rules->extract(start, length, parseError->preContext);
850     parseError->preContext[length] = 0;
851 
852     // starting from ruleIndex
853     length = rules->length() - ruleIndex;
854     if(length >= U_PARSE_CONTEXT_LEN) {
855         length = U_PARSE_CONTEXT_LEN - 1;
856         if(U16_IS_LEAD(rules->charAt(ruleIndex + length - 1))) {
857             --length;
858         }
859     }
860     rules->extract(ruleIndex, length, parseError->postContext);
861     parseError->postContext[length] = 0;
862 }
863 
864 UBool
isSyntaxChar(UChar32 c)865 CollationRuleParser::isSyntaxChar(UChar32 c) {
866     return 0x21 <= c && c <= 0x7e &&
867             (c <= 0x2f || (0x3a <= c && c <= 0x40) ||
868             (0x5b <= c && c <= 0x60) || (0x7b <= c));
869 }
870 
871 int32_t
skipWhiteSpace(int32_t i) const872 CollationRuleParser::skipWhiteSpace(int32_t i) const {
873     while(i < rules->length() && PatternProps::isWhiteSpace(rules->charAt(i))) {
874         ++i;
875     }
876     return i;
877 }
878 
879 U_NAMESPACE_END
880 
881 #endif  // !UCONFIG_NO_COLLATION
882