1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
4  * COPYRIGHT:
5  * Copyright (c) 1999-2016, International Business Machines Corporation and
6  * others. All Rights Reserved.
7  ********************************************************************/
8 /************************************************************************
9 *   Date        Name        Description
10 *   12/15/99    Madhu        Creation.
11 *   01/12/2000  Madhu        Updated for changed API and added new tests
12 ************************************************************************/
13 
14 #include "unicode/utypes.h"
15 #if !UCONFIG_NO_BREAK_ITERATION
16 
17 #include <algorithm>
18 #include <sstream>
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include <string.h>
22 #include <utility>
23 #include <vector>
24 
25 #include "unicode/brkiter.h"
26 #include "unicode/localpointer.h"
27 #include "unicode/numfmt.h"
28 #include "unicode/rbbi.h"
29 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
30 #include "unicode/regex.h"
31 #endif
32 #include "unicode/schriter.h"
33 #include "unicode/uchar.h"
34 #include "unicode/utf16.h"
35 #include "unicode/ucnv.h"
36 #include "unicode/uniset.h"
37 #include "unicode/uscript.h"
38 #include "unicode/ustring.h"
39 #include "unicode/utext.h"
40 #include "unicode/utrace.h"
41 
42 #include "charstr.h"
43 #include "cmemory.h"
44 #include "cstr.h"
45 #include "intltest.h"
46 #include "lstmbe.h"
47 #include "rbbitst.h"
48 #include "rbbidata.h"
49 #include "utypeinfo.h"  // for 'typeid' to work
50 #include "uvector.h"
51 #include "uvectr32.h"
52 
53 
54 #if !UCONFIG_NO_FILTERED_BREAK_ITERATION
55 #include "unicode/filteredbrk.h"
56 #endif // !UCONFIG_NO_FILTERED_BREAK_ITERATION
57 
// Report a test failure, tagged with the current source file and line, when
// the condition x is false.  Execution continues after the report.
// Expands to a call to errln(), so it must be used where errln() is in scope.
#define TEST_ASSERT(x) UPRV_BLOCK_MACRO_BEGIN { \
    if (!(x)) { \
        errln("Failure in file %s, line %d", __FILE__, __LINE__); \
    } \
} UPRV_BLOCK_MACRO_END

// Report a test failure, tagged with the current source file, line and the
// name of the error code, when errcode is an ICU failure code.
// Expands to a call to errcheckln(), so it must be used where that is in scope.
#define TEST_ASSERT_SUCCESS(errcode) UPRV_BLOCK_MACRO_BEGIN { \
    if (U_FAILURE(errcode)) { \
        errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode)); \
    } \
} UPRV_BLOCK_MACRO_END

// Report a monkey-test failure through the global test object, including the
// parameters needed to reproduce it (rule file name as "type", and the seed).
// Wrapped in UPRV_BLOCK_MACRO_BEGIN/END like the macros above so that it
// behaves as a single statement (safe inside an unbraced if/else); the
// invocation must therefore be followed by a semicolon.
#define MONKEY_ERROR(msg, fRuleFileName, index, seed) UPRV_BLOCK_MACRO_BEGIN { \
    IntlTest::gTest->errln("\n%s:%d %s at index %d. Parameters to reproduce: @\"type=%s seed=%u loop=1\"", \
                    __FILE__, __LINE__, (msg), (index), (fRuleFileName), (seed)); \
} UPRV_BLOCK_MACRO_END
74 
75 //---------------------------------------------
76 // runIndexedTest
77 //---------------------------------------------
78 
79 
//  Note:  Before adding new tests to this file, check whether the desired test data can
//         simply be added to the file testdata/rbbitst.txt.  In most cases it can;
//         that is much less work than writing a new test, diagnostic output in the event of
//         failures is good, and the test data file is shared with ICU4J, so eventually the test
//         will run there as well, without additional effort.
85 
runIndexedTest(int32_t index,UBool exec,const char * & name,char * params)86 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
87 {
88     if (exec) logln("TestSuite RuleBasedBreakIterator: ");
89     fTestParams = params;
90 
91     TESTCASE_AUTO_BEGIN;
92 #if !UCONFIG_NO_FILE_IO
93     TESTCASE_AUTO(TestBug4153072);
94 #endif
95 #if !UCONFIG_NO_FILE_IO
96     TESTCASE_AUTO(TestUnicodeFiles);
97 #endif
98     TESTCASE_AUTO(TestGetAvailableLocales);
99     TESTCASE_AUTO(TestGetDisplayName);
100 #if !UCONFIG_NO_FILE_IO
101     TESTCASE_AUTO(TestEndBehaviour);
102     TESTCASE_AUTO(TestWordBreaks);
103     TESTCASE_AUTO(TestWordBoundary);
104     TESTCASE_AUTO(TestLineBreaks);
105     TESTCASE_AUTO(TestSentBreaks);
106     TESTCASE_AUTO(TestExtended);
107 #endif
108 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
109     TESTCASE_AUTO(TestMonkey);
110 #endif
111 #if !UCONFIG_NO_FILE_IO
112     TESTCASE_AUTO(TestBug3818);
113 #endif
114     TESTCASE_AUTO(TestDebug);
115 #if !UCONFIG_NO_FILE_IO
116     TESTCASE_AUTO(TestBug5775);
117 #endif
118     TESTCASE_AUTO(TestBug9983);
119     TESTCASE_AUTO(TestDictRules);
120     TESTCASE_AUTO(TestBug5532);
121     TESTCASE_AUTO(TestBug7547);
122     TESTCASE_AUTO(TestBug12797);
123     TESTCASE_AUTO(TestBug12918);
124     TESTCASE_AUTO(TestBug12932);
125     TESTCASE_AUTO(TestEmoji);
126     TESTCASE_AUTO(TestBug12519);
127     TESTCASE_AUTO(TestBug12677);
128     TESTCASE_AUTO(TestTableRedundancies);
129     TESTCASE_AUTO(TestBug13447);
130     TESTCASE_AUTO(TestReverse);
131     TESTCASE_AUTO(TestBug13692);
132     TESTCASE_AUTO(TestDebugRules);
133     TESTCASE_AUTO(Test8BitsTrieWith8BitStateTable);
134     TESTCASE_AUTO(Test8BitsTrieWith16BitStateTable);
135     TESTCASE_AUTO(Test16BitsTrieWith8BitStateTable);
136     TESTCASE_AUTO(Test16BitsTrieWith16BitStateTable);
137     TESTCASE_AUTO(TestTable_8_16_Bits);
138     TESTCASE_AUTO(TestBug13590);
139     TESTCASE_AUTO(TestUnpairedSurrogate);
140     TESTCASE_AUTO(TestLSTMThai);
141     TESTCASE_AUTO(TestLSTMBurmese);
142 
143 #if U_ENABLE_TRACING
144     TESTCASE_AUTO(TestTraceCreateCharacter);
145     TESTCASE_AUTO(TestTraceCreateWord);
146     TESTCASE_AUTO(TestTraceCreateSentence);
147     TESTCASE_AUTO(TestTraceCreateTitle);
148     TESTCASE_AUTO(TestTraceCreateLine);
149     TESTCASE_AUTO(TestTraceCreateLineNormal);
150     TESTCASE_AUTO(TestTraceCreateLineLoose);
151     TESTCASE_AUTO(TestTraceCreateLineStrict);
152     TESTCASE_AUTO(TestTraceCreateBreakEngine);
153 #endif
154 
155     TESTCASE_AUTO_END;
156 }
157 
158 
159 //--------------------------------------------------------------------------------------
160 //
161 //    RBBITest    constructor and destructor
162 //
163 //--------------------------------------------------------------------------------------
164 
RBBITest()165 RBBITest::RBBITest() {
166     fTestParams = NULL;
167 }
168 
169 
~RBBITest()170 RBBITest::~RBBITest() {
171 }
172 
173 
printStringBreaks(UText * tstr,int expected[],int expectedCount)174 static void printStringBreaks(UText *tstr, int expected[], int expectedCount) {
175     UErrorCode status = U_ZERO_ERROR;
176     char name[100];
177     printf("code    alpha extend alphanum type word sent line name\n");
178     int nextExpectedIndex = 0;
179     utext_setNativeIndex(tstr, 0);
180     for (int j = 0; j < static_cast<int>(utext_nativeLength(tstr)); j=static_cast<int>(utext_getNativeIndex(tstr))) {
181         if (nextExpectedIndex < expectedCount && j >= expected[nextExpectedIndex] ) {
182             printf("------------------------------------------------ %d\n", j);
183             ++nextExpectedIndex;
184         }
185 
186         UChar32 c = utext_next32(tstr);
187         u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
188         printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
189                            u_isUAlphabetic(c),
190                            u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
191                            u_isalnum(c),
192                            u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
193                                                   u_charType(c),
194                                                   U_SHORT_PROPERTY_NAME),
195                            u_getPropertyValueName(UCHAR_WORD_BREAK,
196                                                   u_getIntPropertyValue(c,
197                                                           UCHAR_WORD_BREAK),
198                                                   U_SHORT_PROPERTY_NAME),
199                            u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
200                                    u_getIntPropertyValue(c,
201                                            UCHAR_SENTENCE_BREAK),
202                                    U_SHORT_PROPERTY_NAME),
203                            u_getPropertyValueName(UCHAR_LINE_BREAK,
204                                    u_getIntPropertyValue(c,
205                                            UCHAR_LINE_BREAK),
206                                    U_SHORT_PROPERTY_NAME),
207                            name);
208     }
209 }
210 
211 
printStringBreaks(const UnicodeString & ustr,int expected[],int expectedCount)212 static void printStringBreaks(const UnicodeString &ustr, int expected[], int expectedCount) {
213    UErrorCode status = U_ZERO_ERROR;
214    UText *tstr = NULL;
215    tstr = utext_openConstUnicodeString(NULL, &ustr, &status);
216    if (U_FAILURE(status)) {
217        printf("printStringBreaks, utext_openConstUnicodeString() returns %s\n", u_errorName(status));
218        return;
219     }
220    printStringBreaks(tstr, expected, expectedCount);
221    utext_close(tstr);
222 }
223 
224 
TestBug3818()225 void RBBITest::TestBug3818() {
226     UErrorCode  status = U_ZERO_ERROR;
227 
228     // Four Thai words...
229     static const UChar thaiWordData[] = {  0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
230                                            0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
231     UnicodeString  thaiStr(thaiWordData);
232 
233     BreakIterator* bi = BreakIterator::createWordInstance(Locale("th"), status);
234     if (U_FAILURE(status) || bi == NULL) {
235         errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
236         return;
237     }
238     bi->setText(thaiStr);
239 
240     int32_t  startOfSecondWord = bi->following(1);
241     if (startOfSecondWord != 4) {
242         errln("Fail at file %s, line %d expected start of word at 4, got %d",
243             __FILE__, __LINE__, startOfSecondWord);
244     }
245     startOfSecondWord = bi->following(0);
246     if (startOfSecondWord != 4) {
247         errln("Fail at file %s, line %d expected start of word at 4, got %d",
248             __FILE__, __LINE__, startOfSecondWord);
249     }
250     delete bi;
251 }
252 
253 
254 //---------------------------------------------
255 //
256 //     other tests
257 //
258 //---------------------------------------------
259 
TestGetAvailableLocales()260 void RBBITest::TestGetAvailableLocales()
261 {
262     int32_t locCount = 0;
263     const Locale* locList = BreakIterator::getAvailableLocales(locCount);
264 
265     if (locCount == 0)
266         dataerrln("getAvailableLocales() returned an empty list!");
267     // Just make sure that it's returning good memory.
268     int32_t i;
269     for (i = 0; i < locCount; ++i) {
270         logln(locList[i].getName());
271     }
272 }
273 
274 //Testing the BreakIterator::getDisplayName() function
TestGetDisplayName()275 void RBBITest::TestGetDisplayName()
276 {
277     UnicodeString   result;
278 
279     BreakIterator::getDisplayName(Locale::getUS(), result);
280     if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
281         dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
282                 + result);
283 
284     BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
285     if (result != "French (France)")
286         dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
287                 + result);
288 }
289 /**
290  * Test End Behaviour
291  * @bug 4068137
292  */
TestEndBehaviour()293 void RBBITest::TestEndBehaviour()
294 {
295     UErrorCode status = U_ZERO_ERROR;
296     UnicodeString testString("boo.");
297     BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
298     if (U_FAILURE(status))
299     {
300         errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
301         return;
302     }
303     wb->setText(testString);
304 
305     if (wb->first() != 0)
306         errln("Didn't get break at beginning of string.");
307     if (wb->next() != 3)
308         errln("Didn't get break before period in \"boo.\"");
309     if (wb->current() != 4 && wb->next() != 4)
310         errln("Didn't get break at end of string.");
311     delete wb;
312 }
313 /*
314  * @bug 4153072
315  */
TestBug4153072()316 void RBBITest::TestBug4153072() {
317     UErrorCode status = U_ZERO_ERROR;
318     BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
319     if (U_FAILURE(status))
320     {
321         errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
322         return;
323     }
324     UnicodeString str("...Hello, World!...");
325     int32_t begin = 3;
326     int32_t end = str.length() - 3;
327     UBool onBoundary;
328 
329     StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
330     iter->adoptText(textIterator);
331     int index;
332     // Note: with the switch to UText, there is no way to restrict the
333     //       iteration range to begin at an index other than zero.
334     //       String character iterators created with a non-zero bound are
335     //         treated by RBBI as being empty.
336     for (index = -1; index < begin + 1; ++index) {
337         onBoundary = iter->isBoundary(index);
338         if (index == 0?  !onBoundary : onBoundary) {
339             errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
340                             " and begin index = " + begin);
341         }
342     }
343     delete iter;
344 }
345 
346 
347 //
348 // Test for problem reported by Ashok Matoria on 9 July 2007
349 //    One.<kSoftHyphen><kSpace>Two.
350 //
351 //    Sentence break at start (0) and then on calling next() it breaks at
352 //   'T' of "Two". Now, at this point if I do next() and
353 //    then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two".
354 //
TestBug5775()355 void RBBITest::TestBug5775() {
356     UErrorCode status = U_ZERO_ERROR;
357     BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
358     TEST_ASSERT_SUCCESS(status);
359     if (U_FAILURE(status)) {
360         return;
361     }
362 // Check for status first for better handling of no data errors.
363     TEST_ASSERT(bi != NULL);
364     if (bi == NULL) {
365         return;
366     }
367 
368     UnicodeString s("One.\\u00ad Two.", -1, US_INV);
369     //               01234      56789
370     s = s.unescape();
371     bi->setText(s);
372     int pos = bi->next();
373     TEST_ASSERT(pos == 6);
374     pos = bi->next();
375     TEST_ASSERT(pos == 10);
376     pos = bi->previous();
377     TEST_ASSERT(pos == 6);
378     delete bi;
379 }
380 
381 
382 
383 //------------------------------------------------------------------------------
384 //
385 //   RBBITest::Extended    Run  RBBI Tests from an external test data file
386 //
387 //------------------------------------------------------------------------------
388 
389 struct TestParams {
390     BreakIterator   *bi;                   // Break iterator is set while parsing test source.
391                                            //   Changed out whenever test data changes break type.
392 
393     UnicodeString    dataToBreak;          // Data that is built up while parsing the test.
394     UVector32       *expectedBreaks;       // Expected break positions, matches dataToBreak UnicodeString.
395     UVector32       *srcLine;              // Positions in source file, indexed same as dataToBreak.
396     UVector32       *srcCol;
397 
398     UText           *textToBreak;          // UText, could be UTF8 or UTF16.
399     UVector32       *textMap;              // Map from UTF-16 dataToBreak offsets to UText offsets.
400     CharString       utf8String;           // UTF-8 form of text to break.
401 
TestParamsTestParams402     TestParams(UErrorCode &status) : dataToBreak() {
403         bi               = NULL;
404         expectedBreaks   = new UVector32(status);
405         srcLine          = new UVector32(status);
406         srcCol           = new UVector32(status);
407         textToBreak      = NULL;
408         textMap          = new UVector32(status);
409     }
410 
~TestParamsTestParams411     ~TestParams() {
412         delete bi;
413         delete expectedBreaks;
414         delete srcLine;
415         delete srcCol;
416         utext_close(textToBreak);
417         delete textMap;
418     }
419 
420     int32_t getSrcLine(int32_t bp);
421     int32_t getExpectedBreak(int32_t bp);
422     int32_t getSrcCol(int32_t bp);
423 
424     void setUTF16(UErrorCode &status);
425     void setUTF8(UErrorCode &status);
426 };
427 
428 // Append a UnicodeString to a CharString with UTF-8 encoding.
429 // Substitute any invalid chars.
430 //   Note: this is used with test data that includes a few unpaired surrogates in the UTF-16 that will be substituted.
CharStringAppend(CharString & dest,const UnicodeString & src,UErrorCode & status)431 static void CharStringAppend(CharString &dest, const UnicodeString &src, UErrorCode &status) {
432     if (U_FAILURE(status)) {
433         return;
434     }
435     int32_t utf8Length;
436     u_strToUTF8WithSub(NULL, 0, &utf8Length,            // Output Buffer, NULL for preflight.
437                        src.getBuffer(), src.length(),   // UTF-16 data
438                        0xfffd, NULL,                    // Substitution char, number of subs.
439                        &status);
440     if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
441         return;
442     }
443     status = U_ZERO_ERROR;
444     int32_t capacity;
445     char *buffer = dest.getAppendBuffer(utf8Length, utf8Length, capacity, status);
446     u_strToUTF8WithSub(buffer, utf8Length, NULL,
447                        src.getBuffer(), src.length(),
448                        0xfffd, NULL, &status);
449     dest.append(buffer, utf8Length, status);
450 }
451 
452 
setUTF16(UErrorCode & status)453 void TestParams::setUTF16(UErrorCode &status) {
454     textToBreak = utext_openUnicodeString(textToBreak, &dataToBreak, &status);
455     textMap->removeAllElements();
456     for (int32_t i=0; i<dataToBreak.length(); i++) {
457         if (i == dataToBreak.getChar32Start(i)) {
458             textMap->addElement(i, status);
459         } else {
460             textMap->addElement(-1, status);
461         }
462     }
463     textMap->addElement(dataToBreak.length(), status);
464     U_ASSERT(dataToBreak.length() + 1 == textMap->size());
465 }
466 
467 
setUTF8(UErrorCode & status)468 void TestParams::setUTF8(UErrorCode &status) {
469     if (U_FAILURE(status)) {
470         return;
471     }
472     utf8String.clear();
473     CharStringAppend(utf8String, dataToBreak, status);
474     textToBreak = utext_openUTF8(textToBreak, utf8String.data(), utf8String.length(), &status);
475     if (U_FAILURE(status)) {
476         return;
477     }
478 
479     textMap->removeAllElements();
480     int32_t utf16Index = 0;
481     for (;;) {
482         textMap->addElement(utf16Index, status);
483         UChar32 c32 = utext_current32(textToBreak);
484         if (c32 < 0) {
485             break;
486         }
487         utf16Index += U16_LENGTH(c32);
488         utext_next32(textToBreak);
489         while (textMap->size() < utext_getNativeIndex(textToBreak)) {
490             textMap->addElement(-1, status);
491         }
492     }
493     U_ASSERT(utext_nativeLength(textToBreak) + 1 == textMap->size());
494 }
495 
496 
getSrcLine(int32_t bp)497 int32_t TestParams::getSrcLine(int32_t bp) {
498     if (bp >= textMap->size()) {
499         bp = textMap->size() - 1;
500     }
501     int32_t i = 0;
502     for(; bp >= 0 ; --bp) {
503         // Move to a character boundary if we are not on one already.
504         i = textMap->elementAti(bp);
505         if (i >= 0) {
506             break;
507         }
508     }
509     return srcLine->elementAti(i);
510 }
511 
512 
getExpectedBreak(int32_t bp)513 int32_t TestParams::getExpectedBreak(int32_t bp) {
514     if (bp >= textMap->size()) {
515         return 0;
516     }
517     int32_t i = textMap->elementAti(bp);
518     int32_t retVal = 0;
519     if (i >= 0) {
520         retVal = expectedBreaks->elementAti(i);
521     }
522     return retVal;
523 }
524 
525 
getSrcCol(int32_t bp)526 int32_t TestParams::getSrcCol(int32_t bp) {
527     if (bp >= textMap->size()) {
528         bp = textMap->size() - 1;
529     }
530     int32_t i = 0;
531     for(; bp >= 0; --bp) {
532         // Move bp to a character boundary if we are not on one already.
533         i = textMap->elementAti(bp);
534         if (i >= 0) {
535             break;
536         }
537     }
538     return srcCol->elementAti(i);
539 }
540 
541 
executeTest(TestParams * t,UErrorCode & status)542 void RBBITest::executeTest(TestParams *t, UErrorCode &status) {
543     int32_t    bp;
544     int32_t    prevBP;
545     int32_t    i;
546 
547     TEST_ASSERT_SUCCESS(status);
548     if (U_FAILURE(status)) {
549         return;
550     }
551 
552     if (t->bi == NULL) {
553         return;
554     }
555 
556     t->bi->setText(t->textToBreak, status);
557     //
558     //  Run the iterator forward
559     //
560     prevBP = -1;
561     for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
562         if (prevBP ==  bp) {
563             // Fail for lack of forward progress.
564             errln("Forward Iteration, no forward progress.  Break Pos=%4d  File line,col=%4d,%4d",
565                 bp, t->getSrcLine(bp), t->getSrcCol(bp));
566             break;
567         }
568 
569         // Check that there we didn't miss an expected break between the last one
570         //  and this one.
571         for (i=prevBP+1; i<bp; i++) {
572             if (t->getExpectedBreak(i) != 0) {
573                 int expected[] = {0, i};
574                 printStringBreaks(t->dataToBreak, expected, 2);
575                 errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
576                       i, t->getSrcLine(i), t->getSrcCol(i));
577             }
578         }
579 
580         // Check that the break we did find was expected
581         if (t->getExpectedBreak(bp) == 0) {
582             int expected[] = {0, bp};
583             printStringBreaks(t->textToBreak, expected, 2);
584             errln("Forward Iteration, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
585                 bp, t->getSrcLine(bp), t->getSrcCol(bp));
586         } else {
587             // The break was expected.
588             //   Check that the {nnn} tag value is correct.
589             int32_t expectedTagVal = t->getExpectedBreak(bp);
590             if (expectedTagVal == -1) {
591                 expectedTagVal = 0;
592             }
593             int32_t line = t->getSrcLine(bp);
594             int32_t rs = t->bi->getRuleStatus();
595             if (rs != expectedTagVal) {
596                 errln("Incorrect status for forward break.  Pos=%4d  File line,col= %4d,%4d.\n"
597                       "          Actual, Expected status = %4d, %4d",
598                     bp, line, t->getSrcCol(bp), rs, expectedTagVal);
599             }
600         }
601 
602         prevBP = bp;
603     }
604 
605     // Verify that there were no missed expected breaks after the last one found
606     for (i=prevBP+1; i<utext_nativeLength(t->textToBreak); i++) {
607         if (t->getExpectedBreak(i) != 0) {
608             errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
609                       i, t->getSrcLine(i), t->getSrcCol(i));
610         }
611     }
612 
613     //
614     //  Run the iterator backwards, verify that the same breaks are found.
615     //
616     prevBP = static_cast<int32_t>(utext_nativeLength(t->textToBreak) + 2); // start with a phony value for the last break pos seen.
617     bp = t->bi->last();
618     while (bp != BreakIterator::DONE) {
619         if (prevBP ==  bp) {
620             // Fail for lack of progress.
621             errln("Reverse Iteration, no progress.  Break Pos=%4d  File line,col=%4d,%4d",
622                 bp, t->getSrcLine(bp), t->getSrcCol(bp));
623             break;
624         }
625 
626         // Check that we didn't miss an expected break between the last one
627         //  and this one.  (UVector returns zeros for index out of bounds.)
628         for (i=prevBP-1; i>bp; i--) {
629             if (t->getExpectedBreak(i) != 0) {
630                 errln("Reverse Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
631                       i, t->getSrcLine(i), t->getSrcCol(i));
632             }
633         }
634 
635         // Check that the break we did find was expected
636         if (t->getExpectedBreak(bp) == 0) {
637             errln("Reverse Itertion, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
638                    bp, t->getSrcLine(bp), t->getSrcCol(bp));
639         } else {
640             // The break was expected.
641             //   Check that the {nnn} tag value is correct.
642             int32_t expectedTagVal = t->getExpectedBreak(bp);
643             if (expectedTagVal == -1) {
644                 expectedTagVal = 0;
645             }
646             int line = t->getSrcLine(bp);
647             int32_t rs = t->bi->getRuleStatus();
648             if (rs != expectedTagVal) {
649                 errln("Incorrect status for reverse break.  Pos=%4d  File line,col= %4d,%4d.\n"
650                       "          Actual, Expected status = %4d, %4d",
651                     bp, line, t->getSrcCol(bp), rs, expectedTagVal);
652             }
653         }
654 
655         prevBP = bp;
656         bp = t->bi->previous();
657     }
658 
659     // Verify that there were no missed breaks prior to the last one found
660     for (i=prevBP-1; i>=0; i--) {
661         if (t->getExpectedBreak(i) != 0) {
662             errln("Forward Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
663                       i, t->getSrcLine(i), t->getSrcCol(i));
664         }
665     }
666 
667     // Check isBoundary()
668     for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
669         UBool boundaryExpected = (t->getExpectedBreak(i) != 0);
670         UBool boundaryFound    = t->bi->isBoundary(i);
671         if (boundaryExpected != boundaryFound) {
672             errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
673                   "        Expected, Actual= %s, %s",
674                   i, t->getSrcLine(i), t->getSrcCol(i),
675                   boundaryExpected ? "true":"false", boundaryFound? "true" : "false");
676         }
677     }
678 
679     // Check following()
680     for (i=0; i < static_cast<int32_t>(utext_nativeLength(t->textToBreak)); i++) {
681         int32_t actualBreak = t->bi->following(i);
682         int32_t expectedBreak = BreakIterator::DONE;
683         for (int32_t j=i+1; j <= static_cast<int32_t>(utext_nativeLength(t->textToBreak)); j++) {
684             if (t->getExpectedBreak(j) != 0) {
685                 expectedBreak = j;
686                 break;
687             }
688         }
689         if (expectedBreak != actualBreak) {
690             errln("following(%d) incorrect. File line,col= %4d,%4d\n"
691                   "        Expected, Actual= %d, %d",
692                   i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
693         }
694     }
695 
696     // Check preceding()
697     for (i=static_cast<int32_t>(utext_nativeLength(t->textToBreak)); i>=0; i--) {
698         int32_t actualBreak = t->bi->preceding(i);
699         int32_t expectedBreak = BreakIterator::DONE;
700 
701         // For UTF-8 & UTF-16 supplementals, all code units of a character are equivalent.
702         // preceding(trailing byte) will return the index of some preceding code point,
703         // not the lead byte of the current code point, even though that has a smaller index.
704         // Therefore, start looking at the expected break data not at i-1, but at
705         // the start of code point index - 1.
706         utext_setNativeIndex(t->textToBreak, i);
707         int32_t j = static_cast<int32_t>(utext_getNativeIndex(t->textToBreak) - 1);
708         for (; j >= 0; j--) {
709             if (t->getExpectedBreak(j) != 0) {
710                 expectedBreak = j;
711                 break;
712             }
713         }
714         if (expectedBreak != actualBreak) {
715             errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
716                   "        Expected, Actual= %d, %d",
717                   i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
718         }
719     }
720 }
721 
TestExtended()722 void RBBITest::TestExtended() {
723      // The expectations in this test heavily depends on the Thai dictionary.
724      // Therefore, we skip this test under the LSTM configuration.
725      if (skipDictionaryTest()) {
726          return;
727      }
728   // Skip test for now when UCONFIG_NO_FILTERED_BREAK_ITERATION is set. This
729   // data driven test closely entangles filtered and regular data.
730 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILTERED_BREAK_ITERATION
731     UErrorCode      status  = U_ZERO_ERROR;
732     Locale          locale("");
733 
734     TestParams          tp(status);
735 
736     RegexMatcher      localeMatcher(UnicodeString(u"<locale *([\\p{L}\\p{Nd}_@&=-]*) *>"), 0, status);
737     if (U_FAILURE(status)) {
738         dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
739     }
740 
741     //
742     //  Open and read the test data file.
743     //
744     const char *testDataDirectory = IntlTest::getSourceTestData(status);
745     CharString testFileName(testDataDirectory, -1, status);
746     testFileName.append("rbbitst.txt", -1, status);
747 
748     int    len;
749     UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
750     if (U_FAILURE(status)) {
751         errln("%s:%d Error %s opening file rbbitst.txt", __FILE__, __LINE__, u_errorName(status));
752         return;
753     }
754 
755     bool skipTest = false; // Skip this test?
756 
757     //
758     //  Put the test data into a UnicodeString
759     //
760     UnicodeString testString(FALSE, testFile, len);
761 
762     enum EParseState{
763         PARSE_COMMENT,
764         PARSE_TAG,
765         PARSE_DATA,
766         PARSE_NUM,
767         PARSE_RULES
768     }
769     parseState = PARSE_TAG;
770 
771     EParseState savedState = PARSE_TAG;
772 
773     int32_t    lineNum  = 1;
774     int32_t    colStart = 0;
775     int32_t    column   = 0;
776     int32_t    charIdx  = 0;
777 
778     int32_t    tagValue = 0;             // The numeric value of a <nnn> tag.
779 
780     UnicodeString       rules;           // Holds rules from a <rules> ... </rules> block
781     int32_t             rulesFirstLine = 0;  // Line number of the start of current <rules> block
782 
783     for (charIdx = 0; charIdx < len; ) {
784         status = U_ZERO_ERROR;
785         UChar  c = testString.charAt(charIdx);
786         charIdx++;
787         if (c == u'\r' && charIdx<len && testString.charAt(charIdx) == u'\n') {
788             // treat CRLF as a unit
789             c = u'\n';
790             charIdx++;
791         }
792         if (c == u'\n' || c == u'\r') {
793             lineNum++;
794             colStart = charIdx;
795         }
796         column = charIdx - colStart + 1;
797 
798         switch (parseState) {
799         case PARSE_COMMENT:
800             if (c == u'\n' || c == u'\r') {
801                 parseState = savedState;
802             }
803             break;
804 
805         case PARSE_TAG:
806             {
807             if (c == u'#') {
808                 parseState = PARSE_COMMENT;
809                 savedState = PARSE_TAG;
810                 break;
811             }
812             if (u_isUWhiteSpace(c)) {
813                 break;
814             }
815             if (testString.compare(charIdx-1, 6, u"<word>") == 0) {
816                 delete tp.bi;
817                 tp.bi = BreakIterator::createWordInstance(locale,  status);
818                 skipTest = false;
819                 charIdx += 5;
820                 break;
821             }
822             if (testString.compare(charIdx-1, 6, u"<char>") == 0) {
823                 delete tp.bi;
824                 tp.bi = BreakIterator::createCharacterInstance(locale,  status);
825                 skipTest = false;
826                 charIdx += 5;
827                 break;
828             }
829             if (testString.compare(charIdx-1, 6, u"<line>") == 0) {
830                 delete tp.bi;
831                 tp.bi = BreakIterator::createLineInstance(locale,  status);
832                 skipTest = false;
833                 charIdx += 5;
834                 break;
835             }
836             if (testString.compare(charIdx-1, 6, u"<sent>") == 0) {
837                 delete tp.bi;
838                 tp.bi = BreakIterator::createSentenceInstance(locale,  status);
839                 skipTest = false;
840                 charIdx += 5;
841                 break;
842             }
843             if (testString.compare(charIdx-1, 7, u"<title>") == 0) {
844                 delete tp.bi;
845                 tp.bi = BreakIterator::createTitleInstance(locale,  status);
846                 charIdx += 6;
847                 break;
848             }
849 
850             if (testString.compare(charIdx-1, 7, u"<rules>") == 0 ||
851                 testString.compare(charIdx-1, 10, u"<badrules>") == 0) {
852                 charIdx = testString.indexOf(u'>', charIdx) + 1;
853                 parseState = PARSE_RULES;
854                 rules.remove();
855                 rulesFirstLine = lineNum;
856                 break;
857             }
858 
859             // <locale  loc_name>
860             localeMatcher.reset(testString);
861             if (localeMatcher.lookingAt(charIdx-1, status)) {
862                 UnicodeString localeName = localeMatcher.group(1, status);
863                 char localeName8[100];
864                 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
865                 locale = Locale::createFromName(localeName8);
866                 charIdx += localeMatcher.group(0, status).length() - 1;
867                 TEST_ASSERT_SUCCESS(status);
868                 break;
869             }
870             if (testString.compare(charIdx-1, 6, u"<data>") == 0) {
871                 parseState = PARSE_DATA;
872                 charIdx += 5;
873                 tp.dataToBreak = "";
874                 tp.expectedBreaks->removeAllElements();
875                 tp.srcCol ->removeAllElements();
876                 tp.srcLine->removeAllElements();
877                 break;
878             }
879 
880             errln("line %d: Tag expected in test file.", lineNum);
881             parseState = PARSE_COMMENT;
882             savedState = PARSE_DATA;
883             goto end_test; // Stop the test.
884             }
885             break;
886 
887         case PARSE_RULES:
888             if (testString.compare(charIdx-1, 8, u"</rules>") == 0) {
889                 charIdx += 7;
890                 parseState = PARSE_TAG;
891                 delete tp.bi;
892                 UParseError pe;
893                 tp.bi = new RuleBasedBreakIterator(rules, pe, status);
894                 skipTest = U_FAILURE(status);
895                 if (U_FAILURE(status)) {
896                     errln("file rbbitst.txt: %d - Error %s creating break iterator from rules.",
897                         rulesFirstLine + pe.line - 1, u_errorName(status));
898                 }
899             } else if (testString.compare(charIdx-1, 11, u"</badrules>") == 0) {
900                 charIdx += 10;
901                 parseState = PARSE_TAG;
902                 UErrorCode ec = U_ZERO_ERROR;
903                 UParseError pe;
904                 RuleBasedBreakIterator bi(rules, pe, ec);
905                 if (U_SUCCESS(ec)) {
906                     errln("file rbbitst.txt: %d - Expected, but did not get, a failure creating break iterator from rules.",
907                         rulesFirstLine + pe.line - 1);
908                 }
909             } else {
910                 rules.append(c);
911             }
912             break;
913 
914         case PARSE_DATA:
915             if (c == u'•') {
916                 int32_t  breakIdx = tp.dataToBreak.length();
917                 if (tp.expectedBreaks->size() > breakIdx) {
918                     errln("rbbitst.txt:%d:%d adjacent expected breaks with no intervening test text",
919                           lineNum, column);
920                 }
921                 tp.expectedBreaks->setSize(breakIdx+1);
922                 tp.expectedBreaks->setElementAt(-1, breakIdx);
923                 tp.srcLine->setSize(breakIdx+1);
924                 tp.srcLine->setElementAt(lineNum, breakIdx);
925                 tp.srcCol ->setSize(breakIdx+1);
926                 tp.srcCol ->setElementAt(column, breakIdx);
927                 break;
928             }
929 
930             if (testString.compare(charIdx-1, 7, u"</data>") == 0) {
931                 // Add final entry to mappings from break location to source file position.
932                 //  Need one extra because last break position returned is after the
933                 //    last char in the data, not at the last char.
934                 tp.srcLine->addElement(lineNum, status);
935                 tp.srcCol ->addElement(column, status);
936 
937                 parseState = PARSE_TAG;
938                 charIdx += 6;
939 
940                 if (!skipTest) {
941                     // RUN THE TEST!
942                     status = U_ZERO_ERROR;
943                     tp.setUTF16(status);
944                     executeTest(&tp, status);
945                     TEST_ASSERT_SUCCESS(status);
946 
947                     // Run again, this time with UTF-8 text wrapped in a UText.
948                     status = U_ZERO_ERROR;
949                     tp.setUTF8(status);
950                     TEST_ASSERT_SUCCESS(status);
951                     executeTest(&tp, status);
952                 }
953                 break;
954             }
955 
956             if (testString.compare(charIdx-1, 3, u"\\N{") == 0) {
957                 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
958                 // Get the code point from the name and insert it into the test data.
959                 //   (Damn, no API takes names in Unicode  !!!
960                 //    we've got to take it back to char *)
961                 int32_t nameEndIdx = testString.indexOf(u'}', charIdx);
962                 int32_t nameLength = nameEndIdx - (charIdx+2);
963                 char charNameBuf[200];
964                 UChar32 theChar = -1;
965                 if (nameEndIdx != -1) {
966                     UErrorCode status = U_ZERO_ERROR;
967                     testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
968                     charNameBuf[sizeof(charNameBuf)-1] = 0;
969                     theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
970                     if (U_FAILURE(status)) {
971                         theChar = -1;
972                     }
973                 }
974                 if (theChar == -1) {
975                     errln("Error in named character in test file at line %d, col %d",
976                         lineNum, column);
977                 } else {
978                     // Named code point was recognized.  Insert it
979                     //   into the test data.
980                     tp.dataToBreak.append(theChar);
981                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
982                         tp.srcLine->addElement(lineNum, status);
983                         tp.srcCol ->addElement(column, status);
984                     }
985                 }
986                 if (nameEndIdx > charIdx) {
987                     charIdx = nameEndIdx+1;
988 
989                 }
990                 break;
991             }
992 
993 
994 
995             if (testString.compare(charIdx-1, 2, u"<>") == 0) {
996                 charIdx++;
997                 int32_t  breakIdx = tp.dataToBreak.length();
998                 tp.expectedBreaks->setSize(breakIdx+1);
999                 tp.expectedBreaks->setElementAt(-1, breakIdx);
1000                 tp.srcLine->setSize(breakIdx+1);
1001                 tp.srcLine->setElementAt(lineNum, breakIdx);
1002                 tp.srcCol ->setSize(breakIdx+1);
1003                 tp.srcCol ->setElementAt(column, breakIdx);
1004                 break;
1005             }
1006 
1007             if (c == u'<') {
1008                 tagValue   = 0;
1009                 parseState = PARSE_NUM;
1010                 break;
1011             }
1012 
1013             if (c == u'#' && column==3) {   // TODO:  why is column off so far?
1014                 parseState = PARSE_COMMENT;
1015                 savedState = PARSE_DATA;
1016                 break;
1017             }
1018 
1019             if (c == u'\\') {
1020                 // Check for \ at end of line, a line continuation.
1021                 //     Advance over (discard) the newline
1022                 UChar32 cp = testString.char32At(charIdx);
1023                 if (cp == u'\r' && charIdx<len && testString.charAt(charIdx+1) == u'\n') {
1024                     // We have a CR LF
1025                     //  Need an extra increment of the input ptr to move over both of them
1026                     charIdx++;
1027                 }
1028                 if (cp == u'\n' || cp == u'\r') {
1029                     lineNum++;
1030                     colStart = charIdx;
1031                     charIdx++;
1032                     break;
1033                 }
1034 
1035                 // Let unescape handle the back slash.
1036                 cp = testString.unescapeAt(charIdx);
1037                 if (cp != -1) {
1038                     // Escape sequence was recognized.  Insert the char
1039                     //   into the test data.
1040                     tp.dataToBreak.append(cp);
1041                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
1042                         tp.srcLine->addElement(lineNum, status);
1043                         tp.srcCol ->addElement(column, status);
1044                     }
1045                     break;
1046                 }
1047 
1048 
1049                 // Not a recognized backslash escape sequence.
1050                 // Take the next char as a literal.
1051                 //  TODO:  Should this be an error?
1052                 c = testString.charAt(charIdx);
1053                 charIdx = testString.moveIndex32(charIdx, 1);
1054             }
1055 
1056             // Normal, non-escaped data char.
1057             tp.dataToBreak.append(c);
1058 
1059             // Save the mapping from offset in the data to line/column numbers in
1060             //   the original input file.  Will be used for better error messages only.
1061             //   If there's an expected break before this char, the slot in the mapping
1062             //     vector will already be set for this char; don't overwrite it.
1063             if (tp.dataToBreak.length() > tp.srcLine->size()) {
1064                 tp.srcLine->addElement(lineNum, status);
1065                 tp.srcCol ->addElement(column, status);
1066             }
1067             break;
1068 
1069 
1070         case PARSE_NUM:
1071             // We are parsing an expected numeric tag value, like <1234>,
1072             //   within a chunk of data.
1073             if (u_isUWhiteSpace(c)) {
1074                 break;
1075             }
1076 
1077             if (c == u'>') {
1078                 // Finished the number.  Add the info to the expected break data,
1079                 //   and switch parse state back to doing plain data.
1080                 parseState = PARSE_DATA;
1081                 if (tagValue == 0) {
1082                     tagValue = -1;
1083                 }
1084                 int32_t  breakIdx = tp.dataToBreak.length();
1085                 if (tp.expectedBreaks->size() > breakIdx) {
1086                     errln("rbbitst.txt:%d:%d adjacent expected breaks with no intervening test text",
1087                           lineNum, column);
1088                 }
1089                 tp.expectedBreaks->setSize(breakIdx+1);
1090                 tp.expectedBreaks->setElementAt(tagValue, breakIdx);
1091                 tp.srcLine->setSize(breakIdx+1);
1092                 tp.srcLine->setElementAt(lineNum, breakIdx);
1093                 tp.srcCol ->setSize(breakIdx+1);
1094                 tp.srcCol ->setElementAt(column, breakIdx);
1095                 break;
1096             }
1097 
1098             if (u_isdigit(c)) {
1099                 tagValue = tagValue*10 + u_charDigitValue(c);
1100                 break;
1101             }
1102 
1103             errln("Syntax Error in test file at line %d, col %d",
1104                 lineNum, column);
1105             parseState = PARSE_COMMENT;
1106             goto end_test; // Stop the test
1107             break;
1108         }
1109 
1110 
1111         if (U_FAILURE(status)) {
1112             dataerrln("ICU Error %s while parsing test file at line %d.",
1113                 u_errorName(status), lineNum);
1114             status = U_ZERO_ERROR;
1115             goto end_test; // Stop the test
1116         }
1117 
1118     }
1119 
1120     // Reached end of test file. Raise an error if parseState indicates that we are
1121     //   within a block that should have been terminated.
1122 
1123     if (parseState == PARSE_RULES) {
1124         errln("rbbitst.txt:%d <rules> block beginning at line %d is not closed.",
1125             lineNum, rulesFirstLine);
1126     }
1127     if (parseState == PARSE_DATA) {
1128         errln("rbbitst.txt:%d <data> block not closed.", lineNum);
1129     }
1130 
1131 
1132 end_test:
1133     delete [] testFile;
1134 #endif
1135 }
1136 
1137 //-------------------------------------------------------------------------------
1138 //
1139 //  TestDictRules   create a break iterator from source rules that includes a
1140 //                  dictionary range.   Regression for bug #7130.  Source rules
//                  do not declare a break iterator type (word, line, sentence, etc.),
//                  but the dictionary code, without a type, would loop.
1143 //
1144 //-------------------------------------------------------------------------------
TestDictRules()1145 void RBBITest::TestDictRules() {
1146     const char *rules =  "$dictionary = [a-z]; \n"
1147                          "!!forward; \n"
1148                          "$dictionary $dictionary; \n"
1149                          "!!reverse; \n"
1150                          "$dictionary $dictionary; \n";
1151     const char *text = "aa";
1152     UErrorCode status = U_ZERO_ERROR;
1153     UParseError parseError;
1154 
1155     RuleBasedBreakIterator bi(rules, parseError, status);
1156     if (U_SUCCESS(status)) {
1157         UnicodeString utext = text;
1158         bi.setText(utext);
1159         int32_t position;
1160         int32_t loops;
1161         for (loops = 0; loops<10; loops++) {
1162             position = bi.next();
1163             if (position == RuleBasedBreakIterator::DONE) {
1164                 break;
1165             }
1166         }
1167         TEST_ASSERT(loops == 1);
1168     } else {
1169         dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
1170     }
1171 }
1172 
1173 
1174 
1175 //--------------------------------------------------------------------------------------------
1176 //
1177 //   Run tests from each of the boundary test data files distributed by the Unicode Consortium
1178 //
1179 //-------------------------------------------------------------------------------------------
TestUnicodeFiles()1180 void RBBITest::TestUnicodeFiles() {
1181     RuleBasedBreakIterator  *bi;
1182     UErrorCode               status = U_ZERO_ERROR;
1183 
1184     bi =  (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
1185     TEST_ASSERT_SUCCESS(status);
1186     if (U_SUCCESS(status)) {
1187         runUnicodeTestData("GraphemeBreakTest.txt", bi);
1188     }
1189     delete bi;
1190 
1191     bi =  (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
1192     TEST_ASSERT_SUCCESS(status);
1193     if (U_SUCCESS(status)) {
1194         runUnicodeTestData("WordBreakTest.txt", bi);
1195     }
1196     delete bi;
1197 
1198     bi =  (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
1199     TEST_ASSERT_SUCCESS(status);
1200     if (U_SUCCESS(status)) {
1201         runUnicodeTestData("SentenceBreakTest.txt", bi);
1202     }
1203     delete bi;
1204 
1205     bi =  (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
1206     TEST_ASSERT_SUCCESS(status);
1207     if (U_SUCCESS(status)) {
1208         runUnicodeTestData("LineBreakTest.txt", bi);
1209     }
1210     delete bi;
1211 }
1212 
1213 
1214 // Check for test cases from the Unicode test data files that are known to fail
1215 // and should be skipped as known issues because ICU does not fully implement
1216 // the Unicode specifications, or because ICU includes tailorings that differ from
1217 // the Unicode standard.
1218 //
1219 // Test cases are identified by the test data sequence, which tends to be more stable
1220 // across Unicode versions than the test file line numbers.
1221 //
1222 // The test case with ticket "10666" is a dummy, included as an example.
1223 
testCaseIsKnownIssue(const UnicodeString & testCase,const char * fileName)1224 UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char *fileName) {
1225     static struct TestCase {
1226         const char *fTicketNum;
1227         const char *fFileName;
1228         const UChar *fString;
1229     } badTestCases[] = {
1230         {"10666", "GraphemeBreakTest.txt", u"\u0020\u0020\u0033"},    // Fake example, for illustration.
1231         // The following tests were originally for
1232         // Issue 8151, move the Finnish tailoring of the line break of hyphens to root.
1233         // However, that ticket has been closed as fixed but these tests still fail, so
1234         // ICU-21097 has been created to investigate and address these remaining issues.
1235         {"21097",  "LineBreakTest.txt", u"-#"},
1236         {"21097",  "LineBreakTest.txt", u"\u002d\u0308\u0023"},
1237         {"21097",  "LineBreakTest.txt", u"\u002d\u00a7"},
1238         {"21097",  "LineBreakTest.txt", u"\u002d\u0308\u00a7"},
1239         {"21097",  "LineBreakTest.txt", u"\u002d\U00050005"},
1240         {"21097",  "LineBreakTest.txt", u"\u002d\u0308\U00050005"},
1241         {"21097",  "LineBreakTest.txt", u"\u002d\u0e01"},
1242         {"21097",  "LineBreakTest.txt", u"\u002d\u0308\u0e01"},
1243 
1244         // The following tests were originally for
1245         // Issue ICU-12017 Improve line break around numbers.
1246         // However, that ticket has been closed as fixed but these tests still fail, so
1247         // ICU-21097 has been created to investigate and address these remaining issues.
1248         {"21097", "LineBreakTest.txt", u"\u002C\u0030"},   // ",0"
1249         {"21097", "LineBreakTest.txt", u"\u002C\u0308\u0030"},
1250         {"21097", "LineBreakTest.txt", u"equals .35 cents"},
1251         {"21097", "LineBreakTest.txt", u"a.2 "},
1252         {"21097", "LineBreakTest.txt", u"a.2 \u0915"},
1253         {"21097", "LineBreakTest.txt", u"a.2 \u672C"},
1254         {"21097", "LineBreakTest.txt", u"a.2\u3000\u672C"},
1255         {"21097", "LineBreakTest.txt", u"a.2\u3000\u307E"},
1256         {"21097", "LineBreakTest.txt", u"a.2\u3000\u0033"},
1257         {"21097", "LineBreakTest.txt", u"A.1 \uBABB"},
1258         {"21097", "LineBreakTest.txt", u"\uBD24\uC5B4\u002E\u0020\u0041\u002E\u0032\u0020\uBCFC"},
1259         {"21097", "LineBreakTest.txt", u"\uBD10\uC694\u002E\u0020\u0041\u002E\u0033\u0020\uBABB"},
1260         {"21097", "LineBreakTest.txt", u"\uC694\u002E\u0020\u0041\u002E\u0034\u0020\uBABB"},
1261         {"21097", "LineBreakTest.txt", u"a.2\u3000\u300C"},
1262     };
1263 
1264     for (int n=0; n<UPRV_LENGTHOF(badTestCases); n++) {
1265         const TestCase &badCase = badTestCases[n];
1266         if (!strcmp(fileName, badCase.fFileName) &&
1267                 testCase == UnicodeString(badCase.fString)) {
1268             return logKnownIssue(badCase.fTicketNum);
1269         }
1270     }
1271     return FALSE;
1272 }
1273 
1274 
1275 //--------------------------------------------------------------------------------------------
1276 //
1277 //   Run tests from one of the boundary test data files distributed by the Unicode Consortium
1278 //
1279 //-------------------------------------------------------------------------------------------
runUnicodeTestData(const char * fileName,RuleBasedBreakIterator * bi)1280 void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
1281 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1282     UErrorCode  status = U_ZERO_ERROR;
1283 
1284     //
1285     //  Open and read the test data file, put it into a UnicodeString.
1286     //
1287     const char *testDataDirectory = IntlTest::getSourceTestData(status);
1288     char testFileName[1000];
1289     if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1290         dataerrln("Can't open test data.  Path too long.");
1291         return;
1292     }
1293     strcpy(testFileName, testDataDirectory);
1294     strcat(testFileName, fileName);
1295 
1296     logln("Opening data file %s\n", fileName);
1297 
1298     int    len;
1299     UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1300     if (status != U_FILE_ACCESS_ERROR) {
1301         TEST_ASSERT_SUCCESS(status);
1302         TEST_ASSERT(testFile != NULL);
1303     }
1304     if (U_FAILURE(status) || testFile == NULL) {
1305         return; /* something went wrong, error already output */
1306     }
1307     UnicodeString testFileAsString(TRUE, testFile, len);
1308 
1309     //
1310     //  Parse the test data file using a regular expression.
1311     //  Each kind of token is recognized in its own capture group; what type of item was scanned
1312     //     is identified by which group had a match.
1313     //
1314     //    Capture Group  #                  1          2            3            4           5
1315     //    Parses this item:               divide       x      hex digits   comment \n  unrecognized \n
1316     //
1317     UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
1318     RegexMatcher    tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
1319     UnicodeString   testString;
1320     UVector32       breakPositions(status);
1321     int             lineNumber = 1;
1322     TEST_ASSERT_SUCCESS(status);
1323     if (U_FAILURE(status)) {
1324         return;
1325     }
1326 
1327     //
1328     //  Scan through each test case, building up the string to be broken in testString,
1329     //   and the positions that should be boundaries in the breakPositions vector.
1330     //
1331     int spin = 0;
1332     while (tokenMatcher.find()) {
1333         if(tokenMatcher.hitEnd()) {
1334           /* Shouldn't Happen(TM).  This means we didn't find the symbols we were looking for.
1335              This occurred when the text file was corrupt (wasn't marked as UTF-8)
1336              and caused an infinite loop here on EBCDIC systems!
1337           */
1338           fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
1339           //       return;
1340         }
1341         if (tokenMatcher.start(1, status) >= 0) {
1342             // Scanned a divide sign, indicating a break position in the test data.
1343             if (testString.length()>0) {
1344                 breakPositions.addElement(testString.length(), status);
1345             }
1346         }
1347         else if (tokenMatcher.start(2, status) >= 0) {
1348             // Scanned an 'x', meaning no break at this position in the test data
1349             //   Nothing to be done here.
1350             }
1351         else if (tokenMatcher.start(3, status) >= 0) {
1352             // Scanned Hex digits.  Convert them to binary, append to the character data string.
1353             const UnicodeString &hexNumber = tokenMatcher.group(3, status);
1354             int length = hexNumber.length();
1355             if (length<=8) {
1356                 char buf[10];
1357                 hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
1358                 UChar32 c = (UChar32)strtol(buf, NULL, 16);
1359                 if (c<=0x10ffff) {
1360                     testString.append(c);
1361                 } else {
1362                     errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
1363                        fileName, lineNumber);
1364                 }
1365             } else {
1366                 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
1367                        fileName, lineNumber);
1368              }
1369         }
1370         else if (tokenMatcher.start(4, status) >= 0) {
1371             // Scanned to end of a line, possibly skipping over a comment in the process.
1372             //   If the line from the file contained test data, run the test now.
1373             if (testString.length() > 0 && !testCaseIsKnownIssue(testString, fileName)) {
1374                 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
1375             }
1376 
1377             // Clear out this test case.
1378             //    The string and breakPositions vector will be refilled as the next
1379             //       test case is parsed.
1380             testString.remove();
1381             breakPositions.removeAllElements();
1382             lineNumber++;
1383         } else {
1384             // Scanner catchall.  Something unrecognized appeared on the line.
1385             char token[16];
1386             UnicodeString uToken = tokenMatcher.group(0, status);
1387             uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
1388             token[sizeof(token)-1] = 0;
1389             errln("Syntax error in test data file \'%s\', line %d.  Scanning \"%s\"\n", fileName, lineNumber, token);
1390 
1391             // Clean up, in preparation for continuing with the next line.
1392             testString.remove();
1393             breakPositions.removeAllElements();
1394             lineNumber++;
1395         }
1396         TEST_ASSERT_SUCCESS(status);
1397         if (U_FAILURE(status)) {
1398             break;
1399         }
1400     }
1401 
1402     delete [] testFile;
1403  #endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
1404 }
1405 
1406 //--------------------------------------------------------------------------------------------
1407 //
1408 //   checkUnicodeTestCase()   Run one test case from one of the Unicode Consortium
1409 //                            test data files.  Do only a simple, forward-only check -
1410 //                            this test is mostly to check that ICU and the Unicode
1411 //                            data agree with each other.
1412 //
1413 //--------------------------------------------------------------------------------------------
checkUnicodeTestCase(const char * testFileName,int lineNumber,const UnicodeString & testString,UVector32 * breakPositions,RuleBasedBreakIterator * bi)1414 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
1415                          const UnicodeString &testString,   // Text data to be broken
1416                          UVector32 *breakPositions,         // Positions where breaks should be found.
1417                          RuleBasedBreakIterator *bi) {
1418     int32_t pos;                 // Break Position in the test string
1419     int32_t expectedI = 0;       // Index of expected break position in the vector of expected results.
1420     int32_t expectedPos;         // Expected break position (index into test string)
1421 
1422     bi->setText(testString);
1423     pos = bi->first();
1424     pos = bi->next();
1425 
1426     while (pos != BreakIterator::DONE) {
1427         if (expectedI >= breakPositions->size()) {
1428             errln("Test file \"%s\", line %d, unexpected break found at position %d",
1429                 testFileName, lineNumber, pos);
1430             break;
1431         }
1432         expectedPos = breakPositions->elementAti(expectedI);
1433         if (pos < expectedPos) {
1434             errln("Test file \"%s\", line %d, unexpected break found at position %d",
1435                 testFileName, lineNumber, pos);
1436             break;
1437         }
1438         if (pos > expectedPos) {
1439             errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1440                 testFileName, lineNumber, expectedPos);
1441             break;
1442         }
1443         pos = bi->next();
1444         expectedI++;
1445     }
1446 
1447     if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
1448         errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1449             testFileName, lineNumber, breakPositions->elementAti(expectedI));
1450     }
1451 }
1452 
1453 
1454 
1455 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1456 //---------------------------------------------------------------------------------------
1457 //
1458 //   class RBBIMonkeyKind
1459 //
1460 //      Monkey Test for Break Iteration
1461 //      Abstract interface class.   Concrete derived classes independently
1462 //      implement the break rules for different iterator types.
1463 //
//      The Monkey Test itself doesn't know which type of break iterator it is
//      testing, but works purely in terms of the interface defined here.
1466 //
1467 //---------------------------------------------------------------------------------------
// Abstract base for the monkey-test reference implementations.  Derived classes
// supply charClasses(), setText() and next(); this base holds the class-name list
// and the per-position "applied rule" strings.
class RBBIMonkeyKind {
public:
    // Return a UVector of UnicodeSets, representing the character classes used
    //   for this type of iterator.
    virtual  UVector  *charClasses() = 0;

    // Set the test text on which subsequent calls to next() will operate
    virtual  void      setText(const UnicodeString &s) = 0;

    // Find the next break position, starting from the prev break position, or from zero.
    // Return -1 after reaching end of string.
    virtual  int32_t   next(int32_t i) = 0;

    // Name of each character class, parallel with charClasses. Used for debugging output
    // of characters.  The base implementation returns the classNames member.
    virtual  std::vector<std::string>&     characterClassNames();

    // Store `value` in appliedRules[position].  No bounds check is performed.
    void setAppliedRule(int32_t position, const char* value);

    // Return a copy of appliedRules[position].  No bounds check is performed.
    std::string getAppliedRule(int32_t position);

    virtual ~RBBIMonkeyKind();
    // Set to U_ZERO_ERROR by the constructor.
    // NOTE(review): presumably records errors from derived-class setup - confirm.
    UErrorCode deferredStatus;

    // Look up `c` in charClasses() and return the entry of classNames at the index
    // of the first set that contains it.
    std::string classNameFromCodepoint(const UChar32 c);
    // NOTE(review): presumably the length of the longest class name - confirm
    // against the definition.
    unsigned int maxClassNameSize();

 protected:
     // Protected: only derived classes construct this base.
     RBBIMonkeyKind();
     std::vector<std::string> classNames;
     std::vector<std::string> appliedRules;

    // Clear `appliedRules` and fill it with size + 1 empty strings, where `size` is
    // expected to be the length of the test text.
    void prepareAppliedRules(int32_t size );

 private:

};
1506 
RBBIMonkeyKind()1507 RBBIMonkeyKind::RBBIMonkeyKind() {
1508     deferredStatus = U_ZERO_ERROR;
1509 }
1510 
~RBBIMonkeyKind()1511 RBBIMonkeyKind::~RBBIMonkeyKind() {
1512 }
1513 
characterClassNames()1514 std::vector<std::string>& RBBIMonkeyKind::characterClassNames() {
1515     return classNames;
1516 }
1517 
prepareAppliedRules(int32_t size)1518 void RBBIMonkeyKind::prepareAppliedRules(int32_t size) {
1519     // Remove all the information in the `appliedRules`.
1520     appliedRules.clear();
1521     appliedRules.resize(size + 1);
1522 }
1523 
setAppliedRule(int32_t position,const char * value)1524 void RBBIMonkeyKind::setAppliedRule(int32_t position, const char* value) {
1525     appliedRules[position] = value;
1526 }
1527 
getAppliedRule(int32_t position)1528 std::string RBBIMonkeyKind::getAppliedRule(int32_t position){
1529     return appliedRules[position];
1530 }
1531 
classNameFromCodepoint(const UChar32 c)1532 std::string RBBIMonkeyKind::classNameFromCodepoint(const UChar32 c) {
1533     // Simply iterate through charClasses to find character's class
1534     for (int aClassNum = 0; aClassNum < charClasses()->size(); aClassNum++) {
1535         UnicodeSet *classSet = (UnicodeSet *)charClasses()->elementAt(aClassNum);
1536         if (classSet->contains(c)) {
1537             return classNames[aClassNum];
1538         }
1539     }
1540     U_ASSERT(FALSE);  // This should not happen.
1541     return "bad class name";
1542 }
1543 
maxClassNameSize()1544 unsigned int RBBIMonkeyKind::maxClassNameSize() {
1545     unsigned int maxSize = 0;
1546     for (int aClassNum = 0; aClassNum < charClasses()->size(); aClassNum++) {
1547         auto aClassNumSize = static_cast<unsigned int>(classNames[aClassNum].size());
1548         if (aClassNumSize > maxSize) {
1549             maxSize = aClassNumSize;
1550         }
1551     }
1552     return maxSize;
1553 }
1554 
1555 //----------------------------------------------------------------------------------------
1556 //
1557 //   Random Numbers.  Similar to standard lib rand() and srand()
1558 //                    Not using library to
1559 //                      1.  Get same results on all platforms.
1560 //                      2.  Get access to current seed, to more easily reproduce failures.
1561 //
1562 //---------------------------------------------------------------------------------------
// Current generator state; assign to it to replay a particular sequence.
static uint32_t m_seed = 1;

// Step the linear congruential generator once (arithmetic wraps modulo 2^32)
// and return a value in the range 0..32767 taken from bits 16..30 of the
// new state.
static uint32_t m_rand()
{
    const uint32_t multiplier = 1103515245;
    const uint32_t increment  = 12345;
    m_seed = multiplier * m_seed + increment;
    // Dropping the low 16 bits and masking to 15 bits is the same as
    // (m_seed / 65536) % 32768.
    return (m_seed >> 16) & 0x7fffu;
}
1570 
1571 
1572 //------------------------------------------------------------------------------------------
1573 //
1574 //   class RBBICharMonkey      Character (Grapheme Cluster) specific implementation
1575 //                             of RBBIMonkeyKind.
1576 //
1577 //------------------------------------------------------------------------------------------
1578 class RBBICharMonkey: public RBBIMonkeyKind {
1579 public:
1580     RBBICharMonkey();
1581     virtual          ~RBBICharMonkey();
1582     virtual  UVector *charClasses() override;
1583     virtual  void     setText(const UnicodeString &s) override;
1584     virtual  int32_t  next(int32_t i) override;
1585 private:
1586     UVector   *fSets;
1587 
1588     UnicodeSet  *fCRLFSet;
1589     UnicodeSet  *fControlSet;
1590     UnicodeSet  *fExtendSet;
1591     UnicodeSet  *fZWJSet;
1592     UnicodeSet  *fRegionalIndicatorSet;
1593     UnicodeSet  *fPrependSet;
1594     UnicodeSet  *fSpacingSet;
1595     UnicodeSet  *fLSet;
1596     UnicodeSet  *fVSet;
1597     UnicodeSet  *fTSet;
1598     UnicodeSet  *fLVSet;
1599     UnicodeSet  *fLVTSet;
1600     UnicodeSet  *fHangulSet;
1601     UnicodeSet  *fExtendedPictSet;
1602     UnicodeSet  *fViramaSet;
1603     UnicodeSet  *fLinkingConsonantSet;
1604     UnicodeSet  *fExtCccZwjSet;
1605     UnicodeSet  *fAnySet;
1606 
1607     const UnicodeString *fText;
1608 };
1609 
1610 
RBBICharMonkey()1611 RBBICharMonkey::RBBICharMonkey() {
1612     UErrorCode  status = U_ZERO_ERROR;
1613 
1614     fText = NULL;
1615 
1616     fCRLFSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
1617     fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Control}]]"), status);
1618     fExtendSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Extend}]]"), status);
1619     fZWJSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = ZWJ}]"), status);
1620     fRegionalIndicatorSet =
1621                   new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
1622     fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
1623     fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
1624     fLSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
1625     fVSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
1626     fTSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
1627     fLVSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
1628     fLVTSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
1629     fHangulSet  = new UnicodeSet();
1630     fHangulSet->addAll(*fLSet);
1631     fHangulSet->addAll(*fVSet);
1632     fHangulSet->addAll(*fTSet);
1633     fHangulSet->addAll(*fLVSet);
1634     fHangulSet->addAll(*fLVTSet);
1635 
1636     fExtendedPictSet  = new UnicodeSet(u"[:Extended_Pictographic:]", status);
1637     fViramaSet        = new UnicodeSet(u"[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
1638                                         "\\p{Indic_Syllabic_Category=Virama}]", status);
1639     fLinkingConsonantSet = new UnicodeSet(u"[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
1640                                         "\\p{Indic_Syllabic_Category=Consonant}]", status);
1641     fExtCccZwjSet     = new UnicodeSet(u"[[\\p{gcb=Extend}-\\p{ccc=0}] \\p{gcb=ZWJ}]", status);
1642     fAnySet           = new UnicodeSet(0, 0x10ffff);
1643 
1644     // Create sets of characters, and add the names of the above character sets.
1645     // In each new ICU release, add new names corresponding to the sets above.
1646     fSets             = new UVector(status);
1647 
1648     // Important: Keep class names the same as the class contents.
1649     fSets->addElement(fCRLFSet, status); classNames.push_back("CRLF");
1650     fSets->addElement(fControlSet, status); classNames.push_back("Control");
1651     fSets->addElement(fExtendSet, status); classNames.push_back("Extended");
1652     fSets->addElement(fRegionalIndicatorSet, status); classNames.push_back("RegionalIndicator");
1653     if (!fPrependSet->isEmpty()) {
1654         fSets->addElement(fPrependSet, status); classNames.push_back("Prepend");
1655     }
1656     fSets->addElement(fSpacingSet, status); classNames.push_back("Spacing");
1657     fSets->addElement(fHangulSet, status); classNames.push_back("Hangul");
1658     fSets->addElement(fZWJSet, status); classNames.push_back("ZWJ");
1659     fSets->addElement(fExtendedPictSet, status); classNames.push_back("ExtendedPict");
1660     fSets->addElement(fViramaSet, status); classNames.push_back("Virama");
1661     fSets->addElement(fLinkingConsonantSet, status); classNames.push_back("LinkingConsonant");
1662     fSets->addElement(fExtCccZwjSet, status); classNames.push_back("ExtCcccZwj");
1663     fSets->addElement(fAnySet, status); classNames.push_back("Any");
1664 
1665     if (U_FAILURE(status)) {
1666         deferredStatus = status;
1667     }
1668 }
1669 
1670 
setText(const UnicodeString & s)1671 void RBBICharMonkey::setText(const UnicodeString &s) {
1672     fText = &s;
1673     prepareAppliedRules(s.length());
1674 }
1675 
1676 
1677 
// Find the grapheme cluster boundary that follows prevPos in the text set by
// setText(), using the hand-coded rule checks below (labelled GB3 ... GB999).
//
// The checks are tried in order for each candidate position p2; the first one
// that matches decides whether p2 is a boundary ("break" leaves the loop) or
// not ("continue" moves on to the next position). The label of the deciding
// rule is recorded for p2 through setAppliedRule().
//
// Returns -1 if deferredStatus holds a failure or if prevPos is at or beyond
// the end of the text; otherwise returns the index of the boundary found.
int32_t RBBICharMonkey::next(int32_t prevPos) {
    int    p0, p1, p2, p3;    // Indices of the significant code points around the
                              //   break position being tested.  The candidate break
                              //   location is before p2.

    int     breakPos = -1;

    UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
    UChar32 cBase;            // for (X Extend*) patterns, the X character.

    // Set-up of the character classes failed; no meaningful answer is possible.
    if (U_FAILURE(deferredStatus)) {
        return -1;
    }

    // Previous break at end of string.  return DONE.
    if (prevPos >= fText->length()) {
        return -1;
    }

    // All four positions start at prevPos; the loop below shifts them forward
    // until p1 and p2 refer to two different positions.
    p0 = p1 = p2 = p3 = prevPos;
    c3 =  fText->char32At(prevPos);
    c0 = c1 = c2 = cBase = 0;
    (void)p0;   // suppress set but not used warning.
    (void)c0;

    // Loop runs once per "significant" character position in the input text.
    for (;;) {
        // Move all of the positions forward in the input string.
        p0 = p1;  c0 = c1;
        p1 = p2;  c1 = c2;
        p2 = p3;  c2 = c3;

        // Advance p3 by one codepoint
        p3 = fText->moveIndex32(p3, 1);
        c3 = fText->char32At(p3);

        if (p1 == p2) {
            // Still warming up the loop.  (won't work with zero length strings, but we don't care)
            continue;
        }

        // The end of the text is always reported as a boundary.
        if (p2 == fText->length()) {
            setAppliedRule(p2, "End of String");
            break;
        }

        //     No Extend or Format characters may appear between the CR and LF,
        //     which requires the additional check for p2 immediately following p1.
        //
        if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
          setAppliedRule(p2, "GB3   CR x LF");
          continue;
        }

        // Break after a control character, CR or LF ...
        if (fControlSet->contains(c1) ||
            c1 == 0x0D ||
            c1 == 0x0A)  {
          setAppliedRule(p2, "GB4   ( Control | CR | LF ) <break>");
          break;
        }

        // ... and before one.
        if (fControlSet->contains(c2) ||
            c2 == 0x0D ||
            c2 == 0x0A)  {
            setAppliedRule(p2, "GB5   <break>  ( Control | CR | LF )");
            break;
        }

        // GB6 - GB8: the Hangul syllable sequences that are kept together.
        if (fLSet->contains(c1) &&
               (fLSet->contains(c2)  ||
                fVSet->contains(c2)  ||
                fLVSet->contains(c2) ||
                fLVTSet->contains(c2))) {
            setAppliedRule(p2, "GB6   L x ( L | V | LV | LVT )");
            continue;
        }

        if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
            (fVSet->contains(c2) || fTSet->contains(c2)))  {
            setAppliedRule(p2, "GB7    ( LV | V )  x  ( V | T )");
            continue;
        }

        if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
            fTSet->contains(c2))  {
            setAppliedRule(p2, "GB8   ( LVT | T)  x T");
            continue;
        }

        // No break before Extend or ZWJ. cBase is only replaced when c1 is not
        // itself in fExtendSet, so across a run of Extend characters it keeps
        // the character that preceded the run; the GB11 check below reads it.
        if (fExtendSet->contains(c2) || fZWJSet->contains(c2))  {
            if (!fExtendSet->contains(c1)) {
                cBase = c1;
            }
            setAppliedRule(p2, "GB9   x (Extend | ZWJ)");
            continue;
        }

        if (fSpacingSet->contains(c2)) {
            setAppliedRule(p2, "GB9a  x  SpacingMark");
            continue;
        }

        if (fPrependSet->contains(c1)) {
            setAppliedRule(p2, "GB9b  Prepend x");
            continue;
        }

        //   Note: Viramas are also included in the ExtCccZwj class.
        //   Walk backwards from p1 over characters in fExtCccZwjSet, noting
        //   whether any of them is in fViramaSet. The walk stops at index 0 or
        //   at the first character outside fExtCccZwjSet; that character must
        //   be a linking consonant for the rule to apply.
        if (fLinkingConsonantSet->contains(c2)) {
            int pi = p1;
            bool sawVirama = false;
            while (pi > 0 && fExtCccZwjSet->contains(fText->char32At(pi))) {
                if (fViramaSet->contains(fText->char32At(pi))) {
                    sawVirama = true;
                }
                pi = fText->moveIndex32(pi, -1);
            }
            if (sawVirama && fLinkingConsonantSet->contains(fText->char32At(pi))) {
              setAppliedRule(p2, "GB9.3  LinkingConsonant ExtCccZwj* Virama ExtCccZwj* × LinkingConsonant");
              continue;
            }
        }

        // Uses cBase (see the GB9 check above) as the character before the
        // Extend run that leads up to the ZWJ at c1.
        if (fExtendedPictSet->contains(cBase) && fZWJSet->contains(c1) && fExtendedPictSet->contains(c2)) {
          setAppliedRule(p2, "GB11  Extended_Pictographic Extend * ZWJ x Extended_Pictographic");
          continue;
        }

        //                   Note: The first if condition is a little tricky. We only need to force
        //                      a break if there are three or more contiguous RIs. If there are
        //                      only two, a break following will occur via other rules, and will include
        //                      any trailing extend characters, which is needed behavior.
        // NOTE(review): the label recorded in this first branch reads "x" (no break)
        //                      although the branch breaks; it is identical to the label of the
        //                      no-break branch that follows.
        if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)
                && fRegionalIndicatorSet->contains(c2)) {
          setAppliedRule(p2, "GB12-13  Regional_Indicator x Regional_Indicator");
          break;
        }
        if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
          setAppliedRule(p2, "GB12-13  Regional_Indicator x Regional_Indicator");
          continue;
        }

        // Nothing above matched: break everywhere else.
        setAppliedRule(p2, "GB999 Any <break> Any");
        break;
    }

    // The loop is only left with p2 sitting on the boundary that was found.
    breakPos = p2;
    return breakPos;
}
1827 
1828 
1829 
charClasses()1830 UVector  *RBBICharMonkey::charClasses() {
1831     return fSets;
1832 }
1833 
~RBBICharMonkey()1834 RBBICharMonkey::~RBBICharMonkey() {
1835     delete fSets;
1836     delete fCRLFSet;
1837     delete fControlSet;
1838     delete fExtendSet;
1839     delete fRegionalIndicatorSet;
1840     delete fPrependSet;
1841     delete fSpacingSet;
1842     delete fLSet;
1843     delete fVSet;
1844     delete fTSet;
1845     delete fLVSet;
1846     delete fLVTSet;
1847     delete fHangulSet;
1848     delete fAnySet;
1849     delete fZWJSet;
1850     delete fExtendedPictSet;
1851     delete fViramaSet;
1852     delete fLinkingConsonantSet;
1853     delete fExtCccZwjSet;
1854 }
1855 
1856 //------------------------------------------------------------------------------------------
1857 //
1858 //   class RBBIWordMonkey      Word Break specific implementation
1859 //                             of RBBIMonkeyKind.
1860 //
1861 //------------------------------------------------------------------------------------------
1862 class RBBIWordMonkey: public RBBIMonkeyKind {
1863 public:
1864     RBBIWordMonkey();
1865     virtual          ~RBBIWordMonkey();
1866     virtual  UVector *charClasses() override;
1867     virtual  void     setText(const UnicodeString &s) override;
1868     virtual int32_t   next(int32_t i) override;
1869 private:
1870     UVector      *fSets;
1871 
1872     UnicodeSet  *fCRSet;
1873     UnicodeSet  *fLFSet;
1874     UnicodeSet  *fNewlineSet;
1875     UnicodeSet  *fRegionalIndicatorSet;
1876     UnicodeSet  *fKatakanaSet;
1877     UnicodeSet  *fHebrew_LetterSet;
1878     UnicodeSet  *fALetterSet;
1879     UnicodeSet  *fSingle_QuoteSet;
1880     UnicodeSet  *fDouble_QuoteSet;
1881     UnicodeSet  *fMidNumLetSet;
1882     UnicodeSet  *fMidLetterSet;
1883     UnicodeSet  *fMidNumSet;
1884     UnicodeSet  *fNumericSet;
1885     UnicodeSet  *fFormatSet;
1886     UnicodeSet  *fOtherSet = nullptr;
1887     UnicodeSet  *fExtendSet;
1888     UnicodeSet  *fExtendNumLetSet;
1889     UnicodeSet  *fWSegSpaceSet;
1890     UnicodeSet  *fDictionarySet = nullptr;
1891     UnicodeSet  *fZWJSet;
1892     UnicodeSet  *fExtendedPictSet;
1893 
1894     const UnicodeString  *fText;
1895 };
1896 
1897 
RBBIWordMonkey()1898 RBBIWordMonkey::RBBIWordMonkey()
1899 {
1900     UErrorCode  status = U_ZERO_ERROR;
1901 
1902     fSets            = new UVector(status);
1903 
1904     fCRSet            = new UnicodeSet(u"[\\p{Word_Break = CR}]",           status);
1905     fLFSet            = new UnicodeSet(u"[\\p{Word_Break = LF}]",           status);
1906     fNewlineSet       = new UnicodeSet(u"[\\p{Word_Break = Newline}]",      status);
1907     fKatakanaSet      = new UnicodeSet(u"[\\p{Word_Break = Katakana}]",     status);
1908     fRegionalIndicatorSet =  new UnicodeSet(u"[\\p{Word_Break = Regional_Indicator}]", status);
1909     fHebrew_LetterSet = new UnicodeSet(u"[\\p{Word_Break = Hebrew_Letter}]", status);
1910     fALetterSet       = new UnicodeSet(u"[\\p{Word_Break = ALetter}]", status);
1911     fSingle_QuoteSet  = new UnicodeSet(u"[\\p{Word_Break = Single_Quote}]",    status);
1912     fDouble_QuoteSet  = new UnicodeSet(u"[\\p{Word_Break = Double_Quote}]",    status);
1913     fMidNumLetSet     = new UnicodeSet(u"[\\p{Word_Break = MidNumLet}]",    status);
1914     fMidLetterSet     = new UnicodeSet(u"[\\p{Word_Break = MidLetter}]",    status);
1915     fMidNumSet        = new UnicodeSet(u"[\\p{Word_Break = MidNum}]",       status);
1916     fNumericSet       = new UnicodeSet(u"[\\p{Word_Break = Numeric}]", status);
1917     fFormatSet        = new UnicodeSet(u"[\\p{Word_Break = Format}]",       status);
1918     fExtendNumLetSet  = new UnicodeSet(u"[\\p{Word_Break = ExtendNumLet}]", status);
1919     // There are some sc=Hani characters with WB=Extend.
1920     // The break rules need to pick one or the other because
1921     // Extend overlapping with something else is messy.
1922     // For Unicode 13, we chose to keep U+16FF0 & U+16FF1
1923     // in $Han (for $dictionary) and out of $Extend.
1924     fExtendSet        = new UnicodeSet(u"[\\p{Word_Break = Extend}-[:Hani:]]", status);
1925     fWSegSpaceSet     = new UnicodeSet(u"[\\p{Word_Break = WSegSpace}]",    status);
1926 
1927     fZWJSet           = new UnicodeSet(u"[\\p{Word_Break = ZWJ}]",          status);
1928     fExtendedPictSet  = new UnicodeSet(u"[:Extended_Pictographic:]", status);
1929     if(U_FAILURE(status)) {
1930         IntlTest::gTest->errln("%s:%d %s", __FILE__, __LINE__, u_errorName(status));
1931         deferredStatus = status;
1932         return;
1933     }
1934 
1935     fDictionarySet = new UnicodeSet(u"[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]", status);
1936     fDictionarySet->addAll(*fKatakanaSet);
1937     fDictionarySet->addAll(UnicodeSet(u"[\\p{LineBreak = Complex_Context}]", status));
1938 
1939     fALetterSet->removeAll(*fDictionarySet);
1940 
1941     fOtherSet        = new UnicodeSet();
1942     if(U_FAILURE(status)) {
1943         IntlTest::gTest->errln("%s:%d %s", __FILE__, __LINE__, u_errorName(status));
1944         deferredStatus = status;
1945         return;
1946     }
1947 
1948     fOtherSet->complement();
1949     fOtherSet->removeAll(*fCRSet);
1950     fOtherSet->removeAll(*fLFSet);
1951     fOtherSet->removeAll(*fNewlineSet);
1952     fOtherSet->removeAll(*fKatakanaSet);
1953     fOtherSet->removeAll(*fHebrew_LetterSet);
1954     fOtherSet->removeAll(*fALetterSet);
1955     fOtherSet->removeAll(*fSingle_QuoteSet);
1956     fOtherSet->removeAll(*fDouble_QuoteSet);
1957     fOtherSet->removeAll(*fMidLetterSet);
1958     fOtherSet->removeAll(*fMidNumSet);
1959     fOtherSet->removeAll(*fNumericSet);
1960     fOtherSet->removeAll(*fExtendNumLetSet);
1961     fOtherSet->removeAll(*fWSegSpaceSet);
1962     fOtherSet->removeAll(*fFormatSet);
1963     fOtherSet->removeAll(*fExtendSet);
1964     fOtherSet->removeAll(*fRegionalIndicatorSet);
1965     fOtherSet->removeAll(*fZWJSet);
1966     fOtherSet->removeAll(*fExtendedPictSet);
1967 
1968     // Inhibit dictionary characters from being tested at all.
1969     fOtherSet->removeAll(*fDictionarySet);
1970 
1971     // Add classes and their names
1972     fSets->addElement(fCRSet, status); classNames.push_back("CR");
1973     fSets->addElement(fLFSet, status); classNames.push_back("LF");
1974     fSets->addElement(fNewlineSet, status); classNames.push_back("Newline");
1975     fSets->addElement(fRegionalIndicatorSet, status); classNames.push_back("RegionalIndicator");
1976     fSets->addElement(fHebrew_LetterSet, status); classNames.push_back("Hebrew");
1977     fSets->addElement(fALetterSet, status); classNames.push_back("ALetter");
1978     fSets->addElement(fSingle_QuoteSet, status); classNames.push_back("Single Quote");
1979     fSets->addElement(fDouble_QuoteSet, status); classNames.push_back("Double Quote");
1980     // Omit Katakana from fSets, which omits Katakana characters
1981     // from the test data. They are all in the dictionary set,
1982     // which this (old, to be retired) monkey test cannot handle.
1983     //fSets->addElement(fKatakanaSet, status);
1984 
1985     fSets->addElement(fMidLetterSet, status); classNames.push_back("MidLetter");
1986     fSets->addElement(fMidNumLetSet, status); classNames.push_back("MidNumLet");
1987     fSets->addElement(fMidNumSet, status); classNames.push_back("MidNum");
1988     fSets->addElement(fNumericSet, status); classNames.push_back("Numeric");
1989     fSets->addElement(fFormatSet, status); classNames.push_back("Format");
1990     fSets->addElement(fExtendSet, status); classNames.push_back("Extend");
1991     fSets->addElement(fOtherSet, status); classNames.push_back("Other");
1992     fSets->addElement(fExtendNumLetSet, status); classNames.push_back("ExtendNumLet");
1993     fSets->addElement(fWSegSpaceSet, status); classNames.push_back("WSegSpace");
1994 
1995     fSets->addElement(fZWJSet, status); classNames.push_back("ZWJ");
1996     fSets->addElement(fExtendedPictSet, status); classNames.push_back("ExtendedPict");
1997 
1998     if (U_FAILURE(status)) {
1999         deferredStatus = status;
2000     }
2001 }
2002 
setText(const UnicodeString & s)2003 void RBBIWordMonkey::setText(const UnicodeString &s) {
2004     fText       = &s;
2005     prepareAppliedRules(s.length());
2006 }
2007 
2008 
next(int32_t prevPos)2009 int32_t RBBIWordMonkey::next(int32_t prevPos) {
2010     int    p0, p1, p2, p3;    // Indices of the significant code points around the
2011                               //   break position being tested.  The candidate break
2012                               //   location is before p2.
2013 
2014     int     breakPos = -1;
2015 
2016     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2017 
2018     if (U_FAILURE(deferredStatus)) {
2019         return -1;
2020     }
2021 
2022     // Prev break at end of string.  return DONE.
2023     if (prevPos >= fText->length()) {
2024         return -1;
2025     }
2026     p0 = p1 = p2 = p3 = prevPos;
2027     c3 =  fText->char32At(prevPos);
2028     c0 = c1 = c2 = 0;
2029     (void)p0;       // Suppress set but not used warning.
2030 
2031     // Loop runs once per "significant" character position in the input text.
2032     for (;;) {
2033         // Move all of the positions forward in the input string.
2034         p0 = p1;  c0 = c1;
2035         p1 = p2;  c1 = c2;
2036         p2 = p3;  c2 = c3;
2037 
2038         // Advance p3 by    X(Extend | Format)*   Rule 4
2039         //    But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
2040         do {
2041             p3 = fText->moveIndex32(p3, 1);
2042             c3 = fText->char32At(p3);
2043             if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2044                break;
2045             }
2046         }
2047         while (fFormatSet->contains(c3) || fExtendSet->contains(c3) || fZWJSet->contains(c3));
2048 
2049 
2050         if (p1 == p2) {
2051             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2052             continue;
2053         }
2054 
2055         if (p2 == fText->length()) {
2056             // Reached end of string.  Always a break position.
2057             break;
2058         }
2059 
2060         //     No Extend or Format characters may appear between the CR and LF,
2061         //     which requires the additional check for p2 immediately following p1.
2062         //
2063         if (c1==0x0D && c2==0x0A) {
2064           setAppliedRule(p2, "WB3   CR x LF");
2065           continue;
2066         }
2067 
2068         if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
2069             setAppliedRule(p2, "WB3a  Break before and after newlines (including CR and LF)");
2070             break;
2071         }
2072         if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2073             setAppliedRule(p2, "WB3a  Break before and after newlines (including CR and LF)");
2074             break;
2075         }
2076 
2077         //              Not ignoring extend chars, so peek into input text to
2078         //              get the potential ZWJ, the character immediately preceding c2.
2079         //              Sloppy UChar32 indexing: p2-1 may reference trail half
2080         //              but char32At will get the full code point.
2081         if (fZWJSet->contains(fText->char32At(p2 - 1)) && fExtendedPictSet->contains(c2)){
2082             setAppliedRule(p2, "WB3c  ZWJ x Extended_Pictographic");
2083             continue;
2084         }
2085 
2086         if (fWSegSpaceSet->contains(fText->char32At(p2-1)) && fWSegSpaceSet->contains(c2)) {
2087             setAppliedRule(p2, "WB3d  Keep horizontal whitespace together.");
2088             continue;
2089         }
2090 
2091         if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2092             (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2)))  {
2093             setAppliedRule(p2, "WB4   (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)");
2094             continue;
2095         }
2096 
2097         if ( (fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1))   &&
2098              (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
2099              (fALetterSet->contains(c3) || fHebrew_LetterSet->contains(c3))) {
2100             setAppliedRule(p2,
2101                            "WB6   (ALetter | Hebrew_Letter)  x  (MidLetter | MidNumLet | Single_Quote) (ALetter _Letter)");
2102             continue;
2103         }
2104 
2105         if ((fALetterSet->contains(c0) || fHebrew_LetterSet->contains(c0)) &&
2106             (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
2107             (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
2108             setAppliedRule(p2,
2109                            "WB7   (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote)  x  (ALetter | Hebrew_Letter)");
2110             continue;
2111         }
2112 
2113         if (fHebrew_LetterSet->contains(c1) && fSingle_QuoteSet->contains(c2)) {
2114             setAppliedRule(p2, "WB7a  Hebrew_Letter x Single_Quote");
2115             continue;
2116         }
2117 
2118           if (fHebrew_LetterSet->contains(c1) && fDouble_QuoteSet->contains(c2) && fHebrew_LetterSet->contains(c3)) {
2119             setAppliedRule(p2, "WB7b  Hebrew_Letter x Double_Quote Hebrew_Letter");
2120             continue;
2121         }
2122 
2123         if (fHebrew_LetterSet->contains(c0) && fDouble_QuoteSet->contains(c1) && fHebrew_LetterSet->contains(c2)) {
2124             setAppliedRule(p2, "WB7c  Hebrew_Letter Double_Quote x Hebrew_Letter");
2125             continue;
2126         }
2127 
2128         if (fNumericSet->contains(c1) &&
2129             fNumericSet->contains(c2)) {
2130             setAppliedRule(p2, "WB8   Numeric x Numeric");
2131             continue;
2132         }
2133 
2134         if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2135             fNumericSet->contains(c2)) {
2136             setAppliedRule(p2, "WB9   (ALetter | Hebrew_Letter) x Numeric");
2137             continue;
2138         }
2139 
2140         if (fNumericSet->contains(c1) &&
2141             (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2)))  {
2142             setAppliedRule(p2, "WB10   Numeric x (ALetter | Hebrew_Letter)");
2143             continue;
2144         }
2145 
2146           if (fNumericSet->contains(c0) &&
2147             (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1))  &&
2148             fNumericSet->contains(c2)) {
2149             setAppliedRule(p2, "WB11  Numeric (MidNum | MidNumLet | Single_Quote)  x  Numeric");
2150             continue;
2151         }
2152 
2153         if (fNumericSet->contains(c1) &&
2154             (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2))  &&
2155             fNumericSet->contains(c3)) {
2156             setAppliedRule(p2, "WB12  Numeric x (MidNum | MidNumLet | SingleQuote) Numeric");
2157             continue;
2158         }
2159 
2160         //            Note: matches UAX 29 rules, but doesn't come into play for ICU because
2161         //                  all Katakana are handled by the dictionary breaker.
2162         if (fKatakanaSet->contains(c1) &&
2163             fKatakanaSet->contains(c2))  {
2164             setAppliedRule(p2, "WB13  Katakana x Katakana");
2165             continue;
2166         }
2167 
2168         if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1) ||fNumericSet->contains(c1) ||
2169              fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
2170              fExtendNumLetSet->contains(c2)) {
2171             setAppliedRule(p2,
2172                            "WB13a (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet");
2173             continue;
2174         }
2175 
2176         if (fExtendNumLetSet->contains(c1) &&
2177                 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2) ||
2178                  fNumericSet->contains(c2) || fKatakanaSet->contains(c2)))  {
2179             setAppliedRule(p2, "WB13b ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)");
2180             continue;
2181         }
2182 
2183         if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)) {
2184             setAppliedRule(p2, "WB15 - WB17   Group pairs of Regional Indicators.");
2185             break;
2186         }
2187         if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
2188             setAppliedRule(p2, "WB15 - WB17   Group pairs of Regional Indicators.");
2189             continue;
2190         }
2191 
2192         setAppliedRule(p2, "WB999");
2193         break;
2194     }
2195 
2196     breakPos = p2;
2197     return breakPos;
2198 }
2199 
2200 
charClasses()2201 UVector  *RBBIWordMonkey::charClasses() {
2202     return fSets;
2203 }
2204 
~RBBIWordMonkey()2205 RBBIWordMonkey::~RBBIWordMonkey() {
2206     delete fSets;
2207     delete fCRSet;
2208     delete fLFSet;
2209     delete fNewlineSet;
2210     delete fKatakanaSet;
2211     delete fHebrew_LetterSet;
2212     delete fALetterSet;
2213     delete fSingle_QuoteSet;
2214     delete fDouble_QuoteSet;
2215     delete fMidNumLetSet;
2216     delete fMidLetterSet;
2217     delete fMidNumSet;
2218     delete fNumericSet;
2219     delete fFormatSet;
2220     delete fExtendSet;
2221     delete fExtendNumLetSet;
2222     delete fWSegSpaceSet;
2223     delete fRegionalIndicatorSet;
2224     delete fDictionarySet;
2225     delete fOtherSet;
2226     delete fZWJSet;
2227     delete fExtendedPictSet;
2228 }
2229 
2230 
2231 
2232 
2233 //------------------------------------------------------------------------------------------
2234 //
2235 //   class RBBISentMonkey      Sentence Break specific implementation
2236 //                             of RBBIMonkeyKind.
2237 //
2238 //------------------------------------------------------------------------------------------
2239 class RBBISentMonkey: public RBBIMonkeyKind {
2240 public:
2241     RBBISentMonkey();
2242     virtual          ~RBBISentMonkey();
2243     virtual  UVector *charClasses() override;
2244     virtual  void     setText(const UnicodeString &s) override;
2245     virtual int32_t   next(int32_t i) override;
2246 private:
2247     int               moveBack(int posFrom);
2248     int               moveForward(int posFrom);
2249     UChar32           cAt(int pos);
2250 
2251     UVector      *fSets;
2252 
2253     UnicodeSet  *fSepSet;
2254     UnicodeSet  *fFormatSet;
2255     UnicodeSet  *fSpSet;
2256     UnicodeSet  *fLowerSet;
2257     UnicodeSet  *fUpperSet;
2258     UnicodeSet  *fOLetterSet;
2259     UnicodeSet  *fNumericSet;
2260     UnicodeSet  *fATermSet;
2261     UnicodeSet  *fSContinueSet;
2262     UnicodeSet  *fSTermSet;
2263     UnicodeSet  *fCloseSet;
2264     UnicodeSet  *fOtherSet;
2265     UnicodeSet  *fExtendSet;
2266 
2267     const UnicodeString  *fText;
2268 };
2269 
RBBISentMonkey()2270 RBBISentMonkey::RBBISentMonkey()
2271 {
2272     UErrorCode  status = U_ZERO_ERROR;
2273 
2274     fSets            = new UVector(status);
2275 
2276     //  Separator Set Note:  Beginning with Unicode 5.1, CR and LF were removed from the separator
2277     //                       set and made into character classes of their own.  For the monkey impl,
2278     //                       they remain in SEP, since Sep always appears with CR and LF in the rules.
2279     fSepSet          = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"),     status);
2280     fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"),    status);
2281     fSpSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"),        status);
2282     fLowerSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"),     status);
2283     fUpperSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"),     status);
2284     fOLetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"),   status);
2285     fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"),   status);
2286     fATermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"),     status);
2287     fSContinueSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
2288     fSTermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"),     status);
2289     fCloseSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"),     status);
2290     fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"),    status);
2291     fOtherSet        = new UnicodeSet();
2292 
2293     if(U_FAILURE(status)) {
2294       deferredStatus = status;
2295       return;
2296     }
2297 
2298     fOtherSet->complement();
2299     fOtherSet->removeAll(*fSepSet);
2300     fOtherSet->removeAll(*fFormatSet);
2301     fOtherSet->removeAll(*fSpSet);
2302     fOtherSet->removeAll(*fLowerSet);
2303     fOtherSet->removeAll(*fUpperSet);
2304     fOtherSet->removeAll(*fOLetterSet);
2305     fOtherSet->removeAll(*fNumericSet);
2306     fOtherSet->removeAll(*fATermSet);
2307     fOtherSet->removeAll(*fSContinueSet);
2308     fOtherSet->removeAll(*fSTermSet);
2309     fOtherSet->removeAll(*fCloseSet);
2310     fOtherSet->removeAll(*fExtendSet);
2311 
2312     fSets->addElement(fSepSet, status); classNames.push_back("Sep");
2313     fSets->addElement(fFormatSet, status); classNames.push_back("Format");
2314     fSets->addElement(fSpSet, status); classNames.push_back("Sp");
2315     fSets->addElement(fLowerSet, status); classNames.push_back("Lower");
2316     fSets->addElement(fUpperSet, status); classNames.push_back("Upper");
2317     fSets->addElement(fOLetterSet, status); classNames.push_back("OLetter");
2318     fSets->addElement(fNumericSet, status); classNames.push_back("Numeric");
2319     fSets->addElement(fATermSet, status); classNames.push_back("ATerm");
2320     fSets->addElement(fSContinueSet, status); classNames.push_back("SContinue");
2321     fSets->addElement(fSTermSet, status); classNames.push_back("STerm");
2322     fSets->addElement(fCloseSet, status); classNames.push_back("Close");
2323     fSets->addElement(fOtherSet, status); classNames.push_back("Other");
2324     fSets->addElement(fExtendSet, status); classNames.push_back("Extend");
2325 
2326     if (U_FAILURE(status)) {
2327         deferredStatus = status;
2328     }
2329 }
2330 
2331 
2332 
setText(const UnicodeString & s)2333 void RBBISentMonkey::setText(const UnicodeString &s) {
2334     fText       = &s;
2335     prepareAppliedRules(s.length());
2336 }
2337 
charClasses()2338 UVector  *RBBISentMonkey::charClasses() {
2339     return fSets;
2340 }
2341 
2342 //  moveBack()   Find the "significant" code point preceding the index i.
2343 //               Skips over ($Extend | $Format)* .
2344 //
moveBack(int i)2345 int RBBISentMonkey::moveBack(int i) {
2346     if (i <= 0) {
2347         return -1;
2348     }
2349     UChar32   c;
2350     int32_t   j = i;
2351     do {
2352         j = fText->moveIndex32(j, -1);
2353         c = fText->char32At(j);
2354     }
2355     while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
2356     return j;
2357 
2358  }
2359 
2360 
moveForward(int i)2361 int RBBISentMonkey::moveForward(int i) {
2362     if (i>=fText->length()) {
2363         return fText->length();
2364     }
2365     UChar32   c;
2366     int32_t   j = i;
2367     do {
2368         j = fText->moveIndex32(j, 1);
2369         c = cAt(j);
2370     }
2371     while (fFormatSet->contains(c) || fExtendSet->contains(c));
2372     return j;
2373 }
2374 
cAt(int pos)2375 UChar32 RBBISentMonkey::cAt(int pos) {
2376     if (pos<0 || pos>=fText->length()) {
2377         return -1;
2378     } else {
2379         return fText->char32At(pos);
2380     }
2381 }
2382 
next(int32_t prevPos)2383 int32_t RBBISentMonkey::next(int32_t prevPos) {
2384     int    p0, p1, p2, p3;    // Indices of the significant code points around the
2385                               //   break position being tested.  The candidate break
2386                               //   location is before p2.
2387 
2388     int     breakPos = -1;
2389 
2390     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2391     UChar32 c;
2392 
2393     if (U_FAILURE(deferredStatus)) {
2394         return -1;
2395     }
2396 
2397     // Prev break at end of string.  return DONE.
2398     if (prevPos >= fText->length()) {
2399         return -1;
2400     }
2401     p0 = p1 = p2 = p3 = prevPos;
2402     c3 =  fText->char32At(prevPos);
2403     c0 = c1 = c2 = 0;
2404     (void)p0;     // Suppress set but not used warning.
2405 
2406     // Loop runs once per "significant" character position in the input text.
2407     for (;;) {
2408         // Move all of the positions forward in the input string.
2409         p0 = p1;  c0 = c1;
2410         p1 = p2;  c1 = c2;
2411         p2 = p3;  c2 = c3;
2412 
2413         // Advance p3 by    X(Extend | Format)*   Rule 4
2414         p3 = moveForward(p3);
2415         c3 = cAt(p3);
2416 
2417         if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
2418             setAppliedRule(p2, "SB3   CR x LF");
2419             continue;
2420         }
2421 
2422         if (fSepSet->contains(c1)) {
2423             p2 = p1+1;   // Separators don't combine with Extend or Format.
2424 
2425             setAppliedRule(p2, "SB4   Sep  <break>");
2426             break;
2427         }
2428 
2429         if (p2 >= fText->length()) {
2430             // Reached end of string.  Always a break position.
2431             setAppliedRule(p2, "SB4   Sep  <break>");
2432             break;
2433         }
2434 
2435         if (p2 == prevPos) {
2436             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2437             setAppliedRule(p2, "SB4   Sep  <break>");
2438             continue;
2439         }
2440 
2441         if (fATermSet->contains(c1) &&  fNumericSet->contains(c2))  {
2442             setAppliedRule(p2, "SB6   ATerm x Numeric");
2443             continue;
2444         }
2445 
2446           if ((fUpperSet->contains(c0) || fLowerSet->contains(c0)) &&
2447                 fATermSet->contains(c1) && fUpperSet->contains(c2)) {
2448             setAppliedRule(p2, "SB7   (Upper | Lower) ATerm  x  Uppper");
2449             continue;
2450         }
2451 
2452         //           Note:  STerm | ATerm are added to the negated part of the expression by a
2453         //                  note to the Unicode 5.0 documents.
2454         int p8 = p1;
2455         while (fSpSet->contains(cAt(p8))) {
2456             p8 = moveBack(p8);
2457         }
2458         while (fCloseSet->contains(cAt(p8))) {
2459             p8 = moveBack(p8);
2460         }
2461         if (fATermSet->contains(cAt(p8))) {
2462             p8=p2;
2463             for (;;) {
2464                 c = cAt(p8);
2465                 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
2466                     fLowerSet->contains(c) || fSepSet->contains(c) ||
2467                     fATermSet->contains(c) || fSTermSet->contains(c))  {
2468 
2469                     setAppliedRule(p2,
2470                                    "SB8   ATerm Close* Sp*  x  (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* ");
2471                     break;
2472                 }
2473                 p8 = moveForward(p8);
2474             }
2475             if (fLowerSet->contains(cAt(p8))) {
2476 
2477                 setAppliedRule(p2,
2478                                "SB8   ATerm Close* Sp*  x  (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* ");
2479                 continue;
2480             }
2481         }
2482 
2483         if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
2484             p8 = p1;
2485             while (fSpSet->contains(cAt(p8))) {
2486                 p8 = moveBack(p8);
2487             }
2488             while (fCloseSet->contains(cAt(p8))) {
2489                 p8 = moveBack(p8);
2490             }
2491             c = cAt(p8);
2492             if (fSTermSet->contains(c) || fATermSet->contains(c)) {
2493                 setAppliedRule(p2, "SB8a  (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm)");
2494                 continue;
2495             }
2496         }
2497 
2498         int p9 = p1;
2499         while (fCloseSet->contains(cAt(p9))) {
2500             p9 = moveBack(p9);
2501         }
2502         c = cAt(p9);
2503         if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
2504             if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
2505 
2506                 setAppliedRule(p2, "SB9  (STerm | ATerm) Close*  x  (Close | Sp | Sep | CR | LF)");
2507                 continue;
2508             }
2509         }
2510 
2511         int p10 = p1;
2512         while (fSpSet->contains(cAt(p10))) {
2513             p10 = moveBack(p10);
2514         }
2515         while (fCloseSet->contains(cAt(p10))) {
2516             p10 = moveBack(p10);
2517         }
2518         if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
2519             if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
2520                 setAppliedRule(p2, "SB10  (Sterm | ATerm) Close* Sp*  x  (Sp | Sep | CR | LF)");
2521                 continue;
2522             }
2523         }
2524 
2525         int p11 = p1;
2526         if (fSepSet->contains(cAt(p11))) {
2527             p11 = moveBack(p11);
2528         }
2529         while (fSpSet->contains(cAt(p11))) {
2530             p11 = moveBack(p11);
2531         }
2532         while (fCloseSet->contains(cAt(p11))) {
2533             p11 = moveBack(p11);
2534         }
2535         if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
2536           setAppliedRule(p2, "SB11  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  <break>");
2537             break;
2538         }
2539 
2540         setAppliedRule(p2, "SB12  Any x Any");
2541         continue;
2542     }
2543 
2544     breakPos = p2;
2545     return breakPos;
2546 }
2547 
~RBBISentMonkey()2548 RBBISentMonkey::~RBBISentMonkey() {
2549     delete fSets;
2550     delete fSepSet;
2551     delete fFormatSet;
2552     delete fSpSet;
2553     delete fLowerSet;
2554     delete fUpperSet;
2555     delete fOLetterSet;
2556     delete fNumericSet;
2557     delete fATermSet;
2558     delete fSContinueSet;
2559     delete fSTermSet;
2560     delete fCloseSet;
2561     delete fOtherSet;
2562     delete fExtendSet;
2563 }
2564 
2565 
2566 
2567 //-------------------------------------------------------------------------------------------
2568 //
2569 //  RBBILineMonkey
2570 //
2571 //-------------------------------------------------------------------------------------------
2572 
2573 class RBBILineMonkey: public RBBIMonkeyKind {
2574 public:
2575     RBBILineMonkey();
2576     virtual          ~RBBILineMonkey();
2577     virtual  UVector *charClasses() override;
2578     virtual  void     setText(const UnicodeString &s) override;
2579     virtual  int32_t  next(int32_t i) override;
2580     virtual  void     rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
2581 private:
2582     UVector      *fSets;
2583 
2584     UnicodeSet  *fBK;
2585     UnicodeSet  *fCR;
2586     UnicodeSet  *fLF;
2587     UnicodeSet  *fCM;
2588     UnicodeSet  *fNL;
2589     UnicodeSet  *fSG;
2590     UnicodeSet  *fWJ;
2591     UnicodeSet  *fZW;
2592     UnicodeSet  *fGL;
2593     UnicodeSet  *fCB;
2594     UnicodeSet  *fSP;
2595     UnicodeSet  *fB2;
2596     UnicodeSet  *fBA;
2597     UnicodeSet  *fBB;
2598     UnicodeSet  *fHH;
2599     UnicodeSet  *fHY;
2600     UnicodeSet  *fH2;
2601     UnicodeSet  *fH3;
2602     UnicodeSet  *fCL;
2603     UnicodeSet  *fCP;
2604     UnicodeSet  *fEX;
2605     UnicodeSet  *fIN;
2606     UnicodeSet  *fJL;
2607     UnicodeSet  *fJV;
2608     UnicodeSet  *fJT;
2609     UnicodeSet  *fNS;
2610     UnicodeSet  *fOP;
2611     UnicodeSet  *fQU;
2612     UnicodeSet  *fIS;
2613     UnicodeSet  *fNU;
2614     UnicodeSet  *fPO;
2615     UnicodeSet  *fPR;
2616     UnicodeSet  *fSY;
2617     UnicodeSet  *fAI;
2618     UnicodeSet  *fAL;
2619     UnicodeSet  *fCJ;
2620     UnicodeSet  *fHL;
2621     UnicodeSet  *fID;
2622     UnicodeSet  *fRI;
2623     UnicodeSet  *fXX;
2624     UnicodeSet  *fEB;
2625     UnicodeSet  *fEM;
2626     UnicodeSet  *fZWJ;
2627     UnicodeSet  *fOP30;
2628     UnicodeSet  *fCP30;
2629     UnicodeSet  *fExtPictUnassigned;
2630 
2631     BreakIterator        *fCharBI;
2632     const UnicodeString  *fText;
2633     RegexMatcher         *fNumberMatcher;
2634 };
2635 
RBBILineMonkey()2636 RBBILineMonkey::RBBILineMonkey() :
2637     RBBIMonkeyKind(),
2638     fSets(NULL),
2639 
2640     fCharBI(NULL),
2641     fText(NULL),
2642     fNumberMatcher(NULL)
2643 
2644 {
2645     if (U_FAILURE(deferredStatus)) {
2646         return;
2647     }
2648 
2649     UErrorCode  status = U_ZERO_ERROR;
2650 
2651     fSets  = new UVector(status);
2652 
2653     fBK    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
2654     fCR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
2655     fLF    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
2656     fCM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
2657     fNL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
2658     fWJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
2659     fZW    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
2660     fGL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
2661     fCB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
2662     fSP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
2663     fB2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
2664     fBA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
2665     fBB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
2666     fHH    = new UnicodeSet();
2667     fHY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
2668     fH2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
2669     fH3    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
2670     fCL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
2671     fCP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
2672     fEX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
2673     fIN    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
2674     fJL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
2675     fJV    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
2676     fJT    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
2677     fNS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
2678     fOP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
2679     fQU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
2680     fIS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
2681     fNU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
2682     fPO    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
2683     fPR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
2684     fSY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
2685     fAI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
2686     fAL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
2687     fCJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status);
2688     fHL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status);
2689     fID    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
2690     fRI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status);
2691     fSG    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
2692     fXX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
2693     fEB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EB}]"), status);
2694     fEM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EM}]"), status);
2695     fZWJ   = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status);
2696     fOP30  = new UnicodeSet(u"[\\p{Line_break=OP}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]", status);
2697     fCP30  = new UnicodeSet(u"[\\p{Line_break=CP}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]", status);
2698     fExtPictUnassigned = new UnicodeSet(u"[\\p{Extended_Pictographic}&\\p{Cn}]", status);
2699 
2700     if (U_FAILURE(status)) {
2701         deferredStatus = status;
2702         return;
2703     }
2704 
2705     fAL->addAll(*fXX);     // Default behavior for XX is identical to AL
2706     fAL->addAll(*fAI);     // Default behavior for AI is identical to AL
2707     fAL->addAll(*fSG);     // Default behavior for SG is identical to AL.
2708 
2709     fNS->addAll(*fCJ);     // Default behavior for CJ is identical to NS.
2710     fCM->addAll(*fZWJ);    // ZWJ behaves as a CM.
2711 
2712     fHH->add(u'\u2010');   // Hyphen, '‐'
2713 
2714     // Sets and names.
2715     fSets->addElement(fBK, status); classNames.push_back("fBK");
2716     fSets->addElement(fCR, status); classNames.push_back("fCR");
2717     fSets->addElement(fLF, status); classNames.push_back("fLF");
2718     fSets->addElement(fCM, status); classNames.push_back("fCM");
2719     fSets->addElement(fNL, status); classNames.push_back("fNL");
2720     fSets->addElement(fWJ, status); classNames.push_back("fWJ");
2721     fSets->addElement(fZW, status); classNames.push_back("fZW");
2722     fSets->addElement(fGL, status); classNames.push_back("fGL");
2723     fSets->addElement(fCB, status); classNames.push_back("fCB");
2724     fSets->addElement(fSP, status); classNames.push_back("fSP");
2725     fSets->addElement(fB2, status); classNames.push_back("fB2");
2726     fSets->addElement(fBA, status); classNames.push_back("fBA");
2727     fSets->addElement(fBB, status); classNames.push_back("fBB");
2728     fSets->addElement(fHY, status); classNames.push_back("fHY");
2729     fSets->addElement(fH2, status); classNames.push_back("fH2");
2730     fSets->addElement(fH3, status); classNames.push_back("fH3");
2731     fSets->addElement(fCL, status); classNames.push_back("fCL");
2732     fSets->addElement(fCP, status); classNames.push_back("fCP");
2733     fSets->addElement(fEX, status); classNames.push_back("fEX");
2734     fSets->addElement(fIN, status); classNames.push_back("fIN");
2735     fSets->addElement(fJL, status); classNames.push_back("fJL");
2736     fSets->addElement(fJT, status); classNames.push_back("fJT");
2737     fSets->addElement(fJV, status); classNames.push_back("fJV");
2738     fSets->addElement(fNS, status); classNames.push_back("fNS");
2739     fSets->addElement(fOP, status); classNames.push_back("fOP");
2740     fSets->addElement(fQU, status); classNames.push_back("fQU");
2741     fSets->addElement(fIS, status); classNames.push_back("fIS");
2742     fSets->addElement(fNU, status); classNames.push_back("fNU");
2743     fSets->addElement(fPO, status); classNames.push_back("fPO");
2744     fSets->addElement(fPR, status); classNames.push_back("fPR");
2745     fSets->addElement(fSY, status); classNames.push_back("fSY");
2746     fSets->addElement(fAI, status); classNames.push_back("fAI");
2747     fSets->addElement(fAL, status); classNames.push_back("fAL");
2748     fSets->addElement(fHL, status); classNames.push_back("fHL");
2749     fSets->addElement(fID, status); classNames.push_back("fID");
2750     fSets->addElement(fRI, status); classNames.push_back("fRI");
2751     fSets->addElement(fSG, status); classNames.push_back("fSG");
2752     fSets->addElement(fEB, status); classNames.push_back("fEB");
2753     fSets->addElement(fEM, status); classNames.push_back("fEM");
2754     fSets->addElement(fZWJ, status); classNames.push_back("fZWJ");
2755     // TODO: fOP30 & fCP30 overlap with plain fOP. Probably OK, but fOP/CP chars will be over-represented.
2756     fSets->addElement(fOP30, status); classNames.push_back("fOP30");
2757     fSets->addElement(fCP30, status); classNames.push_back("fCP30");
2758     fSets->addElement(fExtPictUnassigned, status); classNames.push_back("fExtPictUnassigned");
2759 
2760     const char *rules =
2761             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?"
2762             "((\\p{Line_Break=OP}|\\p{Line_Break=HY})(\\p{Line_Break=CM}|\\u200d)*)?"
2763             "((\\p{Line_Break=IS})(\\p{Line_Break=CM}|\\u200d)*)?"
2764             "\\p{Line_Break=NU}(\\p{Line_Break=CM}|\\u200d)*"
2765             "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})(\\p{Line_Break=CM}|\\u200d)*)*"
2766             "((\\p{Line_Break=CL}|\\p{Line_Break=CP})(\\p{Line_Break=CM}|\\u200d)*)?"
2767             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?";
2768 
2769     fNumberMatcher = new RegexMatcher(
2770         UnicodeString(rules, -1, US_INV), 0, status);
2771 
2772     fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
2773 
2774     if (U_FAILURE(status)) {
2775         deferredStatus = status;
2776     }
2777 
2778 }
2779 
2780 
setText(const UnicodeString & s)2781 void RBBILineMonkey::setText(const UnicodeString &s) {
2782     fText       = &s;
2783     fCharBI->setText(s);
2784     prepareAppliedRules(s.length());
2785     fNumberMatcher->reset(s);
2786 }
2787 
2788 //
2789 //  rule9Adjust
2790 //     Line Break TR rules 9 and 10 implementation.
//     This deals with combining marks and other sequences that
//     must be treated as if they were something other than what they actually are.
2793 //
2794 //     This is factored out into a separate function because it must be applied twice for
2795 //     each potential break, once to the chars before the position being checked, then
2796 //     again to the text following the possible break.
2797 //
rule9Adjust(int32_t pos,UChar32 * posChar,int32_t * nextPos,UChar32 * nextChar)2798 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
2799     if (pos == -1) {
2800         // Invalid initial position.  Happens during the warmup iteration of the
2801         //   main loop in next().
2802         return;
2803     }
2804 
2805     int32_t  nPos = *nextPos;
2806 
2807     // LB 9  Keep combining sequences together.
2808     // advance over any CM class chars.  Note that Line Break CM is different
2809     // from the normal Grapheme Extend property.
2810     if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
2811           *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
2812         for (;;) {
2813             *nextChar = fText->char32At(nPos);
2814             if (!fCM->contains(*nextChar)) {
2815                 break;
2816             }
2817             nPos = fText->moveIndex32(nPos, 1);
2818         }
2819     }
2820 
2821 
2822     // LB 9 Treat X CM* as if it were x.
2823     //       No explicit action required.
2824 
2825     // LB 10  Treat any remaining combining mark as AL
2826     if (fCM->contains(*posChar)) {
2827         *posChar = u'A';
2828     }
2829 
2830     // Push the updated nextPos and nextChar back to our caller.
2831     // This only makes a difference if posChar got bigger by consuming a
2832     // combining sequence.
2833     *nextPos  = nPos;
2834     *nextChar = fText->char32At(nPos);
2835 }
2836 
2837 
2838 
next(int32_t startPos)2839 int32_t RBBILineMonkey::next(int32_t startPos) {
2840     UErrorCode status = U_ZERO_ERROR;
2841     int32_t    pos;       //  Index of the char following a potential break position
2842     UChar32    thisChar;  //  Character at above position "pos"
2843 
2844     int32_t    prevPos;   //  Index of the char preceding a potential break position
2845     UChar32    prevChar;  //  Character at above position.  Note that prevChar
2846                           //   and thisChar may not be adjacent because combining
2847                           //   characters between them will be ignored.
2848 
2849     int32_t    prevPosX2; //  Second previous character.  Wider context for LB21a.
2850     UChar32    prevCharX2;
2851 
2852     int32_t    nextPos;   //  Index of the next character following pos.
2853                           //     Usually skips over combining marks.
2854     int32_t    nextCPPos; //  Index of the code point following "pos."
2855                           //     May point to a combining mark.
2856     int32_t    tPos;      //  temp value.
2857     UChar32    c;
2858 
2859     if (U_FAILURE(deferredStatus)) {
2860         return -1;
2861     }
2862 
2863     if (startPos >= fText->length()) {
2864         return -1;
2865     }
2866 
2867 
2868     // Initial values for loop.  Loop will run the first time without finding breaks,
2869     //                           while the invalid values shift out and the "this" and
2870     //                           "prev" positions are filled in with good values.
2871     pos      = prevPos   = prevPosX2  = -1;    // Invalid value, serves as flag for initial loop iteration.
2872     thisChar = prevChar  = prevCharX2 = 0;
2873     nextPos  = nextCPPos = startPos;
2874 
2875 
2876     // Loop runs once per position in the test text, until a break position
2877     //  is found.
2878     for (;;) {
2879         prevPosX2 = prevPos;
2880         prevCharX2 = prevChar;
2881 
2882         prevPos   = pos;
2883         prevChar  = thisChar;
2884 
2885         pos       = nextPos;
2886         thisChar  = fText->char32At(pos);
2887 
2888         nextCPPos = fText->moveIndex32(pos, 1);
2889         nextPos   = nextCPPos;
2890 
2891 
2892         if (pos >= fText->length()) {
2893             setAppliedRule(pos, "LB2 - Break at end of text.");
2894             break;
2895         }
2896 
2897 
2898         //             We do this one out-of-order because the adjustment does not change anything
2899         //             that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
2900         //             be applied.
2901         rule9Adjust(prevPos, &prevChar, &pos, &thisChar);
2902         nextCPPos = nextPos = fText->moveIndex32(pos, 1);
2903         c = fText->char32At(nextPos);
2904         rule9Adjust(pos, &thisChar, &nextPos, &c);
2905 
2906         // If the loop is still warming up - if we haven't shifted the initial
2907         //   -1 positions out of prevPos yet - loop back to advance the
2908         //    position in the input without any further looking for breaks.
2909         if (prevPos == -1) {
2910           setAppliedRule(pos, "LB 9 - adjust for combining sequences.");
2911             continue;
2912         }
2913 
2914 
2915         if (fBK->contains(prevChar)) {
2916             setAppliedRule(pos, "LB 4  Always break after hard line breaks");
2917             break;
2918         }
2919 
2920 
2921         if (prevChar == 0x0d && thisChar == 0x0a) {
2922             setAppliedRule(pos, "LB 5  Break after CR, LF, NL, but not inside CR LF");
2923             continue;
2924         }
2925         if (prevChar == 0x0d ||
2926             prevChar == 0x0a ||
2927             prevChar == 0x85)  {
2928             setAppliedRule(pos, "LB 5  Break after CR, LF, NL, but not inside CR LF");
2929             break;
2930         }
2931 
2932 
2933         if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
2934             fBK->contains(thisChar)) {
2935             setAppliedRule(pos, "LB 6  Don't break before hard line breaks");
2936             continue;
2937         }
2938 
2939 
2940         if (fSP->contains(thisChar)) {
2941             setAppliedRule(pos, "LB 7  Don't break before spaces or zero-width space.");
2942             continue;
2943         }
2944 
2945         // !!! ??? Is this the right text for the applied rule?
2946         if (fZW->contains(thisChar)) {
2947             setAppliedRule(pos, "LB 7  Don't break before spaces or zero-width space.");
2948             continue;
2949         }
2950 
2951 
2952         //       ZW SP* ÷
2953         //       Scan backwards from prevChar for SP* ZW
2954         tPos = prevPos;
2955         while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
2956             tPos = fText->moveIndex32(tPos, -1);
2957         }
2958         if (fZW->contains(fText->char32At(tPos))) {
2959             setAppliedRule(pos, "LB 8  Break after zero width space");
2960             break;
2961         }
2962 
2963 
2964         //          Move this test up, before LB8a, because numbers can match a longer sequence that would
2965         //          also match 8a.  e.g. NU ZWJ IS PO     (ZWJ acts like CM)
2966         if (fNumberMatcher->lookingAt(prevPos, status)) {
2967             if (U_FAILURE(status)) {
2968                 setAppliedRule(pos, "LB 25 Numbers");
2969                 break;
2970             }
2971             // Matched a number.  But could have been just a single digit, which would
2972             //    not represent a "no break here" between prevChar and thisChar
2973             int32_t numEndIdx = fNumberMatcher->end(status);  // idx of first char following num
2974             if (numEndIdx > pos) {
2975                 // Number match includes at least our two chars being checked
2976                 if (numEndIdx > nextPos) {
2977                     // Number match includes additional chars.  Update pos and nextPos
2978                     //   so that next loop iteration will continue at the end of the number,
2979                     //   checking for breaks between last char in number & whatever follows.
2980                     pos = nextPos = numEndIdx;
2981                     do {
2982                         pos = fText->moveIndex32(pos, -1);
2983                         thisChar = fText->char32At(pos);
2984                     } while (fCM->contains(thisChar));
2985                 }
2986                 setAppliedRule(pos, "LB 25 Numbers");
2987                 continue;
2988             }
2989         }
2990 
2991 
2992         //       The monkey test's way of ignoring combining characters doesn't work
2993         //       for this rule. ZJ is also a CM. Need to get the actual character
2994         //       preceding "thisChar", not ignoring combining marks, possibly ZJ.
2995         {
2996             int32_t prevIdx = fText->moveIndex32(pos, -1);
2997             UChar32 prevC = fText->char32At(prevIdx);
2998             if (fZWJ->contains(prevC)) {
2999                 setAppliedRule(pos, "LB 8a ZWJ x");
3000                 continue;
3001             }
3002         }
3003 
3004 
3005         // appliedRule: "LB 9, 10"; //  Already done, at top of loop.";
3006         //
3007 
3008 
3009         //    x  WJ
3010         //    WJ  x
3011         //
3012         if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
3013             setAppliedRule(pos, "LB 11  Do not break before or after WORD JOINER and related characters.");
3014             continue;
3015         }
3016 
3017 
3018         if (fGL->contains(prevChar)) {
3019             setAppliedRule(pos, "LB 12  GL  x");
3020             continue;
3021         }
3022 
3023 
3024           if (!(fSP->contains(prevChar) ||
3025               fBA->contains(prevChar) ||
3026               fHY->contains(prevChar)     ) && fGL->contains(thisChar)) {
3027               setAppliedRule(pos, "LB 12a  [^SP BA HY] x GL");
3028               continue;
3029         }
3030 
3031 
3032         if (fCL->contains(thisChar) ||
3033                 fCP->contains(thisChar) ||
3034                 fEX->contains(thisChar) ||
3035                 fSY->contains(thisChar)) {
3036             setAppliedRule(pos, "LB 13  Don't break before closings.");
3037             continue;
3038         }
3039 
3040 
3041         //       Scan backwards, checking for this sequence.
3042         //       The OP char could include combining marks, so we actually check for
3043         //           OP CM* SP*
3044         //       Another Twist: The Rule 9 fixes may have changed a SP CM
3045         //       sequence into a ID char, so before scanning back through spaces,
3046         //       verify that prevChar is indeed a space.  The prevChar variable
3047         //       may differ from fText[prevPos]
3048         tPos = prevPos;
3049         if (fSP->contains(prevChar)) {
3050             while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3051                 tPos=fText->moveIndex32(tPos, -1);
3052             }
3053         }
3054         while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3055             tPos=fText->moveIndex32(tPos, -1);
3056         }
3057         if (fOP->contains(fText->char32At(tPos))) {
3058             setAppliedRule(pos, "LB 14 Don't break after OP SP*");
3059             continue;
3060         }
3061 
3062 
3063         if (nextPos < fText->length()) {
3064             // note: UnicodeString::char32At(length) returns ffff, not distinguishable
3065             //       from a legit ffff character. So test length separately.
3066             UChar32 nextChar = fText->char32At(nextPos);
3067             if (fSP->contains(prevChar) && fIS->contains(thisChar) && fNU->contains(nextChar)) {
3068                 setAppliedRule(pos, "LB 14a Break before an IS that begins a number and follows a space");
3069                 break;
3070             }
3071         }
3072 
3073 
3074           if (fIS->contains(thisChar)) {
3075               setAppliedRule(pos, "LB 14b  Do not break before numeric separators, even after spaces.");
3076               continue;
3077         }
3078 
3079 
3080         if (fOP->contains(thisChar)) {
3081             // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
3082             int tPos = prevPos;
3083             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3084                 tPos = fText->moveIndex32(tPos, -1);
3085             }
3086             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3087                 tPos = fText->moveIndex32(tPos, -1);
3088             }
3089             if (fQU->contains(fText->char32At(tPos))) {
3090                 setAppliedRule(pos, "LB 15    QU SP* x OP");
3091                 continue;
3092             }
3093         }
3094 
3095 
3096         //    Scan backwards for SP* CM* (CL | CP)
3097         if (fNS->contains(thisChar)) {
3098             int tPos = prevPos;
3099             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3100                 tPos = fText->moveIndex32(tPos, -1);
3101             }
3102             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3103                 tPos = fText->moveIndex32(tPos, -1);
3104             }
3105             if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
3106                 setAppliedRule(pos, "LB 16   (CL | CP) SP* x NS");
3107                 continue;
3108             }
3109         }
3110 
3111 
3112         if (fB2->contains(thisChar)) {
3113             //  Scan backwards, checking for the B2 CM* SP* sequence.
3114             tPos = prevPos;
3115             if (fSP->contains(prevChar)) {
3116                 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3117                     tPos=fText->moveIndex32(tPos, -1);
3118                 }
3119             }
3120             while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3121                 tPos=fText->moveIndex32(tPos, -1);
3122             }
3123             if (fB2->contains(fText->char32At(tPos))) {
3124                 setAppliedRule(pos, "LB 17   B2 SP* x B2");
3125                 continue;
3126             }
3127         }
3128 
3129 
3130         if (fSP->contains(prevChar)) {
3131             setAppliedRule(pos, "LB 18    break after space");
3132             break;
3133         }
3134 
3135         //    x   QU
3136         //    QU  x
3137         if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
3138             setAppliedRule(pos, "LB 19");
3139             continue;
3140         }
3141 
3142         if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
3143             setAppliedRule(pos, "LB 20  Break around a CB");
3144             break;
3145         }
3146 
3147         //           Don't break between Hyphens and letters if a break precedes the hyphen.
3148         //           Formerly this was a Finnish tailoring.
3149         //           Moved to root in ICU 63. This is an ICU customization, not in UAX-14.
3150         //           ^($HY | $HH) $AL;
3151         if (fAL->contains(thisChar) && (fHY->contains(prevChar) || fHH->contains(prevChar)) &&
3152                 prevPosX2 == -1) {
3153             setAppliedRule(pos, "LB 20.09");
3154             continue;
3155         }
3156 
3157         if (fBA->contains(thisChar) ||
3158             fHY->contains(thisChar) ||
3159             fNS->contains(thisChar) ||
3160             fBB->contains(prevChar) )   {
3161             setAppliedRule(pos, "LB 21");
3162             continue;
3163         }
3164 
3165         if (fHL->contains(prevCharX2) &&
3166                 (fHY->contains(prevChar) || fBA->contains(prevChar))) {
3167             setAppliedRule(pos, "LB 21a   HL (HY | BA) x");
3168             continue;
3169         }
3170 
3171         if (fSY->contains(prevChar) && fHL->contains(thisChar)) {
3172             setAppliedRule(pos, "LB 21b SY x HL");
3173             continue;
3174         }
3175 
3176         if (fIN->contains(thisChar))   {
3177             setAppliedRule(pos, "LB 22");
3178             continue;
3179         }
3180 
3181 
3182         //          (AL | HL) x NU
3183         //          NU x (AL | HL)
3184         if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && fNU->contains(thisChar)) {
3185             setAppliedRule(pos, "LB 23");
3186             continue;
3187         }
3188         if (fNU->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3189             setAppliedRule(pos, "LB 23");
3190             continue;
3191         }
3192 
3193         // Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes.
3194         //      PR x (ID | EB | EM)
3195         //     (ID | EB | EM) x PO
3196         if (fPR->contains(prevChar) &&
3197                 (fID->contains(thisChar) || fEB->contains(thisChar) || fEM->contains(thisChar)))  {
3198             setAppliedRule(pos, "LB 23a");
3199             continue;
3200         }
3201         if ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) &&
3202                 fPO->contains(thisChar)) {
3203             setAppliedRule(pos, "LB 23a");
3204             continue;
3205         }
3206 
3207         //   Do not break between prefix and letters or ideographs.
3208         //         (PR | PO) x (AL | HL)
3209         //         (AL | HL) x (PR | PO)
3210         if ((fPR->contains(prevChar) || fPO->contains(prevChar)) &&
3211                 (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3212             setAppliedRule(pos, "LB 24 no break between prefix and letters or ideographs");
3213             continue;
3214         }
3215         if ((fAL->contains(prevChar) || fHL->contains(prevChar)) &&
3216                 (fPR->contains(thisChar) || fPO->contains(thisChar))) {
3217             setAppliedRule(pos, "LB 24 no break between prefix and letters or ideographs");
3218             continue;
3219         }
3220 
3221         // appliedRule: "LB 25 numbers match"; // moved up, before LB 8a,
3222 
3223         if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
3224                                         fJV->contains(thisChar) ||
3225                                         fH2->contains(thisChar) ||
3226                                         fH3->contains(thisChar))) {
3227             setAppliedRule(pos, "LB 26 Do not break a Korean syllable.");
3228             continue;
3229                                         }
3230 
3231         if ((fJV->contains(prevChar) || fH2->contains(prevChar))  &&
3232             (fJV->contains(thisChar) || fJT->contains(thisChar))) {
3233             setAppliedRule(pos, "LB 26 Do not break a Korean syllable.");
3234             continue;
3235         }
3236 
3237         if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
3238             fJT->contains(thisChar)) {
3239             setAppliedRule(pos, "LB 26 Do not break a Korean syllable.");
3240             continue;
3241         }
3242 
3243         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3244             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3245             fPO->contains(thisChar)) {
3246             setAppliedRule(pos, "LB 27 Treat a Korean Syllable Block the same as ID.");
3247             continue;
3248         }
3249         if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
3250             fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
3251             setAppliedRule(pos, "LB 27 Treat a Korean Syllable Block the same as ID.");
3252             continue;
3253         }
3254 
3255 
3256 
3257         if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3258             setAppliedRule(pos, "LB 28  Do not break between alphabetics (\"at\").");
3259             continue;
3260         }
3261 
3262           if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3263               setAppliedRule(pos, "LB 29  Do not break between numeric punctuation and alphabetics (\"e.g.\").");
3264               continue;
3265         }
3266 
3267         //          (AL | NU) x OP
3268         //          CP x (AL | NU)
3269         if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP30->contains(thisChar)) {
3270             setAppliedRule(pos,  "LB 30 No break in letters, numbers, or ordinary symbols, opening/closing punctuation.");
3271             continue;
3272         }
3273         if (fCP30->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) {
3274             setAppliedRule(pos,  "LB 30 No break in letters, numbers, or ordinary symbols, opening/closing punctuation.");
3275             continue;
3276         }
3277 
3278         //             RI  x  RI
3279         if (fRI->contains(prevCharX2) && fRI->contains(prevChar) && fRI->contains(thisChar)) {
3280             setAppliedRule(pos, "LB30a    RI RI  ÷  RI");
3281             break;
3282         }
3283         if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
3284             // Two Regional Indicators have been paired.
3285             // Over-write the trailing one (thisChar) to prevent it from forming another pair with a
3286             // following RI. This is a hack.
3287             thisChar = -1;
3288             setAppliedRule(pos, "LB30a    RI RI  ÷  RI");
3289             continue;
3290         }
3291 
3292         // LB30b Do not break between an emoji base (or potential emoji) and an emoji modifier.
3293         if (fEB->contains(prevChar) && fEM->contains(thisChar)) {
3294             setAppliedRule(pos, "LB30b    Emoji Base x Emoji Modifier");
3295             continue;
3296         }
3297 
3298         if (fExtPictUnassigned->contains(prevChar) && fEM->contains(thisChar)) {
3299             setAppliedRule(pos, "LB30b    [\\p{Extended_Pictographic}&\\p{Cn}] × EM");
3300             continue;
3301         }
3302 
3303         setAppliedRule(pos, "LB 31    Break everywhere else");
3304         break;
3305     }
3306 
3307     return pos;
3308 }
3309 
3310 
charClasses()3311 UVector  *RBBILineMonkey::charClasses() {
3312     return fSets;
3313 }
3314 
3315 
// Destructor.  Releases, with plain `delete`, every heap object referenced by
// the members of this line-break monkey.
RBBILineMonkey::~RBBILineMonkey() {
    // The vector handed out by charClasses().
    // NOTE(review): presumably fSets does not own the sets it lists, since the
    // individual set members are deleted one by one below - confirm.
    delete fSets;

    // Per-class members, one per line break class used by the monkey rules
    // (for example fBK, fSP, fZW are queried with contains() in next()).
    delete fBK;
    delete fCR;
    delete fLF;
    delete fCM;
    delete fNL;
    delete fWJ;
    delete fZW;
    delete fGL;
    delete fCB;
    delete fSP;
    delete fB2;
    delete fBA;
    delete fBB;
    delete fHH;
    delete fHY;
    delete fH2;
    delete fH3;
    delete fCL;
    delete fCP;
    delete fEX;
    delete fIN;
    delete fJL;
    delete fJV;
    delete fJT;
    delete fNS;
    delete fOP;
    delete fQU;
    delete fIS;
    delete fNU;
    delete fPO;
    delete fPR;
    delete fSY;
    delete fAI;
    delete fAL;
    delete fCJ;
    delete fHL;
    delete fID;
    delete fRI;
    delete fSG;
    delete fXX;
    delete fEB;
    delete fEM;
    delete fZWJ;
    delete fOP30;
    delete fCP30;
    delete fExtPictUnassigned;

    // Helper objects: fNumberMatcher is the matcher used for the "LB 25 Numbers"
    // rule in next(); fCharBI is presumably a character break iterator - confirm.
    delete fCharBI;
    delete fNumberMatcher;
}
3369 
3370 
3371 //-------------------------------------------------------------------------------------------
3372 //
3373 //   TestMonkey
3374 //
3375 //     params
3376 //       seed=nnnnn        Random number starting seed.
3377 //                         Setting the seed allows errors to be reproduced.
3378 //       loop=nnn          Looping count.  Controls running time.
3379 //                         -1:  run forever.
3380 //                          0 or greater:  run length.
3381 //
3382 //       type = char | word | line | sent | title
3383 //
3384 //  Example:
3385 //     intltest  rbbi/RBBITest/TestMonkey@"type=line loop=-1"
3386 //
3387 //-------------------------------------------------------------------------------------------
3388 
getIntParam(UnicodeString name,UnicodeString & params,int32_t defaultVal)3389 static int32_t  getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
3390     int32_t val = defaultVal;
3391     name.append(" *= *(-?\\d+)");
3392     UErrorCode status = U_ZERO_ERROR;
3393     RegexMatcher m(name, params, 0, status);
3394     if (m.find()) {
3395         // The param exists.  Convert the string to an int.
3396         char valString[100];
3397         int32_t paramLength = m.end(1, status) - m.start(1, status);
3398         if (paramLength >= (int32_t)(sizeof(valString)-1)) {
3399             paramLength = (int32_t)(sizeof(valString)-2);
3400         }
3401         params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
3402         val = strtol(valString, NULL, 10);
3403 
3404         // Delete this parameter from the params string.
3405         m.reset();
3406         params = m.replaceFirst("", status);
3407     }
3408     U_ASSERT(U_SUCCESS(status));
3409     return val;
3410 }
3411 #endif
3412 
3413 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
testBreakBoundPreceding(RBBITest * test,UnicodeString ustr,BreakIterator * bi,int expected[],int expectedcount)3414 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
3415                                     BreakIterator *bi,
3416                                     int expected[],
3417                                     int expectedcount)
3418 {
3419     int count = 0;
3420     int i = 0;
3421     int forward[50];
3422     bi->setText(ustr);
3423     for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3424         forward[count] = i;
3425         if (count < expectedcount && expected[count] != i) {
3426             test->errln("%s:%d break forward test failed: expected %d but got %d",
3427                         __FILE__, __LINE__, expected[count], i);
3428             break;
3429         }
3430         count ++;
3431     }
3432     if (count != expectedcount) {
3433         printStringBreaks(ustr, expected, expectedcount);
3434         test->errln("%s:%d break forward test failed: missed %d match",
3435                     __FILE__, __LINE__, expectedcount - count);
3436         return;
3437     }
3438     // testing boundaries
3439     for (i = 1; i < expectedcount; i ++) {
3440         int j = expected[i - 1];
3441         if (!bi->isBoundary(j)) {
3442             printStringBreaks(ustr, expected, expectedcount);
3443             test->errln("%s:%d isBoundary() failed.  Expected boundary at position %d",
3444                     __FILE__, __LINE__, j);
3445             return;
3446         }
3447         for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
3448             if (bi->isBoundary(j)) {
3449                 printStringBreaks(ustr, expected, expectedcount);
3450                 test->errln("%s:%d isBoundary() failed.  Not expecting boundary at position %d",
3451                     __FILE__, __LINE__, j);
3452                 return;
3453             }
3454         }
3455     }
3456 
3457     for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
3458         count --;
3459         if (forward[count] != i) {
3460             printStringBreaks(ustr, expected, expectedcount);
3461             test->errln("%s:%d happy break test previous() failed: expected %d but got %d",
3462                         __FILE__, __LINE__, forward[count], i);
3463             break;
3464         }
3465     }
3466     if (count != 0) {
3467         printStringBreaks(ustr, expected, expectedcount);
3468         test->errln("break test previous() failed: missed a match");
3469         return;
3470     }
3471 
3472     // testing preceding
3473     for (i = 0; i < expectedcount - 1; i ++) {
3474         // int j = expected[i] + 1;
3475         int j = ustr.moveIndex32(expected[i], 1);
3476         for (; j <= expected[i + 1]; j ++) {
3477             int32_t expectedPreceding = expected[i];
3478             int32_t actualPreceding = bi->preceding(j);
3479             if (actualPreceding != expectedPreceding) {
3480                 printStringBreaks(ustr, expected, expectedcount);
3481                 test->errln("%s:%d preceding(%d): expected %d, got %d",
3482                         __FILE__, __LINE__, j, expectedPreceding, actualPreceding);
3483                 return;
3484             }
3485         }
3486     }
3487 }
3488 #endif
3489 
TestWordBreaks(void)3490 void RBBITest::TestWordBreaks(void)
3491 {
3492 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3493 
3494     Locale        locale("en");
3495     UErrorCode    status = U_ZERO_ERROR;
3496     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3497     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3498     // Replaced any C+J characters in a row with a random sequence of characters
3499     // of the same length to make our C+J segmentation not get in the way.
3500     static const char *strlist[] =
3501     {
3502     "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3503     "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
3504     "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
3505     "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3506     "\\uac00\\u3588\\u009c\\u0953\\u194b",
3507     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3508     "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3509     "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
3510     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3511     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3512     "\\u2027\\U000e0067\\u0a47\\u00b7",
3513     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3514     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3515     "\\u0589\\U000e006e\\u0a42\\U000104a5",
3516     "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
3517     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3518     "\\u0027\\u11af\\U000e0057\\u0602",
3519     "\\U0001d7f2\\U000e007\\u0004\\u0589",
3520     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3521     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3522     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3523     "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3524     "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3525     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3526     "\\u0233\\U000e0020\\u0a69\\u0d6a",
3527     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3528     "\\u18f4\\U000e0049\\u20e7\\u2027",
3529     "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3530     "\\ua183\\u102d\\u0bec\\u003a",
3531     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3532     "\\u003a\\u0e57\\u0fad\\u002e",
3533     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3534     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3535     "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3536     "\\u003a\\u0664\\u00b7\\u1fba",
3537     "\\u003b\\u0027\\u00b7\\u47a3",
3538     "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
3539     "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3540     "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3541     };
3542     int loop;
3543     if (U_FAILURE(status)) {
3544         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3545         return;
3546     }
3547     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3548         // printf("looping %d\n", loop);
3549         UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
3550         // RBBICharMonkey monkey;
3551         RBBIWordMonkey monkey;
3552 
3553         int expected[50];
3554         int expectedcount = 0;
3555 
3556         monkey.setText(ustr);
3557         int i;
3558         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3559             expected[expectedcount ++] = i;
3560         }
3561 
3562         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3563     }
3564     delete bi;
3565 #endif
3566 }
3567 
TestWordBoundary(void)3568 void RBBITest::TestWordBoundary(void)
3569 {
3570     // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3571     Locale        locale("en");
3572     UErrorCode    status = U_ZERO_ERROR;
3573     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3574     LocalPointer<BreakIterator> bi(BreakIterator::createWordInstance(locale, status), status);
3575     if (U_FAILURE(status)) {
3576         errcheckln(status, "%s:%d Creation of break iterator failed %s",
3577                 __FILE__, __LINE__, u_errorName(status));
3578         return;
3579     }
3580     UChar         str[50];
3581     static const char *strlist[] =
3582     {
3583     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3584     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3585     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3586     "\\u2027\\U000e0067\\u0a47\\u00b7",
3587     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3588     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3589     "\\u0589\\U000e006e\\u0a42\\U000104a5",
3590     "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3591     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3592     "\\u0027\\u11af\\U000e0057\\u0602",
3593     "\\U0001d7f2\\U000e007\\u0004\\u0589",
3594     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3595     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3596     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3597     "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3598     "\\U000e0065\\u302c\\u09ee\\U000e0068",
3599     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3600     "\\u0233\\U000e0020\\u0a69\\u0d6a",
3601     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3602     "\\u58f4\\U000e0049\\u20e7\\u2027",
3603     "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3604     "\\ua183\\u102d\\u0bec\\u003a",
3605     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3606     "\\u003a\\u0e57\\u0fad\\u002e",
3607     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3608     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3609     "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3610     "\\u003a\\u0664\\u00b7\\u1fba",
3611     "\\u003b\\u0027\\u00b7\\u47a3",
3612     };
3613     int loop;
3614     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3615         u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
3616         UnicodeString ustr(str);
3617         int forward[50];
3618         int count = 0;
3619 
3620         bi->setText(ustr);
3621         int prev = -1;
3622         for (int32_t boundary = bi->first(); boundary != BreakIterator::DONE; boundary = bi->next()) {
3623             ++count;
3624             if (count >= UPRV_LENGTHOF(forward)) {
3625                 errln("%s:%d too many breaks found. (loop, count, boundary) = (%d, %d, %d)",
3626                         __FILE__, __LINE__, loop, count, boundary);
3627                 return;
3628             }
3629             forward[count] = boundary;
3630             if (boundary <= prev) {
3631                 errln("%s:%d bi::next() did not advance. (loop, prev, boundary) = (%d, %d, %d)\n",
3632                         __FILE__, __LINE__, loop, prev, boundary);
3633                 break;
3634             }
3635             for (int32_t nonBoundary = prev + 1; nonBoundary < boundary; nonBoundary ++) {
3636                 if (bi->isBoundary(nonBoundary)) {
3637                     printStringBreaks(ustr, forward, count);
3638                     errln("%s:%d isBoundary(nonBoundary) failed. (loop, prev, nonBoundary, boundary) = (%d, %d, %d, %d)",
3639                            __FILE__, __LINE__, loop, prev, nonBoundary, boundary);
3640                     return;
3641                 }
3642             }
3643             if (!bi->isBoundary(boundary)) {
3644                 printStringBreaks(ustr, forward, count);
3645                 errln("%s:%d happy boundary test failed: expected %d a boundary",
3646                        __FILE__, __LINE__, boundary);
3647                 return;
3648             }
3649             prev = boundary;
3650         }
3651     }
3652 }
3653 
TestLineBreaks(void)3654 void RBBITest::TestLineBreaks(void)
3655 {
3656 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3657     Locale        locale("en");
3658     UErrorCode    status = U_ZERO_ERROR;
3659     BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3660     const int32_t  STRSIZE = 50;
3661     UChar         str[STRSIZE];
3662     static const char *strlist[] =
3663     {
3664      "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
3665      "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
3666              "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
3667      "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
3668              "u2014\\U000e0105\\u118c\\u000a\\u07f8",
3669      "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
3670      "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3671      "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
3672      "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3673      "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
3674      "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
3675      "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
3676      "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
3677      "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
3678      "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
3679      "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
3680      "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
3681      "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
3682      "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
3683      "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
3684      "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
3685      "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
3686      "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
3687      "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
3688      "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
3689      "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
3690      "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
3691      "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
3692      "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
3693      "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
3694      "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
3695      "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
3696      "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
3697      "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
3698      "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
3699      "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
3700      "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
3701      "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
3702      "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
3703          "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
3704     };
3705     int loop;
3706     TEST_ASSERT_SUCCESS(status);
3707     if (U_FAILURE(status)) {
3708         return;
3709     }
3710     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3711         // printf("looping %d\n", loop);
3712         int32_t t = u_unescape(strlist[loop], str, STRSIZE);
3713         if (t >= STRSIZE) {
3714             TEST_ASSERT(FALSE);
3715             continue;
3716         }
3717 
3718 
3719         UnicodeString ustr(str);
3720         RBBILineMonkey monkey;
3721         if (U_FAILURE(monkey.deferredStatus)) {
3722             continue;
3723         }
3724 
3725         const int EXPECTEDSIZE = 50;
3726         int expected[EXPECTEDSIZE];
3727         int expectedcount = 0;
3728 
3729         monkey.setText(ustr);
3730 
3731         int i;
3732         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3733             if (expectedcount >= EXPECTEDSIZE) {
3734                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3735                 return;
3736             }
3737             expected[expectedcount ++] = i;
3738         }
3739 
3740         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3741     }
3742     delete bi;
3743 #endif
3744 }
3745 
TestSentBreaks(void)3746 void RBBITest::TestSentBreaks(void)
3747 {
3748 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3749     Locale        locale("en");
3750     UErrorCode    status = U_ZERO_ERROR;
3751     BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
3752     UChar         str[200];
3753     static const char *strlist[] =
3754     {
3755      "Now\ris\nthe\r\ntime\n\rfor\r\r",
3756      "This\n",
3757      "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
3758      "\"Sentence ending with a quote.\" Bye.",
3759      "  (This is it).  Testing the sentence iterator. \"This isn't it.\"",
3760      "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
3761      "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
3762      "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
3763      "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
3764      "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
3765      "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
3766              "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
3767              "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
3768              "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
3769      "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
3770              "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
3771              "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
3772              "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
3773              "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
3774              "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
3775     };
3776     int loop;
3777     if (U_FAILURE(status)) {
3778         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3779         return;
3780     }
3781     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3782         u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
3783         UnicodeString ustr(str);
3784 
3785         RBBISentMonkey monkey;
3786         if (U_FAILURE(monkey.deferredStatus)) {
3787             continue;
3788         }
3789 
3790         const int EXPECTEDSIZE = 50;
3791         int expected[EXPECTEDSIZE];
3792         int expectedcount = 0;
3793 
3794         monkey.setText(ustr);
3795 
3796         int i;
3797         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3798             if (expectedcount >= EXPECTEDSIZE) {
3799                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3800                 return;
3801             }
3802             expected[expectedcount ++] = i;
3803         }
3804 
3805         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3806     }
3807     delete bi;
3808 #endif
3809 }
3810 
TestMonkey()3811 void RBBITest::TestMonkey() {
3812 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3813 
3814     UErrorCode     status    = U_ZERO_ERROR;
3815     int32_t        loopCount = 500;
3816     int32_t        seed      = 1;
3817     UnicodeString  breakType = "all";
3818     Locale         locale("en");
3819     UBool          useUText  = FALSE;
3820 
3821     if (quick == FALSE) {
3822         loopCount = 10000;
3823     }
3824 
3825     if (fTestParams) {
3826         UnicodeString p(fTestParams);
3827         loopCount = getIntParam("loop", p, loopCount);
3828         seed      = getIntParam("seed", p, seed);
3829 
3830         RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
3831         if (m.find()) {
3832             breakType = m.group(1, status);
3833             m.reset();
3834             p = m.replaceFirst("", status);
3835         }
3836 
3837         RegexMatcher u(" *utext", p, 0, status);
3838         if (u.find()) {
3839             useUText = TRUE;
3840             u.reset();
3841             p = u.replaceFirst("", status);
3842         }
3843 
3844 
3845         // m.reset(p);
3846         if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
3847             // Each option is stripped out of the option string as it is processed.
3848             // All options have been checked.  The option string should have been completely emptied..
3849             char buf[100];
3850             p.extract(buf, sizeof(buf), NULL, status);
3851             buf[sizeof(buf)-1] = 0;
3852             errln("Unrecognized or extra parameter:  %s\n", buf);
3853             return;
3854         }
3855 
3856     }
3857 
3858     if (breakType == "char" || breakType == "all") {
3859         RBBICharMonkey  m;
3860         BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3861         if (U_SUCCESS(status)) {
3862             RunMonkey(bi, m, "char", seed, loopCount, useUText);
3863             if (breakType == "all" && useUText==FALSE) {
3864                 // Also run a quick test with UText when "all" is specified
3865                 RunMonkey(bi, m, "char", seed, loopCount, TRUE);
3866             }
3867         }
3868         else {
3869             errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
3870         }
3871         delete bi;
3872     }
3873 
3874     if (breakType == "word" || breakType == "all") {
3875         logln("Word Break Monkey Test");
3876         RBBIWordMonkey  m;
3877         BreakIterator  *bi = BreakIterator::createWordInstance(locale, status);
3878         if (U_SUCCESS(status)) {
3879             RunMonkey(bi, m, "word", seed, loopCount, useUText);
3880         }
3881         else {
3882             errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
3883         }
3884         delete bi;
3885     }
3886 
3887     if (breakType == "line" || breakType == "all") {
3888         logln("Line Break Monkey Test");
3889         RBBILineMonkey  m;
3890         BreakIterator  *bi = BreakIterator::createLineInstance(locale, status);
3891         if (loopCount >= 10) {
3892             loopCount = loopCount / 5;   // Line break runs slower than the others.
3893         }
3894         if (U_SUCCESS(status)) {
3895             RunMonkey(bi, m, "line", seed, loopCount, useUText);
3896         }
3897         else {
3898             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
3899         }
3900         delete bi;
3901     }
3902 
3903     if (breakType == "sent" || breakType == "all"  ) {
3904         logln("Sentence Break Monkey Test");
3905         RBBISentMonkey  m;
3906         BreakIterator  *bi = BreakIterator::createSentenceInstance(locale, status);
3907         if (loopCount >= 10) {
3908             loopCount = loopCount / 10;   // Sentence runs slower than the other break types
3909         }
3910         if (U_SUCCESS(status)) {
3911             RunMonkey(bi, m, "sent", seed, loopCount, useUText);
3912         }
3913         else {
3914             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
3915         }
3916         delete bi;
3917     }
3918 
3919 #endif
3920 }
3921 
3922 //
3923 //  Run a RBBI monkey test.  Common routine, for all break iterator types.
3924 //    Parameters:
3925 //       bi      - the break iterator to use
3926 //       mk      - MonkeyKind, abstraction for obtaining expected results
3927 //       name    - Name of test (char, word, etc.) for use in error messages
3928 //       seed    - Seed for starting random number generator (parameter from user)
3929 //       numIterations
3930 //
RunMonkey(BreakIterator * bi,RBBIMonkeyKind & mk,const char * name,uint32_t seed,int32_t numIterations,UBool useUText)3931 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t  seed,
3932                          int32_t numIterations, UBool useUText) {
3933 
3934 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3935 
3936     const int32_t    TESTSTRINGLEN = 500;
3937     UnicodeString    testText;
3938     int32_t          numCharClasses;
3939     UVector          *chClasses;
3940     int              expectedCount = 0;
3941     char             expectedBreaks[TESTSTRINGLEN*2 + 1];
3942     char             forwardBreaks[TESTSTRINGLEN*2 + 1];
3943     char             reverseBreaks[TESTSTRINGLEN*2+1];
3944     char             isBoundaryBreaks[TESTSTRINGLEN*2+1];
3945     char             followingBreaks[TESTSTRINGLEN*2+1];
3946     char             precedingBreaks[TESTSTRINGLEN*2+1];
3947     int              i;
3948     int              loopCount = 0;
3949 
3950 
3951     m_seed = seed;
3952 
3953     numCharClasses = mk.charClasses()->size();
3954     chClasses      = mk.charClasses();
3955 
3956     // Check for errors that occurred during the construction of the MonkeyKind object.
3957     //  Can't report them where they occurred because errln() is a method coming from intlTest,
3958     //  and is not visible outside of RBBITest :-(
3959     if (U_FAILURE(mk.deferredStatus)) {
3960         errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
3961         return;
3962     }
3963 
3964     // Verify that the character classes all have at least one member.
3965     for (i=0; i<numCharClasses; i++) {
3966         UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
3967         if (s == NULL || s->size() == 0) {
3968             errln("Character Class #%d is null or of zero size.", i);
3969             return;
3970         }
3971     }
3972 
3973     // For minimizing width of class name output.
3974     int classNameSize = mk.maxClassNameSize();
3975 
3976     while (loopCount < numIterations || numIterations == -1) {
3977         if (numIterations == -1 && loopCount % 10 == 0) {
3978             // If test is running in an infinite loop, display a periodic tic so
3979             //   we can tell that it is making progress.
3980             fprintf(stderr, ".");
3981         }
3982         // Save current random number seed, so that we can recreate the random numbers
3983         //   for this loop iteration in event of an error.
3984         seed = m_seed;
3985 
3986         // Populate a test string with data.
3987         testText.truncate(0);
3988         for (i=0; i<TESTSTRINGLEN; i++) {
3989             int32_t  aClassNum = m_rand() % numCharClasses;
3990             UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
3991             int32_t   charIdx = m_rand() % classSet->size();
3992             UChar32   c = classSet->charAt(charIdx);
3993             if (c < 0) {   // TODO:  deal with sets containing strings.
3994                 errln("%s:%d c < 0", __FILE__, __LINE__);
3995                 break;
3996             }
3997             // Do not assemble a supplementary character from randomly generated separate surrogates.
3998             //   (It could be a dictionary character)
3999             if (U16_IS_TRAIL(c) && testText.length() > 0 && U16_IS_LEAD(testText.charAt(testText.length()-1))) {
4000                 continue;
4001             }
4002 
4003             testText.append(c);
4004         }
4005 
4006         // Calculate the expected results for this test string and reset applied rules.
4007         mk.setText(testText);
4008 
4009         memset(expectedBreaks, 0, sizeof(expectedBreaks));
4010         expectedBreaks[0] = 1;
4011         int32_t breakPos = 0;
4012         expectedCount = 0;
4013         for (;;) {
4014             breakPos = mk.next(breakPos);
4015             if (breakPos == -1) {
4016                 break;
4017             }
4018             if (breakPos > testText.length()) {
4019                 errln("breakPos > testText.length()");
4020             }
4021             expectedBreaks[breakPos] = 1;
4022             expectedCount++;
4023             U_ASSERT(expectedCount<testText.length());
4024         }
4025 
4026         // Find the break positions using forward iteration
4027         memset(forwardBreaks, 0, sizeof(forwardBreaks));
4028         if (useUText) {
4029             UErrorCode status = U_ZERO_ERROR;
4030             UText *testUText = utext_openReplaceable(NULL, &testText, &status);
4031             // testUText = utext_openUnicodeString(testUText, &testText, &status);
4032             bi->setText(testUText, status);
4033             TEST_ASSERT_SUCCESS(status);
4034             utext_close(testUText);   // The break iterator does a shallow clone of the UText
4035                                       //  This UText can be closed immediately, so long as the
4036                                       //  testText string continues to exist.
4037         } else {
4038             bi->setText(testText);
4039         }
4040 
4041         for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
4042             if (i < 0 || i > testText.length()) {
4043                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4044                 break;
4045             }
4046             forwardBreaks[i] = 1;
4047         }
4048 
4049         // Find the break positions using reverse iteration
4050         memset(reverseBreaks, 0, sizeof(reverseBreaks));
4051         for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
4052             if (i < 0 || i > testText.length()) {
4053                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4054                 break;
4055             }
4056             reverseBreaks[i] = 1;
4057         }
4058 
4059         // Find the break positions using isBoundary() tests.
4060         memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
4061         U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
4062         for (i=0; i<=testText.length(); i++) {
4063             isBoundaryBreaks[i] = bi->isBoundary(i);
4064         }
4065 
4066 
4067         // Find the break positions using the following() function.
4068         // printf(".");
4069         memset(followingBreaks, 0, sizeof(followingBreaks));
4070         int32_t   lastBreakPos = 0;
4071         followingBreaks[0] = 1;
4072         for (i=0; i<testText.length(); i++) {
4073             breakPos = bi->following(i);
4074             if (breakPos <= i ||
4075                 breakPos < lastBreakPos ||
4076                 breakPos > testText.length() ||
4077                 (breakPos > lastBreakPos && lastBreakPos > i)) {
4078                 errln("%s break monkey test: "
4079                     "Out of range value returned by BreakIterator::following().\n"
4080                         "Random seed=%d  index=%d; following returned %d;  lastbreak=%d",
4081                          name, seed, i, breakPos, lastBreakPos);
4082                 break;
4083             }
4084             followingBreaks[breakPos] = 1;
4085             lastBreakPos = breakPos;
4086         }
4087 
4088         // Find the break positions using the preceding() function.
4089         memset(precedingBreaks, 0, sizeof(precedingBreaks));
4090         lastBreakPos = testText.length();
4091         precedingBreaks[testText.length()] = 1;
4092         for (i=testText.length(); i>0; i--) {
4093             breakPos = bi->preceding(i);
4094             if (breakPos >= i ||
4095                 breakPos > lastBreakPos ||
4096                 (breakPos < 0 && testText.getChar32Start(i)>0) ||
4097                 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
4098                 errln("%s break monkey test: "
4099                     "Out of range value returned by BreakIterator::preceding().\n"
4100                     "index=%d;  prev returned %d; lastBreak=%d" ,
4101                     name,  i, breakPos, lastBreakPos);
4102                 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
4103                     precedingBreaks[i] = 2;   // Forces an error.
4104                 }
4105             } else {
4106                 if (breakPos >= 0) {
4107                     precedingBreaks[breakPos] = 1;
4108                 }
4109                 lastBreakPos = breakPos;
4110             }
4111         }
4112 
4113         // Compare the expected and actual results.
4114         for (i=0; i<=testText.length(); i++) {
4115             const char *errorType = NULL;
4116             const char* currentBreakData = NULL;
4117             if  (forwardBreaks[i] != expectedBreaks[i]) {
4118                 errorType = "next()";
4119                 currentBreakData = forwardBreaks;
4120             } else if (reverseBreaks[i] != forwardBreaks[i]) {
4121                 errorType = "previous()";
4122                 currentBreakData = reverseBreaks;
4123            } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
4124                 errorType = "isBoundary()";
4125                 currentBreakData = isBoundaryBreaks;
4126             } else if (followingBreaks[i] != expectedBreaks[i]) {
4127                 errorType = "following()";
4128                 currentBreakData = followingBreaks;
4129             } else if (precedingBreaks[i] != expectedBreaks[i]) {
4130                 errorType = "preceding()";
4131                 currentBreakData = precedingBreaks;
4132             }
4133 
4134             if (errorType != NULL) {
4135                 // Format a range of the test text that includes the failure as
4136                 //  a data item that can be included in the rbbi test data file.
4137 
4138                 // Start of the range is the last point where expected and actual results
4139                 //  both agreed that there was a break position.
4140 
4141                 int startContext = i;
4142                 int32_t count = 0;
4143                 for (;;) {
4144                     if (startContext==0) { break; }
4145                     startContext --;
4146                     if (expectedBreaks[startContext] != 0) {
4147                         if (count == 2) break;
4148                         count ++;
4149                     }
4150                 }
4151 
4152                 // End of range is two expected breaks past the start position.
4153                 int endContext = i + 1;
4154                 int ci;
4155                 for (ci=0; ci<2; ci++) {  // Number of items to include in error text.
4156                     for (;;) {
4157                         if (endContext >= testText.length()) {break;}
4158                         if (expectedBreaks[endContext-1] != 0) {
4159                             if (count == 0) break;
4160                             count --;
4161                         }
4162                         endContext ++;
4163                     }
4164                 }
4165 
4166                 // Formatting of each line includes:
4167                 //   character code
4168                 //   reference break: '|' -> a break, '.' -> no break
4169                 //   actual break:    '|' -> a break, '.' -> no break
4170                 //   (name of character clase)
4171                 //   Unicode name of character
4172                 //   '-->' indicates location of the difference.
4173 
4174                 MONKEY_ERROR(
4175                     (expectedBreaks[i] ? "Break expected but not found" :
4176                        "Break found but not expected"),
4177                     name, i, seed);
4178 
4179                 for (ci=startContext; (ci = testText.moveIndex32(ci, 1));) {
4180                     UChar32  c;
4181                     c = testText.char32At(ci);
4182 
4183                     std::string currentLineFlag = "   ";
4184                     if (ci == i) {
4185                         currentLineFlag = "-->";  // Error position
4186                     }
4187 
4188                     // BMP or SMP character in hex
4189                     char hexCodePoint[12];
4190                     std::string format = "    \\u%04x";
4191                     if (c >= 0x10000) {
4192                         format = "\\U%08x";
4193                     }
4194                     sprintf(hexCodePoint, format.c_str(), c);
4195 
4196                     // Get the class name and character name for the character.
4197                     char cName[200];
4198                     UErrorCode status = U_ZERO_ERROR;
4199                     u_charName(c, U_EXTENDED_CHAR_NAME, cName, sizeof(cName), &status);
4200 
4201                     char buffer[200];
4202                     auto ret = snprintf(buffer, UPRV_LENGTHOF(buffer),
4203                              "%4s %3i :  %1s  %1s  %10s  %-*s  %-40s  %-40s",
4204                              currentLineFlag.c_str(),
4205                              ci,
4206                              expectedBreaks[ci] == 0 ? "." : "|",  // Reference break
4207                              currentBreakData[ci] == 0 ? "." : "|",  // Actual break
4208                              hexCodePoint,
4209                              classNameSize,
4210                              mk.classNameFromCodepoint(c).c_str(),
4211                              mk.getAppliedRule(ci).c_str(), cName);
4212                     (void)ret;
4213                     U_ASSERT(0 <= ret && ret < UPRV_LENGTHOF(buffer));
4214 
4215                     // Output the error
4216                     if (ci == i) {
4217                         errln(buffer);
4218                     } else {
4219                         infoln(buffer);
4220                     }
4221 
4222                     if (ci >= endContext) { break; }
4223                 }
4224                 break;
4225             }
4226         }
4227 
4228         loopCount++;
4229     }
4230 #endif
4231 }
4232 
4233 
4234 //  Bug 5532.  UTF-8 based UText fails in dictionary code.
4235 //             This test checks the initial patch,
4236 //             which is to just keep it from crashing.  Correct word boundaries
4237 //             await a proper fix to the dictionary code.
4238 //
TestBug5532(void)4239 void RBBITest::TestBug5532(void)  {
4240    // Text includes a mixture of Thai and Latin.
4241    const unsigned char utf8Data[] = {
4242            0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
4243            0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
4244            0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
4245            0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
4246            0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
4247            0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
4248            0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
4249            0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
4250            0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
4251            0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
4252            0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
4253 
4254     UErrorCode status = U_ZERO_ERROR;
4255     UText utext=UTEXT_INITIALIZER;
4256     utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);
4257     TEST_ASSERT_SUCCESS(status);
4258 
4259     BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
4260     TEST_ASSERT_SUCCESS(status);
4261     if (U_SUCCESS(status)) {
4262         bi->setText(&utext, status);
4263         TEST_ASSERT_SUCCESS(status);
4264 
4265         int32_t breakCount = 0;
4266         int32_t previousBreak = -1;
4267         for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
4268             // For now, just make sure that the break iterator doesn't hang.
4269             TEST_ASSERT(previousBreak < bi->current());
4270             previousBreak = bi->current();
4271         }
4272         TEST_ASSERT(breakCount > 0);
4273     }
4274     delete bi;
4275     utext_close(&utext);
4276 }
4277 
4278 
TestBug9983(void)4279 void RBBITest::TestBug9983(void)  {
4280     UnicodeString text = UnicodeString("\\u002A"  // * Other
4281                                        "\\uFF65"  //   Other
4282                                        "\\u309C"  //   Katakana
4283                                        "\\uFF9F"  //   Extend
4284                                        "\\uFF65"  //   Other
4285                                        "\\u0020"  //   Other
4286                                        "\\u0000").unescape();
4287 
4288     UErrorCode status = U_ZERO_ERROR;
4289     LocalPointer<RuleBasedBreakIterator> brkiter(static_cast<RuleBasedBreakIterator *>(
4290         BreakIterator::createWordInstance(Locale::getRoot(), status)));
4291     TEST_ASSERT_SUCCESS(status);
4292     LocalPointer<RuleBasedBreakIterator> brkiterPOSIX(static_cast<RuleBasedBreakIterator *>(
4293         BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status)));
4294     TEST_ASSERT_SUCCESS(status);
4295     if (U_FAILURE(status)) {
4296         return;
4297     }
4298     int32_t offset, rstatus, iterationCount;
4299 
4300     brkiter->setText(text);
4301     brkiter->last();
4302     iterationCount = 0;
4303     while ( (offset = brkiter->previous()) != UBRK_DONE ) {
4304         iterationCount++;
4305         rstatus = brkiter->getRuleStatus();
4306         (void)rstatus;     // Suppress set but not used warning.
4307         if (iterationCount >= 10) {
4308            break;
4309         }
4310     }
4311     TEST_ASSERT(iterationCount == 6);
4312 
4313     brkiterPOSIX->setText(text);
4314     brkiterPOSIX->last();
4315     iterationCount = 0;
4316     while ( (offset = brkiterPOSIX->previous()) != UBRK_DONE ) {
4317         iterationCount++;
4318         rstatus = brkiterPOSIX->getRuleStatus();
4319         (void)rstatus;     // Suppress set but not used warning.
4320         if (iterationCount >= 10) {
4321            break;
4322         }
4323     }
4324     TEST_ASSERT(iterationCount == 6);
4325 }
4326 
4327 // Bug 7547 - verify that building a break itereator from empty rules produces an error.
4328 //
TestBug7547()4329 void RBBITest::TestBug7547() {
4330     UnicodeString rules;
4331     UErrorCode status = U_ZERO_ERROR;
4332     UParseError parseError;
4333     RuleBasedBreakIterator breakIterator(rules, parseError, status);
4334     if (status != U_BRK_RULE_SYNTAX) {
4335         errln("%s:%d Expected U_BRK_RULE_SYNTAX, got %s", __FILE__, __LINE__, u_errorName(status));
4336     }
4337     if (parseError.line != 1 || parseError.offset != 0) {
4338         errln("parseError (line, offset) expected (1, 0), got (%d, %d)", parseError.line, parseError.offset);
4339     }
4340 }
4341 
4342 
TestBug12797()4343 void RBBITest::TestBug12797() {
4344     UnicodeString rules = "!!chain; !!forward; $v=b c; a b; $v; !!reverse; .*;";
4345     UErrorCode status = U_ZERO_ERROR;
4346     UParseError parseError;
4347     RuleBasedBreakIterator bi(rules, parseError, status);
4348     if (U_FAILURE(status)) {
4349         errln("%s:%s status = %s", __FILE__, __LINE__, u_errorName(status));
4350         return;
4351     }
4352     UnicodeString text = "abc";
4353     bi.setText(text);
4354     bi.first();
4355     int32_t boundary = bi.next();
4356     if (boundary != 3) {
4357         errln("%s:%d expected boundary==3, got %d", __FILE__, __LINE__, boundary);
4358     }
4359 }
4360 
TestBug12918()4361 void RBBITest::TestBug12918() {
4362     // This test triggers an assertion failure in dictbe.cpp
4363     const UChar *crasherString = u"\u3325\u4a16";
4364     UErrorCode status = U_ZERO_ERROR;
4365     UBreakIterator* iter = ubrk_open(UBRK_WORD, NULL, crasherString, -1, &status);
4366     if (U_FAILURE(status)) {
4367         dataerrln("%s:%d status = %s", __FILE__, __LINE__, u_errorName(status));
4368         return;
4369     }
4370     ubrk_first(iter);
4371     int32_t pos = 0;
4372     int32_t lastPos = -1;
4373     while((pos = ubrk_next(iter)) != UBRK_DONE) {
4374         if (pos <= lastPos) {
4375             errln("%s:%d (pos, lastPos) = (%d, %d)", __FILE__, __LINE__, pos, lastPos);
4376             break;
4377         }
4378     }
4379     ubrk_close(iter);
4380 }
4381 
TestBug12932()4382 void RBBITest::TestBug12932() {
4383     // Node Stack overflow in the RBBI rule parser caused a seg fault.
4384     UnicodeString ruleStr(
4385             "((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
4386             "((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
4387             "(((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((()))"
4388             ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
4389             ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
4390             ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))");
4391 
4392     UErrorCode status = U_ZERO_ERROR;
4393     UParseError parseError;
4394     RuleBasedBreakIterator rbbi(ruleStr, parseError, status);
4395     if (status != U_BRK_RULE_SYNTAX) {
4396         errln("%s:%d expected U_BRK_RULE_SYNTAX, got %s",
4397                 __FILE__, __LINE__, u_errorName(status));
4398     }
4399 }
4400 
4401 
4402 // Emoji Test. Verify that the sequences defined in the Unicode data file emoji-test.txt
4403 //             remain undevided by ICU char, word and line break.
TestEmoji()4404 void RBBITest::TestEmoji() {
4405 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4406     UErrorCode  status = U_ZERO_ERROR;
4407 
4408     CharString testFileName;
4409     testFileName.append(IntlTest::getSourceTestData(status), status);
4410     testFileName.appendPathPart("emoji-test.txt", status);
4411     if (U_FAILURE(status)) {
4412         errln("%s:%s %s while opening emoji-test.txt", __FILE__, __LINE__, u_errorName(status));
4413         return;
4414     }
4415     logln("Opening data file %s\n", testFileName.data());
4416 
4417     int    len;
4418     UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
4419     if (U_FAILURE(status) || testFile == NULL) {
4420         errln("%s:%s %s while opening emoji-test.txt", __FILE__, __LINE__, u_errorName(status));
4421         return;
4422     }
4423     UnicodeString testFileAsString(testFile, len);
4424     delete [] testFile;
4425 
4426     RegexMatcher lineMatcher(u"^.*?$", testFileAsString, UREGEX_MULTILINE, status);
4427     RegexMatcher hexMatcher(u"\\s*([a-f0-9]*)", UREGEX_CASE_INSENSITIVE, status);
4428     //           hexMatcher group(1) is a hex number, or empty string if no hex number present.
4429     int32_t lineNumber = 0;
4430 
4431     LocalPointer<BreakIterator> charBreaks(BreakIterator::createCharacterInstance(Locale::getEnglish(), status), status);
4432     LocalPointer<BreakIterator> wordBreaks(BreakIterator::createWordInstance(Locale::getEnglish(), status), status);
4433     LocalPointer<BreakIterator> lineBreaks(BreakIterator::createLineInstance(Locale::getEnglish(), status), status);
4434     if (U_FAILURE(status)) {
4435         dataerrln("%s:%d %s while opening break iterators", __FILE__, __LINE__, u_errorName(status));
4436         return;
4437     }
4438 
4439     while (lineMatcher.find()) {
4440         ++lineNumber;
4441         UnicodeString line = lineMatcher.group(status);
4442         hexMatcher.reset(line);
4443         UnicodeString testString;   // accumulates the emoji sequence.
4444         while (hexMatcher.find() && hexMatcher.group(1, status).length() > 0) {
4445             UnicodeString hex = hexMatcher.group(1, status);
4446             if (hex.length() > 8) {
4447                 errln("%s:%d emoji-test.txt:%d invalid code point %s", __FILE__, __LINE__, lineNumber, CStr(hex)());
4448                 break;
4449             }
4450             CharString hex8;
4451             hex8.appendInvariantChars(hex, status);
4452             UChar32 c = (UChar32)strtol(hex8.data(), NULL, 16);
4453             if (c<=0x10ffff) {
4454                 testString.append(c);
4455             } else {
4456                 errln("%s:%d emoji-test.txt:%d Error: Unicode Character %s value out of range.",
4457                         __FILE__, __LINE__, lineNumber, hex8.data());
4458                 break;
4459             }
4460         }
4461 
4462         if (testString.length() > 1) {
4463             charBreaks->setText(testString);
4464             charBreaks->first();
4465             int32_t firstBreak = charBreaks->next();
4466             if (testString.length() != firstBreak) {
4467                 errln("%s:%d  emoji-test.txt:%d Error, uexpected break at offset %d",
4468                         __FILE__, __LINE__, lineNumber, firstBreak);
4469             }
4470             wordBreaks->setText(testString);
4471             wordBreaks->first();
4472             firstBreak = wordBreaks->next();
4473             if (testString.length() != firstBreak) {
4474                 errln("%s:%d  emoji-test.txt:%d Error, uexpected break at offset %d",
4475                         __FILE__, __LINE__, lineNumber, firstBreak);
4476             }
4477             lineBreaks->setText(testString);
4478             lineBreaks->first();
4479             firstBreak = lineBreaks->next();
4480             if (testString.length() != firstBreak) {
4481                 errln("%s:%d  emoji-test.txt:%d Error, uexpected break at offset %d",
4482                         __FILE__, __LINE__, lineNumber, firstBreak);
4483             }
4484         }
4485     }
4486 #endif
4487 }
4488 
4489 
4490 // TestBug12519  -  Correct handling of Locales by assignment / copy / clone
4491 
TestBug12519()4492 void RBBITest::TestBug12519() {
4493     UErrorCode status = U_ZERO_ERROR;
4494     LocalPointer<RuleBasedBreakIterator> biEn((RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status));
4495     LocalPointer<RuleBasedBreakIterator> biFr((RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getFrance(), status));
4496     if (!assertSuccess(WHERE, status)) {
4497         dataerrln("%s %d status = %s", __FILE__, __LINE__, u_errorName(status));
4498         return;
4499     }
4500     assertTrue(WHERE, Locale::getEnglish() == biEn->getLocale(ULOC_VALID_LOCALE, status));
4501 
4502     assertTrue(WHERE, Locale::getFrench() == biFr->getLocale(ULOC_VALID_LOCALE, status));
4503     assertTrue(WHERE "Locales do not participate in BreakIterator equality.", *biEn == *biFr);
4504 
4505     LocalPointer<RuleBasedBreakIterator>cloneEn(biEn->clone());
4506     assertTrue(WHERE, *biEn == *cloneEn);
4507     assertTrue(WHERE, Locale::getEnglish() == cloneEn->getLocale(ULOC_VALID_LOCALE, status));
4508 
4509     LocalPointer<RuleBasedBreakIterator>cloneFr(biFr->clone());
4510     assertTrue(WHERE, *biFr == *cloneFr);
4511     assertTrue(WHERE, Locale::getFrench() == cloneFr->getLocale(ULOC_VALID_LOCALE, status));
4512 
4513     LocalPointer<RuleBasedBreakIterator>biDe((RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getGerman(), status));
4514     UnicodeString text("Hallo Welt");
4515     biDe->setText(text);
4516     assertTrue(WHERE "before assignment of \"biDe = biFr\", they should be different, but are equal.", *biFr != *biDe);
4517     *biDe = *biFr;
4518     assertTrue(WHERE "after assignment of \"biDe = biFr\", they should be equal, but are not.", *biFr == *biDe);
4519 }
4520 
TestBug12677()4521 void RBBITest::TestBug12677() {
4522     // Check that stripping of comments from rules for getRules() is not confused by
4523     // the presence of '#' characters in the rules that do not introduce comments.
4524     UnicodeString rules(u"!!forward; \n"
4525                          "$x = [ab#];  # a set with a # literal. \n"
4526                          " # .;        # a comment that looks sort of like a rule.   \n"
4527                          " '#' '?';    # a rule with a quoted #   \n"
4528                        );
4529 
4530     UErrorCode status = U_ZERO_ERROR;
4531     UParseError pe;
4532     RuleBasedBreakIterator bi(rules, pe, status);
4533     assertSuccess(WHERE, status);
4534     UnicodeString rtRules = bi.getRules();
4535     assertEquals(WHERE, UnicodeString(u"!!forward;$x=[ab#];'#''?';"),  rtRules);
4536 }
4537 
4538 
TestTableRedundancies()4539 void RBBITest::TestTableRedundancies() {
4540     UErrorCode status = U_ZERO_ERROR;
4541 
4542     LocalPointer<RuleBasedBreakIterator> bi (
4543         (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status));
4544     assertSuccess(WHERE, status);
4545     if (U_FAILURE(status)) return;
4546 
4547     RBBIDataWrapper *dw = bi->fData;
4548     const RBBIStateTable *fwtbl = dw->fForwardTable;
4549     UBool in8Bits = fwtbl->fFlags & RBBI_8BITS_ROWS;
4550     int32_t numCharClasses = dw->fHeader->fCatCount;
4551     // printf("Char Classes: %d     states: %d\n", numCharClasses, fwtbl->fNumStates);
4552 
4553     // Check for duplicate columns (character categories)
4554 
4555     std::vector<UnicodeString> columns;
4556     for (int32_t column = 0; column < numCharClasses; column++) {
4557         UnicodeString s;
4558         for (int32_t r = 1; r < (int32_t)fwtbl->fNumStates; r++) {
4559             RBBIStateTableRow  *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r));
4560             s.append(in8Bits ? row->r8.fNextState[column] : row->r16.fNextState[column]);
4561         }
4562         columns.push_back(s);
4563     }
4564     // Ignore column (char class) 0 while checking; it's special, and may have duplicates.
4565     for (int c1=1; c1<numCharClasses; c1++) {
4566         int limit = c1 < (int)fwtbl->fDictCategoriesStart ? fwtbl->fDictCategoriesStart : numCharClasses;
4567         for (int c2 = c1+1; c2 < limit; c2++) {
4568             if (columns.at(c1) == columns.at(c2)) {
4569                 errln("%s:%d Duplicate columns (%d, %d)\n", __FILE__, __LINE__, c1, c2);
4570                 goto out;
4571             }
4572         }
4573     }
4574   out:
4575 
4576     // Check for duplicate states
4577     std::vector<UnicodeString> rows;
4578     for (int32_t r=0; r < (int32_t)fwtbl->fNumStates; r++) {
4579         UnicodeString s;
4580         RBBIStateTableRow  *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r));
4581         if (in8Bits) {
4582             s.append(row->r8.fAccepting);
4583             s.append(row->r8.fLookAhead);
4584             s.append(row->r8.fTagsIdx);
4585             for (int32_t column = 0; column < numCharClasses; column++) {
4586                 s.append(row->r8.fNextState[column]);
4587             }
4588         } else {
4589             s.append(row->r16.fAccepting);
4590             s.append(row->r16.fLookAhead);
4591             s.append(row->r16.fTagsIdx);
4592             for (int32_t column = 0; column < numCharClasses; column++) {
4593                 s.append(row->r16.fNextState[column]);
4594             }
4595         }
4596         rows.push_back(s);
4597     }
4598     for (int r1=0; r1 < (int32_t)fwtbl->fNumStates; r1++) {
4599         for (int r2 = r1+1; r2 < (int32_t)fwtbl->fNumStates; r2++) {
4600             if (rows.at(r1) == rows.at(r2)) {
4601                 errln("%s:%d Duplicate rows (%d, %d)\n", __FILE__, __LINE__, r1, r2);
4602                 return;
4603             }
4604         }
4605     }
4606 }
4607 
4608 // Bug 13447: verify that getRuleStatus() returns the value corresponding to current(),
4609 //            even after next() has returned DONE.
4610 
TestBug13447()4611 void RBBITest::TestBug13447() {
4612     UErrorCode status = U_ZERO_ERROR;
4613     LocalPointer<RuleBasedBreakIterator> bi(
4614         (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status));
4615     assertSuccess(WHERE, status);
4616     if (U_FAILURE(status)) return;
4617     UnicodeString data(u"1234");
4618     bi->setText(data);
4619     assertEquals(WHERE, UBRK_WORD_NONE, bi->getRuleStatus());
4620     assertEquals(WHERE, 4, bi->next());
4621     assertEquals(WHERE, UBRK_WORD_NUMBER, bi->getRuleStatus());
4622     assertEquals(WHERE, UBRK_DONE, bi->next());
4623     assertEquals(WHERE, 4, bi->current());
4624     assertEquals(WHERE, UBRK_WORD_NUMBER, bi->getRuleStatus());
4625 }
4626 
4627 //  TestReverse exercises both the synthesized safe reverse rules and the logic
4628 //  for filling the break iterator cache when starting from random positions
4629 //  in the text.
4630 //
4631 //  It's a monkey test, working on random data, with the expected data obtained
4632 //  from forward iteration (no safe rules involved), comparing with results
4633 //  when indexing into the interior of the string (safe rules needed).
4634 
TestReverse()4635 void RBBITest::TestReverse() {
4636     UErrorCode status = U_ZERO_ERROR;
4637 
4638     TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4639             BreakIterator::createCharacterInstance(Locale::getEnglish(), status)));
4640     assertSuccess(WHERE, status, true);
4641     status = U_ZERO_ERROR;
4642     TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4643             BreakIterator::createWordInstance(Locale::getEnglish(), status)));
4644     assertSuccess(WHERE, status, true);
4645     status = U_ZERO_ERROR;
4646     TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4647             BreakIterator::createLineInstance(Locale::getEnglish(), status)));
4648     assertSuccess(WHERE, status, true);
4649     status = U_ZERO_ERROR;
4650     TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4651             BreakIterator::createSentenceInstance(Locale::getEnglish(), status)));
4652     assertSuccess(WHERE, status, true);
4653 }
4654 
TestReverse(std::unique_ptr<RuleBasedBreakIterator> bi)4655 void RBBITest::TestReverse(std::unique_ptr<RuleBasedBreakIterator>bi) {
4656     if (!bi) {
4657         return;
4658     }
4659 
4660     // From the mapping trie in the break iterator's internal data, create a
4661     // vector of UnicodeStrings, one for each character category, containing
4662     // all of the code points that map to that category. Unicode planes 0 and 1 only,
4663     // to avoid an execess of unassigned code points.
4664 
4665     RBBIDataWrapper *data = bi->fData;
4666     int32_t categoryCount = data->fHeader->fCatCount;
4667     UCPTrie *trie = data->fTrie;
4668     bool use8BitsTrie = ucptrie_getValueWidth(trie) == UCPTRIE_VALUE_BITS_8;
4669     uint32_t dictBit = use8BitsTrie ? 0x0080 : 0x4000;
4670 
4671     std::vector<UnicodeString> strings(categoryCount, UnicodeString());
4672     for (int cp=0; cp<0x1fff0; ++cp) {
4673         int cat = ucptrie_get(trie, cp);
4674         cat &= ~dictBit;    // And off the dictionary bit from the category.
4675         assertTrue(WHERE, cat < categoryCount && cat >= 0);
4676         if (cat < 0 || cat >= categoryCount) return;
4677         strings[cat].append(cp);
4678     }
4679 
4680     icu_rand randomGen;
4681     const int testStringLength = 10000;
4682     UnicodeString testString;
4683 
4684     for (int i=0; i<testStringLength; ++i) {
4685         int charClass = randomGen() % categoryCount;
4686         if (strings[charClass].length() > 0) {
4687             int cp = strings[charClass].char32At(randomGen() % strings[charClass].length());
4688             testString.append(cp);
4689         }
4690     }
4691 
4692     typedef std::pair<UBool, int32_t> Result;
4693     std::vector<Result> expectedResults;
4694     bi->setText(testString);
4695     for (int i=0; i<testString.length(); ++i) {
4696         bool isboundary = bi->isBoundary(i);
4697         int  ruleStatus = bi->getRuleStatus();
4698         expectedResults.push_back(std::make_pair(isboundary, ruleStatus));
4699     }
4700 
4701     for (int i=testString.length()-1; i>=0; --i) {
4702         bi->setText(testString);   // clears the internal break cache
4703         Result expected = expectedResults[i];
4704         assertEquals(WHERE, expected.first, bi->isBoundary(i));
4705         assertEquals(WHERE, expected.second, bi->getRuleStatus());
4706     }
4707 }
4708 
4709 
4710 // Ticket 13692 - finding word boundaries in very large numbers or words could
4711 //                be very time consuming. When the problem was present, this void test
4712 //                would run more than fifteen minutes, which is to say, the failure was noticeale.
4713 
TestBug13692()4714 void RBBITest::TestBug13692() {
4715     UErrorCode status = U_ZERO_ERROR;
4716     LocalPointer<RuleBasedBreakIterator> bi ((RuleBasedBreakIterator *)
4717             BreakIterator::createWordInstance(Locale::getEnglish(), status), status);
4718     if (!assertSuccess(WHERE, status, true)) {
4719         return;
4720     }
4721     constexpr int32_t LENGTH = 1000000;
4722     UnicodeString longNumber(LENGTH, (UChar32)u'3', LENGTH);
4723     for (int i=0; i<20; i+=2) {
4724         longNumber.setCharAt(i, u' ');
4725     }
4726     bi->setText(longNumber);
4727     assertFalse(WHERE, bi->isBoundary(LENGTH-5));
4728     assertSuccess(WHERE, status);
4729 }
4730 
4731 
TestProperties()4732 void RBBITest::TestProperties() {
4733     UErrorCode errorCode = U_ZERO_ERROR;
4734     UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode);
4735     if (!prependSet.isEmpty()) {
4736         errln(
4737             "[:GCB=Prepend:] is not empty any more. "
4738             "Uncomment relevant lines in source/data/brkitr/char.txt and "
4739             "change this test to the opposite condition.");
4740     }
4741 }
4742 
4743 
4744 //
4745 //  TestDebug    -  A place-holder test for debugging purposes.
4746 //                  For putting in fragments of other tests that can be invoked
4747 //                  for tracing  without a lot of unwanted extra stuff happening.
4748 //
TestDebug(void)4749 void RBBITest::TestDebug(void) {
4750     UErrorCode status = U_ZERO_ERROR;
4751     LocalPointer<RuleBasedBreakIterator> bi ((RuleBasedBreakIterator *)
4752             BreakIterator::createCharacterInstance(Locale::getEnglish(), status), status);
4753     if (!assertSuccess(WHERE, status, true)) {
4754         return;
4755     }
4756     const UnicodeString &rules = bi->getRules();
4757     UParseError pe;
4758     LocalPointer<RuleBasedBreakIterator> newbi(new RuleBasedBreakIterator(rules, pe, status));
4759     assertSuccess(WHERE, status);
4760 }
4761 
4762 
//
//  TestDebugRules   A stub test for use in debugging rule compilation problems.
//                   Can be freely altered as needed or convenient.
//                   Leave disabled - #ifdef'ed out - when not actively debugging. The rule source
//                   data files may not be available in all environments.
//                   Any permanent test cases should be moved to rbbitst.txt
//                   (see Bug 20303 in that file, for example), or to another test function in this file.
//
void RBBITest::TestDebugRules() {
#if 0
    // Inline rules to compile. Edit freely while debugging.
    const char16_t *rules = u""
        "!!quoted_literals_only; \n"
        "!!chain; \n"
        "!!lookAheadHardBreak; \n"
        " \n"
        // "[a] / ; \n"
        "[a] [b] / [c] [d]; \n"
        "[a] [b] / [c] [d] {100}; \n"
        "[x] [a] [b] / [c] [d] {100}; \n"
        "[a] [b] [c] / [d] {100}; \n"
        //" [c] [d] / [e] [f]; \n"
        //"[a] [b] / [c]; \n"
        ;

    // Read brkitr/rules/line.txt from the data directory. The contents are only
    // used if the "rules = testFile.get()" line further down is uncommented.
    UErrorCode status = U_ZERO_ERROR;
    CharString path(pathToDataDirectory(), status);
    path.appendPathPart("brkitr", status);
    path.appendPathPart("rules", status);
    path.appendPathPart("line.txt", status);
    int    len;
    std::unique_ptr<UChar []> testFile(ReadAndConvertFile(path.data(), len, "UTF-8", status));
    if (!assertSuccess(WHERE, status)) {
        return;
    }

    UParseError pe;
    // rules = testFile.get();
    RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rules, pe, status);

    if (!assertSuccess(WHERE, status)) {
        delete bi;
        return;
    }
    // bi->dumpTables();

    delete bi;
#endif
}
4811 
testTrieStateTable(int32_t numChar,bool expectedTrieWidthIn8Bits,bool expectedStateRowIn8Bits)4812 void RBBITest::testTrieStateTable(int32_t numChar, bool expectedTrieWidthIn8Bits, bool expectedStateRowIn8Bits) {
4813     UCPTrieValueWidth expectedTrieWidth = expectedTrieWidthIn8Bits ? UCPTRIE_VALUE_BITS_8 : UCPTRIE_VALUE_BITS_16;
4814     int32_t expectedStateRowBits = expectedStateRowIn8Bits ? RBBI_8BITS_ROWS : 0;
4815     // Text are duplicate characters from U+4E00 to U+4FFF
4816     UnicodeString text;
4817     for (UChar c = 0x4e00; c < 0x5000; c++) {
4818         text.append(c).append(c);
4819     }
4820     // Generate rule which will caused length+4 character classes and
4821     // length+3 states
4822     UnicodeString rules(u"!!quoted_literals_only;");
4823     for (UChar c = 0x4e00; c < 0x4e00 + numChar; c++) {
4824         rules.append(u'\'').append(c).append(c).append(u"';");
4825     }
4826     rules.append(u".;");
4827     UErrorCode status = U_ZERO_ERROR;
4828     UParseError parseError;
4829     RuleBasedBreakIterator bi(rules, parseError, status);
4830 
4831     assertEquals(WHERE, numChar + 4, bi.fData->fHeader->fCatCount);
4832     assertEquals(WHERE, numChar + 3, bi.fData->fForwardTable->fNumStates);
4833     assertEquals(WHERE, expectedTrieWidth, ucptrie_getValueWidth(bi.fData->fTrie));
4834     assertEquals(WHERE, expectedStateRowBits, bi.fData->fForwardTable->fFlags & RBBI_8BITS_ROWS);
4835     assertEquals(WHERE, expectedStateRowBits, bi.fData->fReverseTable->fFlags & RBBI_8BITS_ROWS);
4836 
4837     bi.setText(text);
4838 
4839     int32_t pos;
4840     int32_t i = 0;
4841     while ((pos = bi.next()) > 0) {
4842         // The first numChar should not break between the pair
4843         if (i++ < numChar) {
4844             assertEquals(WHERE, i * 2, pos);
4845         } else {
4846             // After the first numChar next(), break on each character.
4847             assertEquals(WHERE, i + numChar, pos);
4848         }
4849     }
4850     while ((pos = bi.previous()) > 0) {
4851         // The first numChar should not break between the pair
4852         if (--i < numChar) {
4853             assertEquals(WHERE, i * 2, pos);
4854         } else {
4855             // After the first numChar next(), break on each character.
4856             assertEquals(WHERE, i + numChar, pos);
4857         }
4858     }
4859 }
4860 
Test8BitsTrieWith8BitStateTable()4861 void RBBITest::Test8BitsTrieWith8BitStateTable() {
4862     testTrieStateTable(251, true /* expectedTrieWidthIn8Bits */, true /* expectedStateRowIn8Bits */);
4863 }
4864 
Test16BitsTrieWith8BitStateTable()4865 void RBBITest::Test16BitsTrieWith8BitStateTable() {
4866     testTrieStateTable(252, false /* expectedTrieWidthIn8Bits */, true /* expectedStateRowIn8Bits */);
4867 }
4868 
Test16BitsTrieWith16BitStateTable()4869 void RBBITest::Test16BitsTrieWith16BitStateTable() {
4870     testTrieStateTable(253, false /* expectedTrieWidthIn8Bits */, false /* expectedStateRowIn8Bits */);
4871 }
4872 
Test8BitsTrieWith16BitStateTable()4873 void RBBITest::Test8BitsTrieWith16BitStateTable() {
4874     // Test UCPTRIE_VALUE_BITS_8 with 16 bits rows. Use a different approach to
4875     // create state table in 16 bits.
4876 
4877     // Generate 510 'a' as text
4878     UnicodeString text;
4879     for (int32_t i = 0; i < 510; i++) {
4880         text.append(u'a');
4881     }
4882 
4883     UnicodeString rules(u"!!quoted_literals_only;'");
4884     // 254 'a' in the rule will cause 256 states
4885     for (int32_t i = 0; i < 254; i++) {
4886         rules.append(u'a');
4887     }
4888     rules.append(u"';.;");
4889 
4890     UErrorCode status = U_ZERO_ERROR;
4891     UParseError parseError;
4892     LocalPointer<RuleBasedBreakIterator> bi(new RuleBasedBreakIterator(rules, parseError, status));
4893 
4894     assertEquals(WHERE, 256, bi->fData->fForwardTable->fNumStates);
4895     assertEquals(WHERE, UCPTRIE_VALUE_BITS_8, ucptrie_getValueWidth(bi->fData->fTrie));
4896     assertEquals(WHERE,
4897                  false, RBBI_8BITS_ROWS == (bi->fData->fForwardTable->fFlags & RBBI_8BITS_ROWS));
4898     bi->setText(text);
4899 
4900     // break positions:
4901     // 254, 508, 509, ... 510
4902     assertEquals("next()", 254, bi->next());
4903     int32_t i = 0;
4904     int32_t pos;
4905     while ((pos = bi->next()) > 0) {
4906         assertEquals(WHERE, 508 + i , pos);
4907         i++;
4908     }
4909     i = 0;
4910     while ((pos = bi->previous()) > 0) {
4911         i++;
4912         if (pos >= 508) {
4913             assertEquals(WHERE, 510 - i , pos);
4914         } else {
4915             assertEquals(WHERE, 254 , pos);
4916         }
4917     }
4918 }
4919 
4920 // Test that both compact (8 bit) and full sized (16 bit) rbbi tables work, and
4921 // that there are no problems with rules at the size that transitions between the two.
4922 //
4923 // A rule that matches a literal string, like 'abcdefghij', will require one state and
4924 // one character class per character in the string. So we can make a rule to tickle the
4925 // boundaries by using literal strings of various lengths.
4926 //
4927 // For both the number of states and the number of character classes, the eight bit format
4928 // only has 7 bits available, allowing for 128 values. For both, a few values are reserved,
// leaving 120 something available. This test runs the string over the range of 120 - 260,
4930 // which allows some margin for changes to the number of values reserved by the rule builder
4931 // without breaking the test.
4932 
TestTable_8_16_Bits()4933 void RBBITest::TestTable_8_16_Bits() {
4934 
4935     // testStr serves as both the source of the rule string (truncated to the desired length)
4936     // and as test data to check matching behavior. A break rule consisting of the first 120
4937     // characters of testStr will match the first 120 chars of the full-length testStr.
4938     UnicodeString testStr;
4939     for (UChar c=0x3000; c<0x3200; ++c) {
4940         testStr.append(c);
4941     }
4942 
4943     const int32_t startLength = 120;   // The shortest rule string to test.
4944     const int32_t endLength = 260;     // The longest rule string to test
4945     const int32_t increment = this->quick ? endLength - startLength : 1;
4946 
4947     for (int32_t ruleLen=startLength; ruleLen <= endLength; ruleLen += increment) {
4948         UParseError parseError;
4949         UErrorCode status = U_ZERO_ERROR;
4950 
4951         UnicodeString ruleString{u"!!quoted_literals_only; '#';"};
4952         ruleString.findAndReplace(UnicodeString(u"#"), UnicodeString(testStr, 0, ruleLen));
4953         RuleBasedBreakIterator bi(ruleString, parseError, status);
4954         if (!assertSuccess(WHERE, status)) {
4955             errln(ruleString);
4956             break;
4957         }
4958         // bi.dumpTables();
4959 
4960         // Verify that the break iterator is functioning - that the first boundary found
4961         // in testStr is at the length of the rule string.
4962         bi.setText(testStr);
4963         assertEquals(WHERE, ruleLen, bi.next());
4964 
4965         // Reverse iteration. Do a setText() first, to flush the break iterator's internal cache
4966         // of previously detected boundaries, thus forcing the engine to run the safe reverse rules.
4967         bi.setText(testStr);
4968         int32_t result = bi.preceding(ruleLen);
4969         assertEquals(WHERE, 0, result);
4970 
4971         // Verify that the range of rule lengths being tested cover the translations
4972         // from 8 to 16 bit data.
4973         bool has8BitRowData = bi.fData->fForwardTable->fFlags & RBBI_8BITS_ROWS;
4974         bool has8BitsTrie = ucptrie_getValueWidth(bi.fData->fTrie) == UCPTRIE_VALUE_BITS_8;
4975 
4976         if (ruleLen == startLength) {
4977             assertEquals(WHERE, true, has8BitRowData);
4978             assertEquals(WHERE, true, has8BitsTrie);
4979         }
4980         if (ruleLen == endLength) {
4981             assertEquals(WHERE, false, has8BitRowData);
4982             assertEquals(WHERE, false, has8BitsTrie);
4983         }
4984     }
4985 }
4986 
4987 /* Test handling of a large number of look-ahead rules.
4988  * The number of rules in the test exceeds the implementation limits prior to the
4989  * improvements introduced with #13590.
4990  *
4991  * The test look-ahead rules have the form "AB / CE"; "CD / EG"; ...
4992  * The text being matched is sequential, "ABCDEFGHI..."
4993  *
4994  * The upshot is that the look-ahead rules all match on their preceding context,
4995  * and consequently must save a potential result, but then fail to match on their
4996  * trailing context, so that they don't actually cause a boundary.
4997  *
4998  * Additionally, add a ".*" rule, so there are no boundaries unless a
4999  * look-ahead hard-break rule forces one.
5000  */
TestBug13590()5001 void RBBITest::TestBug13590() {
5002     UnicodeString rules {u"!!quoted_literals_only; !!chain; .*;\n"};
5003 
5004     const int NUM_LOOKAHEAD_RULES = 50;
5005     const char16_t STARTING_CHAR = u'\u5000';
5006     char16_t firstChar;
5007     for (int ruleNum = 0; ruleNum < NUM_LOOKAHEAD_RULES; ++ruleNum) {
5008         firstChar = STARTING_CHAR + ruleNum*2;
5009         rules.append(u'\'') .append(firstChar) .append(firstChar+1) .append(u'\'')
5010              .append(u' ') .append(u'/') .append(u' ')
5011              .append(u'\'') .append(firstChar+2) .append(firstChar+4) .append(u'\'')
5012              .append(u';') .append(u'\n');
5013     }
5014 
5015     // Change the last rule added from the form "UV / WY" to "UV / WX".
5016     // Changes the rule so that it will match - all 4 chars are in ascending sequence.
5017     rules.findAndReplace(UnicodeString(firstChar+4), UnicodeString(firstChar+3));
5018 
5019     UErrorCode status = U_ZERO_ERROR;
5020     UParseError parseError;
5021     RuleBasedBreakIterator bi(rules, parseError, status);
5022     if (!assertSuccess(WHERE, status)) {
5023         errln(rules);
5024         return;
5025     }
5026     // bi.dumpTables();
5027 
5028     UnicodeString testString;
5029     for (char16_t c = STARTING_CHAR-200; c < STARTING_CHAR + NUM_LOOKAHEAD_RULES*4; ++c) {
5030         testString.append(c);
5031     }
5032     bi.setText(testString);
5033 
5034     int breaksFound = 0;
5035     while (bi.next() != UBRK_DONE) {
5036         ++breaksFound;
5037     }
5038 
5039     // Two matches are expected, one from the last rule that was explicitly modified,
5040     // and one at the end of the text.
5041     assertEquals(WHERE, 2, breaksFound);
5042 }
5043 
5044 
5045 #if U_ENABLE_TRACING
// Records of the utrace callbacks received for break iterator functions. They are
// filled in by traceData(), traceEntry() and traceExit() below, cleared by
// SetupTestTrace(), and examined by RBBITest::assertTestTraceResult().
static std::vector<std::string> gData;      // string argument passed to each data callback
static std::vector<int32_t> gEntryFn;       // function number of each entry callback
static std::vector<int32_t> gExitFn;        // function number of each exit callback
static std::vector<int32_t> gDataFn;        // function number of each data callback
5050 
traceData(const void *,int32_t fnNumber,int32_t,const char *,va_list args)5051 static void U_CALLCONV traceData(
5052         const void*,
5053         int32_t fnNumber,
5054         int32_t,
5055         const char *,
5056         va_list args) {
5057     if (UTRACE_UBRK_START <= fnNumber && fnNumber <= UTRACE_UBRK_LIMIT) {
5058         const char* data = va_arg(args, const char*);
5059         gDataFn.push_back(fnNumber);
5060         gData.push_back(data);
5061     }
5062 }
5063 
traceEntry(const void *,int32_t fnNumber)5064 static void traceEntry(const void *, int32_t fnNumber) {
5065     if (UTRACE_UBRK_START <= fnNumber && fnNumber <= UTRACE_UBRK_LIMIT) {
5066         gEntryFn.push_back(fnNumber);
5067     }
5068 }
5069 
traceExit(const void *,int32_t fnNumber,const char *,va_list)5070 static void traceExit(const void *, int32_t fnNumber, const char *, va_list) {
5071     if (UTRACE_UBRK_START <= fnNumber && fnNumber <= UTRACE_UBRK_LIMIT) {
5072         gExitFn.push_back(fnNumber);
5073     }
5074 }
5075 
5076 
assertTestTraceResult(int32_t fnNumber,const char * expectedData)5077 void RBBITest::assertTestTraceResult(int32_t fnNumber, const char* expectedData) {
5078     assertEquals("utrace_entry should be called ", 1, gEntryFn.size());
5079     assertEquals("utrace_entry should be called with ", fnNumber, gEntryFn[0]);
5080     assertEquals("utrace_exit should be called ", 1, gExitFn.size());
5081     assertEquals("utrace_exit should be called with ", fnNumber, gExitFn[0]);
5082 
5083     if (expectedData == nullptr) {
5084       assertEquals("utrace_data should not be called ", 0, gDataFn.size());
5085       assertEquals("utrace_data should not be called ", 0, gData.size());
5086     } else {
5087       assertEquals("utrace_data should be called ", 1, gDataFn.size());
5088       assertEquals("utrace_data should be called with ", fnNumber, gDataFn[0]);
5089       assertEquals("utrace_data should be called ", 1, gData.size());
5090       assertEquals("utrace_data should pass in ", expectedData, gData[0].c_str());
5091     }
5092 }
5093 
SetupTestTrace()5094 void SetupTestTrace() {
5095     gEntryFn.clear();
5096     gExitFn.clear();
5097     gDataFn.clear();
5098     gData.clear();
5099 
5100     const void* context = nullptr;
5101     utrace_setFunctions(context, traceEntry, traceExit, traceData);
5102     utrace_setLevel(UTRACE_INFO);
5103 }
5104 
TestTraceCreateCharacter(void)5105 void RBBITest::TestTraceCreateCharacter(void) {
5106     SetupTestTrace();
5107     IcuTestErrorCode status(*this, "TestTraceCreateCharacter");
5108     LocalPointer<BreakIterator> brkitr(
5109         BreakIterator::createCharacterInstance("zh-CN", status));
5110     status.errIfFailureAndReset();
5111     assertTestTraceResult(UTRACE_UBRK_CREATE_CHARACTER, nullptr);
5112 }
5113 
TestTraceCreateTitle(void)5114 void RBBITest::TestTraceCreateTitle(void) {
5115     SetupTestTrace();
5116     IcuTestErrorCode status(*this, "TestTraceCreateTitle");
5117     LocalPointer<BreakIterator> brkitr(
5118         BreakIterator::createTitleInstance("zh-CN", status));
5119     status.errIfFailureAndReset();
5120     assertTestTraceResult(UTRACE_UBRK_CREATE_TITLE, nullptr);
5121 }
5122 
TestTraceCreateSentence(void)5123 void RBBITest::TestTraceCreateSentence(void) {
5124     SetupTestTrace();
5125     IcuTestErrorCode status(*this, "TestTraceCreateSentence");
5126     LocalPointer<BreakIterator> brkitr(
5127         BreakIterator::createSentenceInstance("zh-CN", status));
5128     status.errIfFailureAndReset();
5129     assertTestTraceResult(UTRACE_UBRK_CREATE_SENTENCE, nullptr);
5130 }
5131 
TestTraceCreateWord(void)5132 void RBBITest::TestTraceCreateWord(void) {
5133     SetupTestTrace();
5134     IcuTestErrorCode status(*this, "TestTraceCreateWord");
5135     LocalPointer<BreakIterator> brkitr(
5136         BreakIterator::createWordInstance("zh-CN", status));
5137     status.errIfFailureAndReset();
5138     assertTestTraceResult(UTRACE_UBRK_CREATE_WORD, nullptr);
5139 }
5140 
TestTraceCreateLine(void)5141 void RBBITest::TestTraceCreateLine(void) {
5142     SetupTestTrace();
5143     IcuTestErrorCode status(*this, "TestTraceCreateLine");
5144     LocalPointer<BreakIterator> brkitr(
5145         BreakIterator::createLineInstance("zh-CN", status));
5146     status.errIfFailureAndReset();
5147     assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "");
5148 }
5149 
TestTraceCreateLineStrict(void)5150 void RBBITest::TestTraceCreateLineStrict(void) {
5151     SetupTestTrace();
5152     IcuTestErrorCode status(*this, "TestTraceCreateLineStrict");
5153     LocalPointer<BreakIterator> brkitr(
5154         BreakIterator::createLineInstance("zh-CN-u-lb-strict", status));
5155     status.errIfFailureAndReset();
5156     assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "strict");
5157 }
5158 
TestTraceCreateLineNormal(void)5159 void RBBITest::TestTraceCreateLineNormal(void) {
5160     SetupTestTrace();
5161     IcuTestErrorCode status(*this, "TestTraceCreateLineNormal");
5162     LocalPointer<BreakIterator> brkitr(
5163         BreakIterator::createLineInstance("zh-CN-u-lb-normal", status));
5164     status.errIfFailureAndReset();
5165     assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "normal");
5166 }
5167 
TestTraceCreateLineLoose(void)5168 void RBBITest::TestTraceCreateLineLoose(void) {
5169     SetupTestTrace();
5170     IcuTestErrorCode status(*this, "TestTraceCreateLineLoose");
5171     LocalPointer<BreakIterator> brkitr(
5172         BreakIterator::createLineInstance("zh-CN-u-lb-loose", status));
5173     status.errIfFailureAndReset();
5174     assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "loose");
5175 }
5176 
TestTraceCreateBreakEngine(void)5177 void RBBITest::TestTraceCreateBreakEngine(void) {
5178     rbbi_cleanup();
5179     SetupTestTrace();
5180     IcuTestErrorCode status(*this, "TestTraceCreateBreakEngine");
5181     LocalPointer<BreakIterator> brkitr(
5182         BreakIterator::createWordInstance("zh-CN", status));
5183     status.errIfFailureAndReset();
5184     assertTestTraceResult(UTRACE_UBRK_CREATE_WORD, nullptr);
5185 
5186     // To word break the following text, BreakIterator will create 5 dictionary
5187     // break engine internally.
5188     brkitr->setText(
5189         u"test "
5190         u"測試 " // Hani
5191         u"សាកល្បង " // Khmr
5192         u"ທົດສອບ " // Laoo
5193         u"စမ်းသပ်မှု " // Mymr
5194         u"ทดสอบ " // Thai
5195         u"test "
5196     );
5197 
5198     // Loop through all the text.
5199     while (brkitr->next() > 0) ;
5200 
5201     assertEquals("utrace_entry should be called ", 6, gEntryFn.size());
5202     assertEquals("utrace_exit should be called ", 6, gExitFn.size());
5203     assertEquals("utrace_data should be called ", 5, gDataFn.size());
5204 
5205     for (std::vector<int>::size_type i = 0; i < gDataFn.size(); i++) {
5206         assertEquals("utrace_entry should be called ",
5207                      UTRACE_UBRK_CREATE_BREAK_ENGINE, gEntryFn[i+1]);
5208         assertEquals("utrace_exit should be called ",
5209                      UTRACE_UBRK_CREATE_BREAK_ENGINE, gExitFn[i+1]);
5210         assertEquals("utrace_data should be called ",
5211                      UTRACE_UBRK_CREATE_BREAK_ENGINE, gDataFn[i]);
5212     }
5213 
5214     assertEquals("utrace_data should pass ", "Hani", gData[0].c_str());
5215     assertEquals("utrace_data should pass ", "Khmr", gData[1].c_str());
5216     assertEquals("utrace_data should pass ", "Laoo", gData[2].c_str());
5217     assertEquals("utrace_data should pass ", "Mymr", gData[3].c_str());
5218     assertEquals("utrace_data should pass ", "Thai", gData[4].c_str());
5219 
5220 }
5221 #endif
5222 
TestUnpairedSurrogate()5223 void RBBITest::TestUnpairedSurrogate() {
5224     UnicodeString rules(u"ab;");
5225 
5226     UErrorCode status = U_ZERO_ERROR;
5227     UParseError pe;
5228     RuleBasedBreakIterator bi1(rules, pe, status);
5229     assertSuccess(WHERE, status);
5230     UnicodeString rtRules = bi1.getRules();
5231     // make sure the simple one work first.
5232     assertEquals(WHERE, rules,  rtRules);
5233 
5234 
5235     rules = UnicodeString(u"a\\ud800b;").unescape();
5236     pe.line = 0;
5237     pe.offset = 0;
5238     RuleBasedBreakIterator bi2(rules, pe, status);
5239     assertEquals(WHERE "unpaired lead surrogate", U_ILLEGAL_CHAR_FOUND , status);
5240     if (pe.line != 1 || pe.offset != 1) {
5241         errln("pe (line, offset) expected (1, 1), got (%d, %d)", pe.line, pe.offset);
5242     }
5243 
5244     status = U_ZERO_ERROR;
5245     rules = UnicodeString(u"a\\ude00b;").unescape();
5246     pe.line = 0;
5247     pe.offset = 0;
5248     RuleBasedBreakIterator bi3(rules, pe, status);
5249     assertEquals(WHERE "unpaired tail surrogate", U_ILLEGAL_CHAR_FOUND , status);
5250     if (pe.line != 1 || pe.offset != 1) {
5251         errln("pe (line, offset) expected (1, 1), got (%d, %d)", pe.line, pe.offset);
5252     }
5253 
5254     // make sure the surrogate one work too.
5255     status = U_ZERO_ERROR;
5256     rules = UnicodeString(u"a��b;");
5257     RuleBasedBreakIterator bi4(rules, pe, status);
5258     rtRules = bi4.getRules();
5259     assertEquals(WHERE, rules, rtRules);
5260 }
5261 
5262 // Read file generated by
5263 // https://github.com/unicode-org/lstm_word_segmentation/blob/master/segment_text.py
5264 // as test cases and compare the Output.
5265 // Format of the file
5266 //   Model:\t[Model Name (such as 'Thai_graphclust_model4_heavy')]
5267 //   Embedding:\t[Embedding type (such as 'grapheme_clusters_tf')]
5268 //   Input:\t[source text]
5269 //   Output:\t[expected output separated by | ]
5270 //   Input: ...
5271 //   Output: ...
5272 
runLSTMTestFromFile(const char * filename,UScriptCode script)5273 void RBBITest::runLSTMTestFromFile(const char* filename, UScriptCode script) {
5274     // The expectation in this test depends on LSTM, skip the test if the
5275     // configuration is not build with LSTM data.
5276     if (skipLSTMTest()) {
5277         return;
5278     }
5279     UErrorCode   status = U_ZERO_ERROR;
5280     LocalPointer<BreakIterator> iterator(BreakIterator::createWordInstance(Locale(), status));
5281     if (U_FAILURE(status)) {
5282         errln("%s:%d Error %s Cannot create Word BreakIterator", __FILE__, __LINE__, u_errorName(status));
5283         return;
5284     }
5285     //  Open and read the test data file.
5286     const char *testDataDirectory = IntlTest::getSourceTestData(status);
5287     CharString testFileName(testDataDirectory, -1, status);
5288     testFileName.append(filename, -1, status);
5289 
5290     int len;
5291     UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
5292     if (U_FAILURE(status)) {
5293         errln("%s:%d Error %s opening test file %s", __FILE__, __LINE__, u_errorName(status), filename);
5294         return;
5295     }
5296 
5297     //  Put the test data into a UnicodeString
5298     UnicodeString testString(FALSE, testFile, len);
5299 
5300     int32_t start = 0;
5301 
5302     UnicodeString line;
5303     int32_t end;
5304     std::string actual_sep_str;
5305     int32_t caseNum = 0;
5306     // Iterate through all the lines in the test file.
5307     do {
5308         int32_t cr = testString.indexOf(u'\r', start);
5309         int32_t lf = testString.indexOf(u'\n', start);
5310         end = cr >= 0 ? (lf >= 0 ? std::min(cr, lf) : cr) : lf;
5311         line = testString.tempSubString(start, end < 0 ? INT32_MAX : end - start);
5312         if (line.length() > 0) {
5313             // Separate each line to key and value by TAB.
5314             int32_t tab = line.indexOf(u'\t');
5315             UnicodeString key = line.tempSubString(0, tab);
5316             const UnicodeString value = line.tempSubString(tab+1);
5317 
5318             if (key == "Model:") {
5319                 // Verify the expectation in the test file match the LSTM model
5320                 // we are using now.
5321                 const LSTMData* data = CreateLSTMDataForScript(script, status);
5322                 if (U_FAILURE(status)) {
5323                     dataerrln("%s:%d Error %s Cannot create LSTM data for script %s",
5324                               __FILE__, __LINE__, u_errorName(status), uscript_getName(script));
5325                     return;
5326                 }
5327                 UnicodeString name(LSTMDataName(data));
5328                 DeleteLSTMData(data);
5329                 if (value != name) {
5330                     std::string utf8Name, utf8Value;
5331                     dataerrln("%s:%d Error %s The LSTM data for script %s is %s instead of %s",
5332                               __FILE__, __LINE__, u_errorName(status), uscript_getName(script),
5333                               name.toUTF8String<std::string>(utf8Name).c_str(),
5334                               value.toUTF8String<std::string>(utf8Value).c_str());
5335                     return;
5336                 }
5337             } else if (key == "Input:") {
5338                 UnicodeString input("prefix ");
5339                 input += value + " suffix";
5340                 std::stringstream ss;
5341 
5342                 // Construct the UText which is expected by the the engine as
5343                 // input from the UnicodeString.
5344                 UText ut = UTEXT_INITIALIZER;
5345                 utext_openConstUnicodeString(&ut, &input, &status);
5346                 if (U_FAILURE(status)) {
5347                     dataerrln("Could not utext_openConstUnicodeString for " + value + UnicodeString(u_errorName(status)));
5348                     return;
5349                 }
5350 
5351                 iterator->setText(&ut, status);
5352                 if (U_FAILURE(status)) {
5353                     errln("%s:%d Error %s Could not setText to BreakIterator", __FILE__, __LINE__, u_errorName(status));
5354                     return;
5355                 }
5356 
5357                 int32_t bp;
5358                 for (bp = iterator->first(); bp != BreakIterator::DONE; bp = iterator->next()) {
5359                     ss << bp;
5360                     if (bp != input.length()) {
5361                         ss << ", ";
5362                     }
5363                 }
5364 
5365                 utext_close(&ut);
5366                 // Turn the break points into a string for easy comparison
5367                 // output.
5368                 actual_sep_str = "{" + ss.str() + "}";
5369             } else if (key == "Output:" && !actual_sep_str.empty()) {
5370                 UnicodeString input("prefix| |");
5371                 input += value + "| |suffix";
5372                 std::string d;
5373                 int32_t sep;
5374                 int32_t start = 0;
5375                 int32_t curr = 0;
5376                 std::stringstream ss;
5377                 // Include 0 as the break point.
5378                 ss << "0, ";
5379                 while ((sep = input.indexOf(u'|', start)) >= 0) {
5380                     int32_t len = sep - start;
5381                     if (len > 0) {
5382                         if (curr > 0) {
5383                             ss << ", ";
5384                         }
5385                         curr += len;
5386                         ss << curr;
5387                     }
5388                     start = sep + 1;
5389                 }
5390                 // Include end of the string as break point.
5391                 ss << ", " << curr + input.length() - start;
5392                 // Turn the break points into a string for easy comparison
5393                 // output.
5394                 std::string expected = "{" + ss.str() + "}";
5395                 std::string utf8;
5396 
5397                 assertEquals((input + " Test Case#" + caseNum).toUTF8String<std::string>(utf8).c_str(),
5398                              expected.c_str(), actual_sep_str.c_str());
5399                 actual_sep_str.clear();
5400             }
5401         }
5402         start = std::max(cr, lf) + 1;
5403     } while (end >= 0);
5404 
5405     delete [] testFile;
5406 }
5407 
TestLSTMThai()5408 void RBBITest::TestLSTMThai() {
5409     runLSTMTestFromFile("Thai_graphclust_model4_heavy_Test.txt", USCRIPT_THAI);
5410 }
5411 
TestLSTMBurmese()5412 void RBBITest::TestLSTMBurmese() {
5413     runLSTMTestFromFile("Burmese_graphclust_model5_heavy_Test.txt", USCRIPT_MYANMAR);
5414 }
5415 
5416 #endif // #if !UCONFIG_NO_BREAK_ITERATION
5417