1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
4 * COPYRIGHT:
5 * Copyright (c) 1999-2016, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 ********************************************************************/
8 /************************************************************************
9 * Date Name Description
10 * 12/15/99 Madhu Creation.
11 * 01/12/2000 Madhu Updated for changed API and added new tests
12 ************************************************************************/
13
14 #include "unicode/utypes.h"
15 #if !UCONFIG_NO_BREAK_ITERATION
16
17 #include <algorithm>
18 #include <sstream>
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include <string.h>
22 #include <utility>
23 #include <vector>
24
25 #include "unicode/brkiter.h"
26 #include "unicode/localpointer.h"
27 #include "unicode/numfmt.h"
28 #include "unicode/rbbi.h"
29 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
30 #include "unicode/regex.h"
31 #endif
32 #include "unicode/schriter.h"
33 #include "unicode/uchar.h"
34 #include "unicode/utf16.h"
35 #include "unicode/ucnv.h"
36 #include "unicode/uniset.h"
37 #include "unicode/uscript.h"
38 #include "unicode/ustring.h"
39 #include "unicode/utext.h"
40 #include "unicode/utrace.h"
41
42 #include "charstr.h"
43 #include "cmemory.h"
44 #include "cstr.h"
45 #include "intltest.h"
46 #include "lstmbe.h"
47 #include "rbbitst.h"
48 #include "rbbidata.h"
49 #include "utypeinfo.h" // for 'typeid' to work
50 #include "uvector.h"
51 #include "uvectr32.h"
52
53
54 #if !UCONFIG_NO_FILTERED_BREAK_ITERATION
55 #include "unicode/filteredbrk.h"
56 #endif // !UCONFIG_NO_FILTERED_BREAK_ITERATION
57
// Report a failure, naming this source file and line, when condition x is false.
// Must be used inside an IntlTest member function (it calls errln()).
#define TEST_ASSERT(x) UPRV_BLOCK_MACRO_BEGIN { \
    if (!(x)) { \
        errln("Failure in file %s, line %d", __FILE__, __LINE__); \
    } \
} UPRV_BLOCK_MACRO_END

// Report a failure, including the ICU error name, when errcode holds a failure code.
// Uses errcheckln() so the error code itself is handed to the test framework.
#define TEST_ASSERT_SUCCESS(errcode) UPRV_BLOCK_MACRO_BEGIN { \
    if (U_FAILURE(errcode)) { \
        errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", \
                   __FILE__, __LINE__, u_errorName(errcode)); \
    } \
} UPRV_BLOCK_MACRO_END

// Report a monkey-test failure through the global test object, together with the
// parameters (rule file name and seed) needed to reproduce the failing run.
#define MONKEY_ERROR(msg, fRuleFileName, index, seed) { \
    IntlTest::gTest->errln( \
        "\n%s:%d %s at index %d. Parameters to reproduce: @\"type=%s seed=%u loop=1\"", \
        __FILE__, __LINE__, msg, index, fRuleFileName, seed); \
}
74
75 //---------------------------------------------
76 // runIndexedTest
77 //---------------------------------------------
78
79
// Note: Before adding new tests to this file, check whether the desired test data can
// simply be added to the file testdata/rbbitest.txt. In most cases it can;
// it's much less work than writing a new test, diagnostic output in the event of failures
// is good, and the test data file is shared with ICU4J, so eventually the test
// will run there as well, without additional effort.
85
runIndexedTest(int32_t index,UBool exec,const char * & name,char * params)86 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
87 {
88 if (exec) logln("TestSuite RuleBasedBreakIterator: ");
89 fTestParams = params;
90
91 TESTCASE_AUTO_BEGIN;
92 #if !UCONFIG_NO_FILE_IO
93 TESTCASE_AUTO(TestBug4153072);
94 #endif
95 #if !UCONFIG_NO_FILE_IO
96 TESTCASE_AUTO(TestUnicodeFiles);
97 #endif
98 TESTCASE_AUTO(TestGetAvailableLocales);
99 TESTCASE_AUTO(TestGetDisplayName);
100 #if !UCONFIG_NO_FILE_IO
101 TESTCASE_AUTO(TestEndBehaviour);
102 TESTCASE_AUTO(TestWordBreaks);
103 TESTCASE_AUTO(TestWordBoundary);
104 TESTCASE_AUTO(TestLineBreaks);
105 TESTCASE_AUTO(TestSentBreaks);
106 TESTCASE_AUTO(TestExtended);
107 #endif
108 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
109 TESTCASE_AUTO(TestMonkey);
110 #endif
111 #if !UCONFIG_NO_FILE_IO
112 TESTCASE_AUTO(TestBug3818);
113 #endif
114 TESTCASE_AUTO(TestDebug);
115 #if !UCONFIG_NO_FILE_IO
116 TESTCASE_AUTO(TestBug5775);
117 #endif
118 TESTCASE_AUTO(TestBug9983);
119 TESTCASE_AUTO(TestDictRules);
120 TESTCASE_AUTO(TestBug5532);
121 TESTCASE_AUTO(TestBug7547);
122 TESTCASE_AUTO(TestBug12797);
123 TESTCASE_AUTO(TestBug12918);
124 TESTCASE_AUTO(TestBug12932);
125 TESTCASE_AUTO(TestEmoji);
126 TESTCASE_AUTO(TestBug12519);
127 TESTCASE_AUTO(TestBug12677);
128 TESTCASE_AUTO(TestTableRedundancies);
129 TESTCASE_AUTO(TestBug13447);
130 TESTCASE_AUTO(TestReverse);
131 TESTCASE_AUTO(TestBug13692);
132 TESTCASE_AUTO(TestDebugRules);
133 TESTCASE_AUTO(Test8BitsTrieWith8BitStateTable);
134 TESTCASE_AUTO(Test8BitsTrieWith16BitStateTable);
135 TESTCASE_AUTO(Test16BitsTrieWith8BitStateTable);
136 TESTCASE_AUTO(Test16BitsTrieWith16BitStateTable);
137 TESTCASE_AUTO(TestTable_8_16_Bits);
138 TESTCASE_AUTO(TestBug13590);
139 TESTCASE_AUTO(TestUnpairedSurrogate);
140 TESTCASE_AUTO(TestLSTMThai);
141 TESTCASE_AUTO(TestLSTMBurmese);
142
143 #if U_ENABLE_TRACING
144 TESTCASE_AUTO(TestTraceCreateCharacter);
145 TESTCASE_AUTO(TestTraceCreateWord);
146 TESTCASE_AUTO(TestTraceCreateSentence);
147 TESTCASE_AUTO(TestTraceCreateTitle);
148 TESTCASE_AUTO(TestTraceCreateLine);
149 TESTCASE_AUTO(TestTraceCreateLineNormal);
150 TESTCASE_AUTO(TestTraceCreateLineLoose);
151 TESTCASE_AUTO(TestTraceCreateLineStrict);
152 TESTCASE_AUTO(TestTraceCreateBreakEngine);
153 #endif
154
155 TESTCASE_AUTO_END;
156 }
157
158
159 //--------------------------------------------------------------------------------------
160 //
161 // RBBITest constructor and destructor
162 //
163 //--------------------------------------------------------------------------------------
164
RBBITest()165 RBBITest::RBBITest() {
166 fTestParams = NULL;
167 }
168
169
~RBBITest()170 RBBITest::~RBBITest() {
171 }
172
173
printStringBreaks(UText * tstr,int expected[],int expectedCount)174 static void printStringBreaks(UText *tstr, int expected[], int expectedCount) {
175 UErrorCode status = U_ZERO_ERROR;
176 char name[100];
177 printf("code alpha extend alphanum type word sent line name\n");
178 int nextExpectedIndex = 0;
179 utext_setNativeIndex(tstr, 0);
180 for (int j = 0; j < static_cast<int>(utext_nativeLength(tstr)); j=static_cast<int>(utext_getNativeIndex(tstr))) {
181 if (nextExpectedIndex < expectedCount && j >= expected[nextExpectedIndex] ) {
182 printf("------------------------------------------------ %d\n", j);
183 ++nextExpectedIndex;
184 }
185
186 UChar32 c = utext_next32(tstr);
187 u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
188 printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
189 u_isUAlphabetic(c),
190 u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
191 u_isalnum(c),
192 u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
193 u_charType(c),
194 U_SHORT_PROPERTY_NAME),
195 u_getPropertyValueName(UCHAR_WORD_BREAK,
196 u_getIntPropertyValue(c,
197 UCHAR_WORD_BREAK),
198 U_SHORT_PROPERTY_NAME),
199 u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
200 u_getIntPropertyValue(c,
201 UCHAR_SENTENCE_BREAK),
202 U_SHORT_PROPERTY_NAME),
203 u_getPropertyValueName(UCHAR_LINE_BREAK,
204 u_getIntPropertyValue(c,
205 UCHAR_LINE_BREAK),
206 U_SHORT_PROPERTY_NAME),
207 name);
208 }
209 }
210
211
printStringBreaks(const UnicodeString & ustr,int expected[],int expectedCount)212 static void printStringBreaks(const UnicodeString &ustr, int expected[], int expectedCount) {
213 UErrorCode status = U_ZERO_ERROR;
214 UText *tstr = NULL;
215 tstr = utext_openConstUnicodeString(NULL, &ustr, &status);
216 if (U_FAILURE(status)) {
217 printf("printStringBreaks, utext_openConstUnicodeString() returns %s\n", u_errorName(status));
218 return;
219 }
220 printStringBreaks(tstr, expected, expectedCount);
221 utext_close(tstr);
222 }
223
224
TestBug3818()225 void RBBITest::TestBug3818() {
226 UErrorCode status = U_ZERO_ERROR;
227
228 // Four Thai words...
229 static const UChar thaiWordData[] = { 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
230 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
231 UnicodeString thaiStr(thaiWordData);
232
233 BreakIterator* bi = BreakIterator::createWordInstance(Locale("th"), status);
234 if (U_FAILURE(status) || bi == NULL) {
235 errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
236 return;
237 }
238 bi->setText(thaiStr);
239
240 int32_t startOfSecondWord = bi->following(1);
241 if (startOfSecondWord != 4) {
242 errln("Fail at file %s, line %d expected start of word at 4, got %d",
243 __FILE__, __LINE__, startOfSecondWord);
244 }
245 startOfSecondWord = bi->following(0);
246 if (startOfSecondWord != 4) {
247 errln("Fail at file %s, line %d expected start of word at 4, got %d",
248 __FILE__, __LINE__, startOfSecondWord);
249 }
250 delete bi;
251 }
252
253
254 //---------------------------------------------
255 //
256 // other tests
257 //
258 //---------------------------------------------
259
TestGetAvailableLocales()260 void RBBITest::TestGetAvailableLocales()
261 {
262 int32_t locCount = 0;
263 const Locale* locList = BreakIterator::getAvailableLocales(locCount);
264
265 if (locCount == 0)
266 dataerrln("getAvailableLocales() returned an empty list!");
267 // Just make sure that it's returning good memory.
268 int32_t i;
269 for (i = 0; i < locCount; ++i) {
270 logln(locList[i].getName());
271 }
272 }
273
274 //Testing the BreakIterator::getDisplayName() function
TestGetDisplayName()275 void RBBITest::TestGetDisplayName()
276 {
277 UnicodeString result;
278
279 BreakIterator::getDisplayName(Locale::getUS(), result);
280 if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
281 dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
282 + result);
283
284 BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
285 if (result != "French (France)")
286 dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
287 + result);
288 }
289 /**
290 * Test End Behaviour
291 * @bug 4068137
292 */
TestEndBehaviour()293 void RBBITest::TestEndBehaviour()
294 {
295 UErrorCode status = U_ZERO_ERROR;
296 UnicodeString testString("boo.");
297 BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
298 if (U_FAILURE(status))
299 {
300 errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
301 return;
302 }
303 wb->setText(testString);
304
305 if (wb->first() != 0)
306 errln("Didn't get break at beginning of string.");
307 if (wb->next() != 3)
308 errln("Didn't get break before period in \"boo.\"");
309 if (wb->current() != 4 && wb->next() != 4)
310 errln("Didn't get break at end of string.");
311 delete wb;
312 }
313 /*
314 * @bug 4153072
315 */
TestBug4153072()316 void RBBITest::TestBug4153072() {
317 UErrorCode status = U_ZERO_ERROR;
318 BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
319 if (U_FAILURE(status))
320 {
321 errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
322 return;
323 }
324 UnicodeString str("...Hello, World!...");
325 int32_t begin = 3;
326 int32_t end = str.length() - 3;
327 UBool onBoundary;
328
329 StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
330 iter->adoptText(textIterator);
331 int index;
332 // Note: with the switch to UText, there is no way to restrict the
333 // iteration range to begin at an index other than zero.
334 // String character iterators created with a non-zero bound are
335 // treated by RBBI as being empty.
336 for (index = -1; index < begin + 1; ++index) {
337 onBoundary = iter->isBoundary(index);
338 if (index == 0? !onBoundary : onBoundary) {
339 errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
340 " and begin index = " + begin);
341 }
342 }
343 delete iter;
344 }
345
346
347 //
// Test for problem reported by Ashok Matoria on 9 July 2007
//    One.<kSoftHyphen><kSpace>Two.
//
//    The sentence break iterator starts at 0, and the first call to next() breaks at
//    the 'T' of "Two". If next() and then previous() are called from that point,
//    previous() breaks at <kSoftHyphen> instead of at the 'T' of "Two".
354 //
TestBug5775()355 void RBBITest::TestBug5775() {
356 UErrorCode status = U_ZERO_ERROR;
357 BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
358 TEST_ASSERT_SUCCESS(status);
359 if (U_FAILURE(status)) {
360 return;
361 }
362 // Check for status first for better handling of no data errors.
363 TEST_ASSERT(bi != NULL);
364 if (bi == NULL) {
365 return;
366 }
367
368 UnicodeString s("One.\\u00ad Two.", -1, US_INV);
369 // 01234 56789
370 s = s.unescape();
371 bi->setText(s);
372 int pos = bi->next();
373 TEST_ASSERT(pos == 6);
374 pos = bi->next();
375 TEST_ASSERT(pos == 10);
376 pos = bi->previous();
377 TEST_ASSERT(pos == 6);
378 delete bi;
379 }
380
381
382
383 //------------------------------------------------------------------------------
384 //
385 // RBBITest::Extended Run RBBI Tests from an external test data file
386 //
387 //------------------------------------------------------------------------------
388
389 struct TestParams {
390 BreakIterator *bi; // Break iterator is set while parsing test source.
391 // Changed out whenever test data changes break type.
392
393 UnicodeString dataToBreak; // Data that is built up while parsing the test.
394 UVector32 *expectedBreaks; // Expected break positions, matches dataToBreak UnicodeString.
395 UVector32 *srcLine; // Positions in source file, indexed same as dataToBreak.
396 UVector32 *srcCol;
397
398 UText *textToBreak; // UText, could be UTF8 or UTF16.
399 UVector32 *textMap; // Map from UTF-16 dataToBreak offsets to UText offsets.
400 CharString utf8String; // UTF-8 form of text to break.
401
TestParamsTestParams402 TestParams(UErrorCode &status) : dataToBreak() {
403 bi = NULL;
404 expectedBreaks = new UVector32(status);
405 srcLine = new UVector32(status);
406 srcCol = new UVector32(status);
407 textToBreak = NULL;
408 textMap = new UVector32(status);
409 }
410
~TestParamsTestParams411 ~TestParams() {
412 delete bi;
413 delete expectedBreaks;
414 delete srcLine;
415 delete srcCol;
416 utext_close(textToBreak);
417 delete textMap;
418 }
419
420 int32_t getSrcLine(int32_t bp);
421 int32_t getExpectedBreak(int32_t bp);
422 int32_t getSrcCol(int32_t bp);
423
424 void setUTF16(UErrorCode &status);
425 void setUTF8(UErrorCode &status);
426 };
427
428 // Append a UnicodeString to a CharString with UTF-8 encoding.
429 // Substitute any invalid chars.
430 // Note: this is used with test data that includes a few unpaired surrogates in the UTF-16 that will be substituted.
CharStringAppend(CharString & dest,const UnicodeString & src,UErrorCode & status)431 static void CharStringAppend(CharString &dest, const UnicodeString &src, UErrorCode &status) {
432 if (U_FAILURE(status)) {
433 return;
434 }
435 int32_t utf8Length;
436 u_strToUTF8WithSub(NULL, 0, &utf8Length, // Output Buffer, NULL for preflight.
437 src.getBuffer(), src.length(), // UTF-16 data
438 0xfffd, NULL, // Substitution char, number of subs.
439 &status);
440 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
441 return;
442 }
443 status = U_ZERO_ERROR;
444 int32_t capacity;
445 char *buffer = dest.getAppendBuffer(utf8Length, utf8Length, capacity, status);
446 u_strToUTF8WithSub(buffer, utf8Length, NULL,
447 src.getBuffer(), src.length(),
448 0xfffd, NULL, &status);
449 dest.append(buffer, utf8Length, status);
450 }
451
452
setUTF16(UErrorCode & status)453 void TestParams::setUTF16(UErrorCode &status) {
454 textToBreak = utext_openUnicodeString(textToBreak, &dataToBreak, &status);
455 textMap->removeAllElements();
456 for (int32_t i=0; i<dataToBreak.length(); i++) {
457 if (i == dataToBreak.getChar32Start(i)) {
458 textMap->addElement(i, status);
459 } else {
460 textMap->addElement(-1, status);
461 }
462 }
463 textMap->addElement(dataToBreak.length(), status);
464 U_ASSERT(dataToBreak.length() + 1 == textMap->size());
465 }
466
467
setUTF8(UErrorCode & status)468 void TestParams::setUTF8(UErrorCode &status) {
469 if (U_FAILURE(status)) {
470 return;
471 }
472 utf8String.clear();
473 CharStringAppend(utf8String, dataToBreak, status);
474 textToBreak = utext_openUTF8(textToBreak, utf8String.data(), utf8String.length(), &status);
475 if (U_FAILURE(status)) {
476 return;
477 }
478
479 textMap->removeAllElements();
480 int32_t utf16Index = 0;
481 for (;;) {
482 textMap->addElement(utf16Index, status);
483 UChar32 c32 = utext_current32(textToBreak);
484 if (c32 < 0) {
485 break;
486 }
487 utf16Index += U16_LENGTH(c32);
488 utext_next32(textToBreak);
489 while (textMap->size() < utext_getNativeIndex(textToBreak)) {
490 textMap->addElement(-1, status);
491 }
492 }
493 U_ASSERT(utext_nativeLength(textToBreak) + 1 == textMap->size());
494 }
495
496
getSrcLine(int32_t bp)497 int32_t TestParams::getSrcLine(int32_t bp) {
498 if (bp >= textMap->size()) {
499 bp = textMap->size() - 1;
500 }
501 int32_t i = 0;
502 for(; bp >= 0 ; --bp) {
503 // Move to a character boundary if we are not on one already.
504 i = textMap->elementAti(bp);
505 if (i >= 0) {
506 break;
507 }
508 }
509 return srcLine->elementAti(i);
510 }
511
512
getExpectedBreak(int32_t bp)513 int32_t TestParams::getExpectedBreak(int32_t bp) {
514 if (bp >= textMap->size()) {
515 return 0;
516 }
517 int32_t i = textMap->elementAti(bp);
518 int32_t retVal = 0;
519 if (i >= 0) {
520 retVal = expectedBreaks->elementAti(i);
521 }
522 return retVal;
523 }
524
525
getSrcCol(int32_t bp)526 int32_t TestParams::getSrcCol(int32_t bp) {
527 if (bp >= textMap->size()) {
528 bp = textMap->size() - 1;
529 }
530 int32_t i = 0;
531 for(; bp >= 0; --bp) {
532 // Move bp to a character boundary if we are not on one already.
533 i = textMap->elementAti(bp);
534 if (i >= 0) {
535 break;
536 }
537 }
538 return srcCol->elementAti(i);
539 }
540
541
executeTest(TestParams * t,UErrorCode & status)542 void RBBITest::executeTest(TestParams *t, UErrorCode &status) {
543 int32_t bp;
544 int32_t prevBP;
545 int32_t i;
546
547 TEST_ASSERT_SUCCESS(status);
548 if (U_FAILURE(status)) {
549 return;
550 }
551
552 if (t->bi == NULL) {
553 return;
554 }
555
556 t->bi->setText(t->textToBreak, status);
557 //
558 // Run the iterator forward
559 //
560 prevBP = -1;
561 for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
562 if (prevBP == bp) {
563 // Fail for lack of forward progress.
564 errln("Forward Iteration, no forward progress. Break Pos=%4d File line,col=%4d,%4d",
565 bp, t->getSrcLine(bp), t->getSrcCol(bp));
566 break;
567 }
568
569 // Check that there we didn't miss an expected break between the last one
570 // and this one.
571 for (i=prevBP+1; i<bp; i++) {
572 if (t->getExpectedBreak(i) != 0) {
573 int expected[] = {0, i};
574 printStringBreaks(t->dataToBreak, expected, 2);
575 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
576 i, t->getSrcLine(i), t->getSrcCol(i));
577 }
578 }
579
580 // Check that the break we did find was expected
581 if (t->getExpectedBreak(bp) == 0) {
582 int expected[] = {0, bp};
583 printStringBreaks(t->textToBreak, expected, 2);
584 errln("Forward Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
585 bp, t->getSrcLine(bp), t->getSrcCol(bp));
586 } else {
587 // The break was expected.
588 // Check that the {nnn} tag value is correct.
589 int32_t expectedTagVal = t->getExpectedBreak(bp);
590 if (expectedTagVal == -1) {
591 expectedTagVal = 0;
592 }
593 int32_t line = t->getSrcLine(bp);
594 int32_t rs = t->bi->getRuleStatus();
595 if (rs != expectedTagVal) {
596 errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n"
597 " Actual, Expected status = %4d, %4d",
598 bp, line, t->getSrcCol(bp), rs, expectedTagVal);
599 }
600 }
601
602 prevBP = bp;
603 }
604
605 // Verify that there were no missed expected breaks after the last one found
606 for (i=prevBP+1; i<utext_nativeLength(t->textToBreak); i++) {
607 if (t->getExpectedBreak(i) != 0) {
608 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
609 i, t->getSrcLine(i), t->getSrcCol(i));
610 }
611 }
612
613 //
614 // Run the iterator backwards, verify that the same breaks are found.
615 //
616 prevBP = static_cast<int32_t>(utext_nativeLength(t->textToBreak) + 2); // start with a phony value for the last break pos seen.
617 bp = t->bi->last();
618 while (bp != BreakIterator::DONE) {
619 if (prevBP == bp) {
620 // Fail for lack of progress.
621 errln("Reverse Iteration, no progress. Break Pos=%4d File line,col=%4d,%4d",
622 bp, t->getSrcLine(bp), t->getSrcCol(bp));
623 break;
624 }
625
626 // Check that we didn't miss an expected break between the last one
627 // and this one. (UVector returns zeros for index out of bounds.)
628 for (i=prevBP-1; i>bp; i--) {
629 if (t->getExpectedBreak(i) != 0) {
630 errln("Reverse Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
631 i, t->getSrcLine(i), t->getSrcCol(i));
632 }
633 }
634
635 // Check that the break we did find was expected
636 if (t->getExpectedBreak(bp) == 0) {
637 errln("Reverse Itertion, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
638 bp, t->getSrcLine(bp), t->getSrcCol(bp));
639 } else {
640 // The break was expected.
641 // Check that the {nnn} tag value is correct.
642 int32_t expectedTagVal = t->getExpectedBreak(bp);
643 if (expectedTagVal == -1) {
644 expectedTagVal = 0;
645 }
646 int line = t->getSrcLine(bp);
647 int32_t rs = t->bi->getRuleStatus();
648 if (rs != expectedTagVal) {
649 errln("Incorrect status for reverse break. Pos=%4d File line,col= %4d,%4d.\n"
650 " Actual, Expected status = %4d, %4d",
651 bp, line, t->getSrcCol(bp), rs, expectedTagVal);
652 }
653 }
654
655 prevBP = bp;
656 bp = t->bi->previous();
657 }
658
659 // Verify that there were no missed breaks prior to the last one found
660 for (i=prevBP-1; i>=0; i--) {
661 if (t->getExpectedBreak(i) != 0) {
662 errln("Forward Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
663 i, t->getSrcLine(i), t->getSrcCol(i));
664 }
665 }
666
667 // Check isBoundary()
668 for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
669 UBool boundaryExpected = (t->getExpectedBreak(i) != 0);
670 UBool boundaryFound = t->bi->isBoundary(i);
671 if (boundaryExpected != boundaryFound) {
672 errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
673 " Expected, Actual= %s, %s",
674 i, t->getSrcLine(i), t->getSrcCol(i),
675 boundaryExpected ? "true":"false", boundaryFound? "true" : "false");
676 }
677 }
678
679 // Check following()
680 for (i=0; i < static_cast<int32_t>(utext_nativeLength(t->textToBreak)); i++) {
681 int32_t actualBreak = t->bi->following(i);
682 int32_t expectedBreak = BreakIterator::DONE;
683 for (int32_t j=i+1; j <= static_cast<int32_t>(utext_nativeLength(t->textToBreak)); j++) {
684 if (t->getExpectedBreak(j) != 0) {
685 expectedBreak = j;
686 break;
687 }
688 }
689 if (expectedBreak != actualBreak) {
690 errln("following(%d) incorrect. File line,col= %4d,%4d\n"
691 " Expected, Actual= %d, %d",
692 i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
693 }
694 }
695
696 // Check preceding()
697 for (i=static_cast<int32_t>(utext_nativeLength(t->textToBreak)); i>=0; i--) {
698 int32_t actualBreak = t->bi->preceding(i);
699 int32_t expectedBreak = BreakIterator::DONE;
700
701 // For UTF-8 & UTF-16 supplementals, all code units of a character are equivalent.
702 // preceding(trailing byte) will return the index of some preceding code point,
703 // not the lead byte of the current code point, even though that has a smaller index.
704 // Therefore, start looking at the expected break data not at i-1, but at
705 // the start of code point index - 1.
706 utext_setNativeIndex(t->textToBreak, i);
707 int32_t j = static_cast<int32_t>(utext_getNativeIndex(t->textToBreak) - 1);
708 for (; j >= 0; j--) {
709 if (t->getExpectedBreak(j) != 0) {
710 expectedBreak = j;
711 break;
712 }
713 }
714 if (expectedBreak != actualBreak) {
715 errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
716 " Expected, Actual= %d, %d",
717 i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
718 }
719 }
720 }
721
TestExtended()722 void RBBITest::TestExtended() {
723 // The expectations in this test heavily depends on the Thai dictionary.
724 // Therefore, we skip this test under the LSTM configuration.
725 if (skipDictionaryTest()) {
726 return;
727 }
728 // Skip test for now when UCONFIG_NO_FILTERED_BREAK_ITERATION is set. This
729 // data driven test closely entangles filtered and regular data.
730 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILTERED_BREAK_ITERATION
731 UErrorCode status = U_ZERO_ERROR;
732 Locale locale("");
733
734 TestParams tp(status);
735
736 RegexMatcher localeMatcher(UnicodeString(u"<locale *([\\p{L}\\p{Nd}_@&=-]*) *>"), 0, status);
737 if (U_FAILURE(status)) {
738 dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
739 }
740
741 //
742 // Open and read the test data file.
743 //
744 const char *testDataDirectory = IntlTest::getSourceTestData(status);
745 CharString testFileName(testDataDirectory, -1, status);
746 testFileName.append("rbbitst.txt", -1, status);
747
748 int len;
749 UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
750 if (U_FAILURE(status)) {
751 errln("%s:%d Error %s opening file rbbitst.txt", __FILE__, __LINE__, u_errorName(status));
752 return;
753 }
754
755 bool skipTest = false; // Skip this test?
756
757 //
758 // Put the test data into a UnicodeString
759 //
760 UnicodeString testString(FALSE, testFile, len);
761
762 enum EParseState{
763 PARSE_COMMENT,
764 PARSE_TAG,
765 PARSE_DATA,
766 PARSE_NUM,
767 PARSE_RULES
768 }
769 parseState = PARSE_TAG;
770
771 EParseState savedState = PARSE_TAG;
772
773 int32_t lineNum = 1;
774 int32_t colStart = 0;
775 int32_t column = 0;
776 int32_t charIdx = 0;
777
778 int32_t tagValue = 0; // The numeric value of a <nnn> tag.
779
780 UnicodeString rules; // Holds rules from a <rules> ... </rules> block
781 int32_t rulesFirstLine = 0; // Line number of the start of current <rules> block
782
783 for (charIdx = 0; charIdx < len; ) {
784 status = U_ZERO_ERROR;
785 UChar c = testString.charAt(charIdx);
786 charIdx++;
787 if (c == u'\r' && charIdx<len && testString.charAt(charIdx) == u'\n') {
788 // treat CRLF as a unit
789 c = u'\n';
790 charIdx++;
791 }
792 if (c == u'\n' || c == u'\r') {
793 lineNum++;
794 colStart = charIdx;
795 }
796 column = charIdx - colStart + 1;
797
798 switch (parseState) {
799 case PARSE_COMMENT:
800 if (c == u'\n' || c == u'\r') {
801 parseState = savedState;
802 }
803 break;
804
805 case PARSE_TAG:
806 {
807 if (c == u'#') {
808 parseState = PARSE_COMMENT;
809 savedState = PARSE_TAG;
810 break;
811 }
812 if (u_isUWhiteSpace(c)) {
813 break;
814 }
815 if (testString.compare(charIdx-1, 6, u"<word>") == 0) {
816 delete tp.bi;
817 tp.bi = BreakIterator::createWordInstance(locale, status);
818 skipTest = false;
819 charIdx += 5;
820 break;
821 }
822 if (testString.compare(charIdx-1, 6, u"<char>") == 0) {
823 delete tp.bi;
824 tp.bi = BreakIterator::createCharacterInstance(locale, status);
825 skipTest = false;
826 charIdx += 5;
827 break;
828 }
829 if (testString.compare(charIdx-1, 6, u"<line>") == 0) {
830 delete tp.bi;
831 tp.bi = BreakIterator::createLineInstance(locale, status);
832 skipTest = false;
833 charIdx += 5;
834 break;
835 }
836 if (testString.compare(charIdx-1, 6, u"<sent>") == 0) {
837 delete tp.bi;
838 tp.bi = BreakIterator::createSentenceInstance(locale, status);
839 skipTest = false;
840 charIdx += 5;
841 break;
842 }
843 if (testString.compare(charIdx-1, 7, u"<title>") == 0) {
844 delete tp.bi;
845 tp.bi = BreakIterator::createTitleInstance(locale, status);
846 charIdx += 6;
847 break;
848 }
849
850 if (testString.compare(charIdx-1, 7, u"<rules>") == 0 ||
851 testString.compare(charIdx-1, 10, u"<badrules>") == 0) {
852 charIdx = testString.indexOf(u'>', charIdx) + 1;
853 parseState = PARSE_RULES;
854 rules.remove();
855 rulesFirstLine = lineNum;
856 break;
857 }
858
859 // <locale loc_name>
860 localeMatcher.reset(testString);
861 if (localeMatcher.lookingAt(charIdx-1, status)) {
862 UnicodeString localeName = localeMatcher.group(1, status);
863 char localeName8[100];
864 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
865 locale = Locale::createFromName(localeName8);
866 charIdx += localeMatcher.group(0, status).length() - 1;
867 TEST_ASSERT_SUCCESS(status);
868 break;
869 }
870 if (testString.compare(charIdx-1, 6, u"<data>") == 0) {
871 parseState = PARSE_DATA;
872 charIdx += 5;
873 tp.dataToBreak = "";
874 tp.expectedBreaks->removeAllElements();
875 tp.srcCol ->removeAllElements();
876 tp.srcLine->removeAllElements();
877 break;
878 }
879
880 errln("line %d: Tag expected in test file.", lineNum);
881 parseState = PARSE_COMMENT;
882 savedState = PARSE_DATA;
883 goto end_test; // Stop the test.
884 }
885 break;
886
887 case PARSE_RULES:
888 if (testString.compare(charIdx-1, 8, u"</rules>") == 0) {
889 charIdx += 7;
890 parseState = PARSE_TAG;
891 delete tp.bi;
892 UParseError pe;
893 tp.bi = new RuleBasedBreakIterator(rules, pe, status);
894 skipTest = U_FAILURE(status);
895 if (U_FAILURE(status)) {
896 errln("file rbbitst.txt: %d - Error %s creating break iterator from rules.",
897 rulesFirstLine + pe.line - 1, u_errorName(status));
898 }
899 } else if (testString.compare(charIdx-1, 11, u"</badrules>") == 0) {
900 charIdx += 10;
901 parseState = PARSE_TAG;
902 UErrorCode ec = U_ZERO_ERROR;
903 UParseError pe;
904 RuleBasedBreakIterator bi(rules, pe, ec);
905 if (U_SUCCESS(ec)) {
906 errln("file rbbitst.txt: %d - Expected, but did not get, a failure creating break iterator from rules.",
907 rulesFirstLine + pe.line - 1);
908 }
909 } else {
910 rules.append(c);
911 }
912 break;
913
914 case PARSE_DATA:
915 if (c == u'•') {
916 int32_t breakIdx = tp.dataToBreak.length();
917 if (tp.expectedBreaks->size() > breakIdx) {
918 errln("rbbitst.txt:%d:%d adjacent expected breaks with no intervening test text",
919 lineNum, column);
920 }
921 tp.expectedBreaks->setSize(breakIdx+1);
922 tp.expectedBreaks->setElementAt(-1, breakIdx);
923 tp.srcLine->setSize(breakIdx+1);
924 tp.srcLine->setElementAt(lineNum, breakIdx);
925 tp.srcCol ->setSize(breakIdx+1);
926 tp.srcCol ->setElementAt(column, breakIdx);
927 break;
928 }
929
930 if (testString.compare(charIdx-1, 7, u"</data>") == 0) {
931 // Add final entry to mappings from break location to source file position.
932 // Need one extra because last break position returned is after the
933 // last char in the data, not at the last char.
934 tp.srcLine->addElement(lineNum, status);
935 tp.srcCol ->addElement(column, status);
936
937 parseState = PARSE_TAG;
938 charIdx += 6;
939
940 if (!skipTest) {
941 // RUN THE TEST!
942 status = U_ZERO_ERROR;
943 tp.setUTF16(status);
944 executeTest(&tp, status);
945 TEST_ASSERT_SUCCESS(status);
946
947 // Run again, this time with UTF-8 text wrapped in a UText.
948 status = U_ZERO_ERROR;
949 tp.setUTF8(status);
950 TEST_ASSERT_SUCCESS(status);
951 executeTest(&tp, status);
952 }
953 break;
954 }
955
956 if (testString.compare(charIdx-1, 3, u"\\N{") == 0) {
957 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
958 // Get the code point from the name and insert it into the test data.
959 // (Damn, no API takes names in Unicode !!!
960 // we've got to take it back to char *)
961 int32_t nameEndIdx = testString.indexOf(u'}', charIdx);
962 int32_t nameLength = nameEndIdx - (charIdx+2);
963 char charNameBuf[200];
964 UChar32 theChar = -1;
965 if (nameEndIdx != -1) {
966 UErrorCode status = U_ZERO_ERROR;
967 testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
968 charNameBuf[sizeof(charNameBuf)-1] = 0;
969 theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
970 if (U_FAILURE(status)) {
971 theChar = -1;
972 }
973 }
974 if (theChar == -1) {
975 errln("Error in named character in test file at line %d, col %d",
976 lineNum, column);
977 } else {
978 // Named code point was recognized. Insert it
979 // into the test data.
980 tp.dataToBreak.append(theChar);
981 while (tp.dataToBreak.length() > tp.srcLine->size()) {
982 tp.srcLine->addElement(lineNum, status);
983 tp.srcCol ->addElement(column, status);
984 }
985 }
986 if (nameEndIdx > charIdx) {
987 charIdx = nameEndIdx+1;
988
989 }
990 break;
991 }
992
993
994
995 if (testString.compare(charIdx-1, 2, u"<>") == 0) {
996 charIdx++;
997 int32_t breakIdx = tp.dataToBreak.length();
998 tp.expectedBreaks->setSize(breakIdx+1);
999 tp.expectedBreaks->setElementAt(-1, breakIdx);
1000 tp.srcLine->setSize(breakIdx+1);
1001 tp.srcLine->setElementAt(lineNum, breakIdx);
1002 tp.srcCol ->setSize(breakIdx+1);
1003 tp.srcCol ->setElementAt(column, breakIdx);
1004 break;
1005 }
1006
1007 if (c == u'<') {
1008 tagValue = 0;
1009 parseState = PARSE_NUM;
1010 break;
1011 }
1012
1013 if (c == u'#' && column==3) { // TODO: why is column off so far?
1014 parseState = PARSE_COMMENT;
1015 savedState = PARSE_DATA;
1016 break;
1017 }
1018
1019 if (c == u'\\') {
1020 // Check for \ at end of line, a line continuation.
1021 // Advance over (discard) the newline
1022 UChar32 cp = testString.char32At(charIdx);
1023 if (cp == u'\r' && charIdx<len && testString.charAt(charIdx+1) == u'\n') {
1024 // We have a CR LF
1025 // Need an extra increment of the input ptr to move over both of them
1026 charIdx++;
1027 }
1028 if (cp == u'\n' || cp == u'\r') {
1029 lineNum++;
1030 colStart = charIdx;
1031 charIdx++;
1032 break;
1033 }
1034
1035 // Let unescape handle the back slash.
1036 cp = testString.unescapeAt(charIdx);
1037 if (cp != -1) {
1038 // Escape sequence was recognized. Insert the char
1039 // into the test data.
1040 tp.dataToBreak.append(cp);
1041 while (tp.dataToBreak.length() > tp.srcLine->size()) {
1042 tp.srcLine->addElement(lineNum, status);
1043 tp.srcCol ->addElement(column, status);
1044 }
1045 break;
1046 }
1047
1048
1049 // Not a recognized backslash escape sequence.
1050 // Take the next char as a literal.
1051 // TODO: Should this be an error?
1052 c = testString.charAt(charIdx);
1053 charIdx = testString.moveIndex32(charIdx, 1);
1054 }
1055
1056 // Normal, non-escaped data char.
1057 tp.dataToBreak.append(c);
1058
1059 // Save the mapping from offset in the data to line/column numbers in
1060 // the original input file. Will be used for better error messages only.
1061 // If there's an expected break before this char, the slot in the mapping
1062 // vector will already be set for this char; don't overwrite it.
1063 if (tp.dataToBreak.length() > tp.srcLine->size()) {
1064 tp.srcLine->addElement(lineNum, status);
1065 tp.srcCol ->addElement(column, status);
1066 }
1067 break;
1068
1069
1070 case PARSE_NUM:
1071 // We are parsing an expected numeric tag value, like <1234>,
1072 // within a chunk of data.
1073 if (u_isUWhiteSpace(c)) {
1074 break;
1075 }
1076
1077 if (c == u'>') {
1078 // Finished the number. Add the info to the expected break data,
1079 // and switch parse state back to doing plain data.
1080 parseState = PARSE_DATA;
1081 if (tagValue == 0) {
1082 tagValue = -1;
1083 }
1084 int32_t breakIdx = tp.dataToBreak.length();
1085 if (tp.expectedBreaks->size() > breakIdx) {
1086 errln("rbbitst.txt:%d:%d adjacent expected breaks with no intervening test text",
1087 lineNum, column);
1088 }
1089 tp.expectedBreaks->setSize(breakIdx+1);
1090 tp.expectedBreaks->setElementAt(tagValue, breakIdx);
1091 tp.srcLine->setSize(breakIdx+1);
1092 tp.srcLine->setElementAt(lineNum, breakIdx);
1093 tp.srcCol ->setSize(breakIdx+1);
1094 tp.srcCol ->setElementAt(column, breakIdx);
1095 break;
1096 }
1097
1098 if (u_isdigit(c)) {
1099 tagValue = tagValue*10 + u_charDigitValue(c);
1100 break;
1101 }
1102
1103 errln("Syntax Error in test file at line %d, col %d",
1104 lineNum, column);
1105 parseState = PARSE_COMMENT;
1106 goto end_test; // Stop the test
1107 break;
1108 }
1109
1110
1111 if (U_FAILURE(status)) {
1112 dataerrln("ICU Error %s while parsing test file at line %d.",
1113 u_errorName(status), lineNum);
1114 status = U_ZERO_ERROR;
1115 goto end_test; // Stop the test
1116 }
1117
1118 }
1119
1120 // Reached end of test file. Raise an error if parseState indicates that we are
1121 // within a block that should have been terminated.
1122
1123 if (parseState == PARSE_RULES) {
1124 errln("rbbitst.txt:%d <rules> block beginning at line %d is not closed.",
1125 lineNum, rulesFirstLine);
1126 }
1127 if (parseState == PARSE_DATA) {
1128 errln("rbbitst.txt:%d <data> block not closed.", lineNum);
1129 }
1130
1131
1132 end_test:
1133 delete [] testFile;
1134 #endif
1135 }
1136
1137 //-------------------------------------------------------------------------------
1138 //
// TestDictRules   Create a break iterator from source rules that include a
//                 dictionary range.  Regression test for bug #7130.  Source rules
//                 do not declare a break iterator type (word, line, sentence, etc.),
//                 and the dictionary code, without a type, would loop.
1143 //
1144 //-------------------------------------------------------------------------------
TestDictRules()1145 void RBBITest::TestDictRules() {
1146 const char *rules = "$dictionary = [a-z]; \n"
1147 "!!forward; \n"
1148 "$dictionary $dictionary; \n"
1149 "!!reverse; \n"
1150 "$dictionary $dictionary; \n";
1151 const char *text = "aa";
1152 UErrorCode status = U_ZERO_ERROR;
1153 UParseError parseError;
1154
1155 RuleBasedBreakIterator bi(rules, parseError, status);
1156 if (U_SUCCESS(status)) {
1157 UnicodeString utext = text;
1158 bi.setText(utext);
1159 int32_t position;
1160 int32_t loops;
1161 for (loops = 0; loops<10; loops++) {
1162 position = bi.next();
1163 if (position == RuleBasedBreakIterator::DONE) {
1164 break;
1165 }
1166 }
1167 TEST_ASSERT(loops == 1);
1168 } else {
1169 dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
1170 }
1171 }
1172
1173
1174
1175 //--------------------------------------------------------------------------------------------
1176 //
1177 // Run tests from each of the boundary test data files distributed by the Unicode Consortium
1178 //
1179 //-------------------------------------------------------------------------------------------
TestUnicodeFiles()1180 void RBBITest::TestUnicodeFiles() {
1181 RuleBasedBreakIterator *bi;
1182 UErrorCode status = U_ZERO_ERROR;
1183
1184 bi = (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
1185 TEST_ASSERT_SUCCESS(status);
1186 if (U_SUCCESS(status)) {
1187 runUnicodeTestData("GraphemeBreakTest.txt", bi);
1188 }
1189 delete bi;
1190
1191 bi = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
1192 TEST_ASSERT_SUCCESS(status);
1193 if (U_SUCCESS(status)) {
1194 runUnicodeTestData("WordBreakTest.txt", bi);
1195 }
1196 delete bi;
1197
1198 bi = (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
1199 TEST_ASSERT_SUCCESS(status);
1200 if (U_SUCCESS(status)) {
1201 runUnicodeTestData("SentenceBreakTest.txt", bi);
1202 }
1203 delete bi;
1204
1205 bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
1206 TEST_ASSERT_SUCCESS(status);
1207 if (U_SUCCESS(status)) {
1208 runUnicodeTestData("LineBreakTest.txt", bi);
1209 }
1210 delete bi;
1211 }
1212
1213
1214 // Check for test cases from the Unicode test data files that are known to fail
1215 // and should be skipped as known issues because ICU does not fully implement
1216 // the Unicode specifications, or because ICU includes tailorings that differ from
1217 // the Unicode standard.
1218 //
1219 // Test cases are identified by the test data sequence, which tends to be more stable
1220 // across Unicode versions than the test file line numbers.
1221 //
1222 // The test case with ticket "10666" is a dummy, included as an example.
1223
testCaseIsKnownIssue(const UnicodeString & testCase,const char * fileName)1224 UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char *fileName) {
1225 static struct TestCase {
1226 const char *fTicketNum;
1227 const char *fFileName;
1228 const UChar *fString;
1229 } badTestCases[] = {
1230 {"10666", "GraphemeBreakTest.txt", u"\u0020\u0020\u0033"}, // Fake example, for illustration.
1231 // The following tests were originally for
1232 // Issue 8151, move the Finnish tailoring of the line break of hyphens to root.
1233 // However, that ticket has been closed as fixed but these tests still fail, so
1234 // ICU-21097 has been created to investigate and address these remaining issues.
1235 {"21097", "LineBreakTest.txt", u"-#"},
1236 {"21097", "LineBreakTest.txt", u"\u002d\u0308\u0023"},
1237 {"21097", "LineBreakTest.txt", u"\u002d\u00a7"},
1238 {"21097", "LineBreakTest.txt", u"\u002d\u0308\u00a7"},
1239 {"21097", "LineBreakTest.txt", u"\u002d\U00050005"},
1240 {"21097", "LineBreakTest.txt", u"\u002d\u0308\U00050005"},
1241 {"21097", "LineBreakTest.txt", u"\u002d\u0e01"},
1242 {"21097", "LineBreakTest.txt", u"\u002d\u0308\u0e01"},
1243
1244 // The following tests were originally for
1245 // Issue ICU-12017 Improve line break around numbers.
1246 // However, that ticket has been closed as fixed but these tests still fail, so
1247 // ICU-21097 has been created to investigate and address these remaining issues.
1248 {"21097", "LineBreakTest.txt", u"\u002C\u0030"}, // ",0"
1249 {"21097", "LineBreakTest.txt", u"\u002C\u0308\u0030"},
1250 {"21097", "LineBreakTest.txt", u"equals .35 cents"},
1251 {"21097", "LineBreakTest.txt", u"a.2 "},
1252 {"21097", "LineBreakTest.txt", u"a.2 \u0915"},
1253 {"21097", "LineBreakTest.txt", u"a.2 \u672C"},
1254 {"21097", "LineBreakTest.txt", u"a.2\u3000\u672C"},
1255 {"21097", "LineBreakTest.txt", u"a.2\u3000\u307E"},
1256 {"21097", "LineBreakTest.txt", u"a.2\u3000\u0033"},
1257 {"21097", "LineBreakTest.txt", u"A.1 \uBABB"},
1258 {"21097", "LineBreakTest.txt", u"\uBD24\uC5B4\u002E\u0020\u0041\u002E\u0032\u0020\uBCFC"},
1259 {"21097", "LineBreakTest.txt", u"\uBD10\uC694\u002E\u0020\u0041\u002E\u0033\u0020\uBABB"},
1260 {"21097", "LineBreakTest.txt", u"\uC694\u002E\u0020\u0041\u002E\u0034\u0020\uBABB"},
1261 {"21097", "LineBreakTest.txt", u"a.2\u3000\u300C"},
1262 };
1263
1264 for (int n=0; n<UPRV_LENGTHOF(badTestCases); n++) {
1265 const TestCase &badCase = badTestCases[n];
1266 if (!strcmp(fileName, badCase.fFileName) &&
1267 testCase == UnicodeString(badCase.fString)) {
1268 return logKnownIssue(badCase.fTicketNum);
1269 }
1270 }
1271 return FALSE;
1272 }
1273
1274
1275 //--------------------------------------------------------------------------------------------
1276 //
1277 // Run tests from one of the boundary test data files distributed by the Unicode Consortium
1278 //
1279 //-------------------------------------------------------------------------------------------
runUnicodeTestData(const char * fileName,RuleBasedBreakIterator * bi)1280 void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
1281 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1282 UErrorCode status = U_ZERO_ERROR;
1283
1284 //
1285 // Open and read the test data file, put it into a UnicodeString.
1286 //
1287 const char *testDataDirectory = IntlTest::getSourceTestData(status);
1288 char testFileName[1000];
1289 if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1290 dataerrln("Can't open test data. Path too long.");
1291 return;
1292 }
1293 strcpy(testFileName, testDataDirectory);
1294 strcat(testFileName, fileName);
1295
1296 logln("Opening data file %s\n", fileName);
1297
1298 int len;
1299 UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1300 if (status != U_FILE_ACCESS_ERROR) {
1301 TEST_ASSERT_SUCCESS(status);
1302 TEST_ASSERT(testFile != NULL);
1303 }
1304 if (U_FAILURE(status) || testFile == NULL) {
1305 return; /* something went wrong, error already output */
1306 }
1307 UnicodeString testFileAsString(TRUE, testFile, len);
1308
1309 //
1310 // Parse the test data file using a regular expression.
1311 // Each kind of token is recognized in its own capture group; what type of item was scanned
1312 // is identified by which group had a match.
1313 //
1314 // Capture Group # 1 2 3 4 5
1315 // Parses this item: divide x hex digits comment \n unrecognized \n
1316 //
1317 UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
1318 RegexMatcher tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
1319 UnicodeString testString;
1320 UVector32 breakPositions(status);
1321 int lineNumber = 1;
1322 TEST_ASSERT_SUCCESS(status);
1323 if (U_FAILURE(status)) {
1324 return;
1325 }
1326
1327 //
1328 // Scan through each test case, building up the string to be broken in testString,
1329 // and the positions that should be boundaries in the breakPositions vector.
1330 //
1331 int spin = 0;
1332 while (tokenMatcher.find()) {
1333 if(tokenMatcher.hitEnd()) {
1334 /* Shouldn't Happen(TM). This means we didn't find the symbols we were looking for.
1335 This occurred when the text file was corrupt (wasn't marked as UTF-8)
1336 and caused an infinite loop here on EBCDIC systems!
1337 */
1338 fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
1339 // return;
1340 }
1341 if (tokenMatcher.start(1, status) >= 0) {
1342 // Scanned a divide sign, indicating a break position in the test data.
1343 if (testString.length()>0) {
1344 breakPositions.addElement(testString.length(), status);
1345 }
1346 }
1347 else if (tokenMatcher.start(2, status) >= 0) {
1348 // Scanned an 'x', meaning no break at this position in the test data
1349 // Nothing to be done here.
1350 }
1351 else if (tokenMatcher.start(3, status) >= 0) {
1352 // Scanned Hex digits. Convert them to binary, append to the character data string.
1353 const UnicodeString &hexNumber = tokenMatcher.group(3, status);
1354 int length = hexNumber.length();
1355 if (length<=8) {
1356 char buf[10];
1357 hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
1358 UChar32 c = (UChar32)strtol(buf, NULL, 16);
1359 if (c<=0x10ffff) {
1360 testString.append(c);
1361 } else {
1362 errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
1363 fileName, lineNumber);
1364 }
1365 } else {
1366 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
1367 fileName, lineNumber);
1368 }
1369 }
1370 else if (tokenMatcher.start(4, status) >= 0) {
1371 // Scanned to end of a line, possibly skipping over a comment in the process.
1372 // If the line from the file contained test data, run the test now.
1373 if (testString.length() > 0 && !testCaseIsKnownIssue(testString, fileName)) {
1374 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
1375 }
1376
1377 // Clear out this test case.
1378 // The string and breakPositions vector will be refilled as the next
1379 // test case is parsed.
1380 testString.remove();
1381 breakPositions.removeAllElements();
1382 lineNumber++;
1383 } else {
1384 // Scanner catchall. Something unrecognized appeared on the line.
1385 char token[16];
1386 UnicodeString uToken = tokenMatcher.group(0, status);
1387 uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
1388 token[sizeof(token)-1] = 0;
1389 errln("Syntax error in test data file \'%s\', line %d. Scanning \"%s\"\n", fileName, lineNumber, token);
1390
1391 // Clean up, in preparation for continuing with the next line.
1392 testString.remove();
1393 breakPositions.removeAllElements();
1394 lineNumber++;
1395 }
1396 TEST_ASSERT_SUCCESS(status);
1397 if (U_FAILURE(status)) {
1398 break;
1399 }
1400 }
1401
1402 delete [] testFile;
1403 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
1404 }
1405
1406 //--------------------------------------------------------------------------------------------
1407 //
1408 // checkUnicodeTestCase() Run one test case from one of the Unicode Consortium
1409 // test data files. Do only a simple, forward-only check -
1410 // this test is mostly to check that ICU and the Unicode
1411 // data agree with each other.
1412 //
1413 //--------------------------------------------------------------------------------------------
checkUnicodeTestCase(const char * testFileName,int lineNumber,const UnicodeString & testString,UVector32 * breakPositions,RuleBasedBreakIterator * bi)1414 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
1415 const UnicodeString &testString, // Text data to be broken
1416 UVector32 *breakPositions, // Positions where breaks should be found.
1417 RuleBasedBreakIterator *bi) {
1418 int32_t pos; // Break Position in the test string
1419 int32_t expectedI = 0; // Index of expected break position in the vector of expected results.
1420 int32_t expectedPos; // Expected break position (index into test string)
1421
1422 bi->setText(testString);
1423 pos = bi->first();
1424 pos = bi->next();
1425
1426 while (pos != BreakIterator::DONE) {
1427 if (expectedI >= breakPositions->size()) {
1428 errln("Test file \"%s\", line %d, unexpected break found at position %d",
1429 testFileName, lineNumber, pos);
1430 break;
1431 }
1432 expectedPos = breakPositions->elementAti(expectedI);
1433 if (pos < expectedPos) {
1434 errln("Test file \"%s\", line %d, unexpected break found at position %d",
1435 testFileName, lineNumber, pos);
1436 break;
1437 }
1438 if (pos > expectedPos) {
1439 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1440 testFileName, lineNumber, expectedPos);
1441 break;
1442 }
1443 pos = bi->next();
1444 expectedI++;
1445 }
1446
1447 if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
1448 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1449 testFileName, lineNumber, breakPositions->elementAti(expectedI));
1450 }
1451 }
1452
1453
1454
1455 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1456 //---------------------------------------------------------------------------------------
1457 //
1458 // class RBBIMonkeyKind
1459 //
1460 // Monkey Test for Break Iteration
1461 // Abstract interface class. Concrete derived classes independently
1462 // implement the break rules for different iterator types.
1463 //
//      The Monkey Test itself doesn't know which type of break iterator it is
//      testing, but works purely in terms of the interface defined here.
1466 //
1467 //---------------------------------------------------------------------------------------
1468 class RBBIMonkeyKind {
1469 public:
1470 // Return a UVector of UnicodeSets, representing the character classes used
1471 // for this type of iterator.
1472 virtual UVector *charClasses() = 0;
1473
1474 // Set the test text on which subsequent calls to next() will operate
1475 virtual void setText(const UnicodeString &s) = 0;
1476
1477 // Find the next break position, starting from the prev break position, or from zero.
1478 // Return -1 after reaching end of string.
1479 virtual int32_t next(int32_t i) = 0;
1480
1481 // Name of each character class, parallel with charClasses. Used for debugging output
1482 // of characters.
1483 virtual std::vector<std::string>& characterClassNames();
1484
1485 void setAppliedRule(int32_t position, const char* value);
1486
1487 std::string getAppliedRule(int32_t position);
1488
1489 virtual ~RBBIMonkeyKind();
1490 UErrorCode deferredStatus;
1491
1492 std::string classNameFromCodepoint(const UChar32 c);
1493 unsigned int maxClassNameSize();
1494
1495 protected:
1496 RBBIMonkeyKind();
1497 std::vector<std::string> classNames;
1498 std::vector<std::string> appliedRules;
1499
1500 // Clear `appliedRules` and fill it with empty strings in the size of test text.
1501 void prepareAppliedRules(int32_t size );
1502
1503 private:
1504
1505 };
1506
RBBIMonkeyKind()1507 RBBIMonkeyKind::RBBIMonkeyKind() {
1508 deferredStatus = U_ZERO_ERROR;
1509 }
1510
~RBBIMonkeyKind()1511 RBBIMonkeyKind::~RBBIMonkeyKind() {
1512 }
1513
characterClassNames()1514 std::vector<std::string>& RBBIMonkeyKind::characterClassNames() {
1515 return classNames;
1516 }
1517
prepareAppliedRules(int32_t size)1518 void RBBIMonkeyKind::prepareAppliedRules(int32_t size) {
1519 // Remove all the information in the `appliedRules`.
1520 appliedRules.clear();
1521 appliedRules.resize(size + 1);
1522 }
1523
setAppliedRule(int32_t position,const char * value)1524 void RBBIMonkeyKind::setAppliedRule(int32_t position, const char* value) {
1525 appliedRules[position] = value;
1526 }
1527
getAppliedRule(int32_t position)1528 std::string RBBIMonkeyKind::getAppliedRule(int32_t position){
1529 return appliedRules[position];
1530 }
1531
classNameFromCodepoint(const UChar32 c)1532 std::string RBBIMonkeyKind::classNameFromCodepoint(const UChar32 c) {
1533 // Simply iterate through charClasses to find character's class
1534 for (int aClassNum = 0; aClassNum < charClasses()->size(); aClassNum++) {
1535 UnicodeSet *classSet = (UnicodeSet *)charClasses()->elementAt(aClassNum);
1536 if (classSet->contains(c)) {
1537 return classNames[aClassNum];
1538 }
1539 }
1540 U_ASSERT(FALSE); // This should not happen.
1541 return "bad class name";
1542 }
1543
maxClassNameSize()1544 unsigned int RBBIMonkeyKind::maxClassNameSize() {
1545 unsigned int maxSize = 0;
1546 for (int aClassNum = 0; aClassNum < charClasses()->size(); aClassNum++) {
1547 auto aClassNumSize = static_cast<unsigned int>(classNames[aClassNum].size());
1548 if (aClassNumSize > maxSize) {
1549 maxSize = aClassNumSize;
1550 }
1551 }
1552 return maxSize;
1553 }
1554
1555 //----------------------------------------------------------------------------------------
1556 //
1557 // Random Numbers. Similar to standard lib rand() and srand()
1558 // Not using library to
1559 // 1. Get same results on all platforms.
1560 // 2. Get access to current seed, to more easily reproduce failures.
1561 //
1562 //---------------------------------------------------------------------------------------
// Current generator state; starts at 1.
static uint32_t m_seed = 1;

// Advances the linear congruential generator (32-bit wrap-around arithmetic)
// and returns bits 16..30 of the new state, i.e. a value in [0, 32767].
static uint32_t m_rand()
{
    m_seed = 1103515245u * m_seed + 12345u;
    return (m_seed >> 16) & 0x7fffu;
}
1570
1571
1572 //------------------------------------------------------------------------------------------
1573 //
1574 // class RBBICharMonkey Character (Grapheme Cluster) specific implementation
1575 // of RBBIMonkeyKind.
1576 //
1577 //------------------------------------------------------------------------------------------
1578 class RBBICharMonkey: public RBBIMonkeyKind {
1579 public:
1580 RBBICharMonkey();
1581 virtual ~RBBICharMonkey();
1582 virtual UVector *charClasses() override;
1583 virtual void setText(const UnicodeString &s) override;
1584 virtual int32_t next(int32_t i) override;
1585 private:
1586 UVector *fSets;
1587
1588 UnicodeSet *fCRLFSet;
1589 UnicodeSet *fControlSet;
1590 UnicodeSet *fExtendSet;
1591 UnicodeSet *fZWJSet;
1592 UnicodeSet *fRegionalIndicatorSet;
1593 UnicodeSet *fPrependSet;
1594 UnicodeSet *fSpacingSet;
1595 UnicodeSet *fLSet;
1596 UnicodeSet *fVSet;
1597 UnicodeSet *fTSet;
1598 UnicodeSet *fLVSet;
1599 UnicodeSet *fLVTSet;
1600 UnicodeSet *fHangulSet;
1601 UnicodeSet *fExtendedPictSet;
1602 UnicodeSet *fViramaSet;
1603 UnicodeSet *fLinkingConsonantSet;
1604 UnicodeSet *fExtCccZwjSet;
1605 UnicodeSet *fAnySet;
1606
1607 const UnicodeString *fText;
1608 };
1609
1610
RBBICharMonkey()1611 RBBICharMonkey::RBBICharMonkey() {
1612 UErrorCode status = U_ZERO_ERROR;
1613
1614 fText = NULL;
1615
1616 fCRLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
1617 fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Control}]]"), status);
1618 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Extend}]]"), status);
1619 fZWJSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = ZWJ}]"), status);
1620 fRegionalIndicatorSet =
1621 new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
1622 fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
1623 fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
1624 fLSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
1625 fVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
1626 fTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
1627 fLVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
1628 fLVTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
1629 fHangulSet = new UnicodeSet();
1630 fHangulSet->addAll(*fLSet);
1631 fHangulSet->addAll(*fVSet);
1632 fHangulSet->addAll(*fTSet);
1633 fHangulSet->addAll(*fLVSet);
1634 fHangulSet->addAll(*fLVTSet);
1635
1636 fExtendedPictSet = new UnicodeSet(u"[:Extended_Pictographic:]", status);
1637 fViramaSet = new UnicodeSet(u"[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
1638 "\\p{Indic_Syllabic_Category=Virama}]", status);
1639 fLinkingConsonantSet = new UnicodeSet(u"[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
1640 "\\p{Indic_Syllabic_Category=Consonant}]", status);
1641 fExtCccZwjSet = new UnicodeSet(u"[[\\p{gcb=Extend}-\\p{ccc=0}] \\p{gcb=ZWJ}]", status);
1642 fAnySet = new UnicodeSet(0, 0x10ffff);
1643
1644 // Create sets of characters, and add the names of the above character sets.
1645 // In each new ICU release, add new names corresponding to the sets above.
1646 fSets = new UVector(status);
1647
1648 // Important: Keep class names the same as the class contents.
1649 fSets->addElement(fCRLFSet, status); classNames.push_back("CRLF");
1650 fSets->addElement(fControlSet, status); classNames.push_back("Control");
1651 fSets->addElement(fExtendSet, status); classNames.push_back("Extended");
1652 fSets->addElement(fRegionalIndicatorSet, status); classNames.push_back("RegionalIndicator");
1653 if (!fPrependSet->isEmpty()) {
1654 fSets->addElement(fPrependSet, status); classNames.push_back("Prepend");
1655 }
1656 fSets->addElement(fSpacingSet, status); classNames.push_back("Spacing");
1657 fSets->addElement(fHangulSet, status); classNames.push_back("Hangul");
1658 fSets->addElement(fZWJSet, status); classNames.push_back("ZWJ");
1659 fSets->addElement(fExtendedPictSet, status); classNames.push_back("ExtendedPict");
1660 fSets->addElement(fViramaSet, status); classNames.push_back("Virama");
1661 fSets->addElement(fLinkingConsonantSet, status); classNames.push_back("LinkingConsonant");
1662 fSets->addElement(fExtCccZwjSet, status); classNames.push_back("ExtCcccZwj");
1663 fSets->addElement(fAnySet, status); classNames.push_back("Any");
1664
1665 if (U_FAILURE(status)) {
1666 deferredStatus = status;
1667 }
1668 }
1669
1670
setText(const UnicodeString & s)1671 void RBBICharMonkey::setText(const UnicodeString &s) {
1672 fText = &s;
1673 prepareAppliedRules(s.length());
1674 }
1675
1676
1677
next(int32_t prevPos)1678 int32_t RBBICharMonkey::next(int32_t prevPos) {
1679 int p0, p1, p2, p3; // Indices of the significant code points around the
1680 // break position being tested. The candidate break
1681 // location is before p2.
1682
1683 int breakPos = -1;
1684
1685 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
1686 UChar32 cBase; // for (X Extend*) patterns, the X character.
1687
1688 if (U_FAILURE(deferredStatus)) {
1689 return -1;
1690 }
1691
1692 // Previous break at end of string. return DONE.
1693 if (prevPos >= fText->length()) {
1694 return -1;
1695 }
1696
1697 p0 = p1 = p2 = p3 = prevPos;
1698 c3 = fText->char32At(prevPos);
1699 c0 = c1 = c2 = cBase = 0;
1700 (void)p0; // suppress set but not used warning.
1701 (void)c0;
1702
1703 // Loop runs once per "significant" character position in the input text.
1704 for (;;) {
1705 // Move all of the positions forward in the input string.
1706 p0 = p1; c0 = c1;
1707 p1 = p2; c1 = c2;
1708 p2 = p3; c2 = c3;
1709
1710 // Advance p3 by one codepoint
1711 p3 = fText->moveIndex32(p3, 1);
1712 c3 = fText->char32At(p3);
1713
1714 if (p1 == p2) {
1715 // Still warming up the loop. (won't work with zero length strings, but we don't care)
1716 continue;
1717 }
1718
1719 if (p2 == fText->length()) {
1720 setAppliedRule(p2, "End of String");
1721 break;
1722 }
1723
1724 // No Extend or Format characters may appear between the CR and LF,
1725 // which requires the additional check for p2 immediately following p1.
1726 //
1727 if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
1728 setAppliedRule(p2, "GB3 CR x LF");
1729 continue;
1730 }
1731
1732 if (fControlSet->contains(c1) ||
1733 c1 == 0x0D ||
1734 c1 == 0x0A) {
1735 setAppliedRule(p2, "GB4 ( Control | CR | LF ) <break>");
1736 break;
1737 }
1738
1739 if (fControlSet->contains(c2) ||
1740 c2 == 0x0D ||
1741 c2 == 0x0A) {
1742 setAppliedRule(p2, "GB5 <break> ( Control | CR | LF )");
1743 break;
1744 }
1745
1746 if (fLSet->contains(c1) &&
1747 (fLSet->contains(c2) ||
1748 fVSet->contains(c2) ||
1749 fLVSet->contains(c2) ||
1750 fLVTSet->contains(c2))) {
1751 setAppliedRule(p2, "GB6 L x ( L | V | LV | LVT )");
1752 continue;
1753 }
1754
1755 if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
1756 (fVSet->contains(c2) || fTSet->contains(c2))) {
1757 setAppliedRule(p2, "GB7 ( LV | V ) x ( V | T )");
1758 continue;
1759 }
1760
1761 if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
1762 fTSet->contains(c2)) {
1763 setAppliedRule(p2, "GB8 ( LVT | T) x T");
1764 continue;
1765 }
1766
1767 if (fExtendSet->contains(c2) || fZWJSet->contains(c2)) {
1768 if (!fExtendSet->contains(c1)) {
1769 cBase = c1;
1770 }
1771 setAppliedRule(p2, "GB9 x (Extend | ZWJ)");
1772 continue;
1773 }
1774
1775 if (fSpacingSet->contains(c2)) {
1776 setAppliedRule(p2, "GB9a x SpacingMark");
1777 continue;
1778 }
1779
1780 if (fPrependSet->contains(c1)) {
1781 setAppliedRule(p2, "GB9b Prepend x");
1782 continue;
1783 }
1784
1785 // Note: Viramas are also included in the ExtCccZwj class.
1786 if (fLinkingConsonantSet->contains(c2)) {
1787 int pi = p1;
1788 bool sawVirama = false;
1789 while (pi > 0 && fExtCccZwjSet->contains(fText->char32At(pi))) {
1790 if (fViramaSet->contains(fText->char32At(pi))) {
1791 sawVirama = true;
1792 }
1793 pi = fText->moveIndex32(pi, -1);
1794 }
1795 if (sawVirama && fLinkingConsonantSet->contains(fText->char32At(pi))) {
1796 setAppliedRule(p2, "GB9.3 LinkingConsonant ExtCccZwj* Virama ExtCccZwj* × LinkingConsonant");
1797 continue;
1798 }
1799 }
1800
1801 if (fExtendedPictSet->contains(cBase) && fZWJSet->contains(c1) && fExtendedPictSet->contains(c2)) {
1802 setAppliedRule(p2, "GB11 Extended_Pictographic Extend * ZWJ x Extended_Pictographic");
1803 continue;
1804 }
1805
1806 // Note: The first if condition is a little tricky. We only need to force
1807 // a break if there are three or more contiguous RIs. If there are
1808 // only two, a break following will occur via other rules, and will include
1809 // any trailing extend characters, which is needed behavior.
1810 if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)
1811 && fRegionalIndicatorSet->contains(c2)) {
1812 setAppliedRule(p2, "GB12-13 Regional_Indicator x Regional_Indicator");
1813 break;
1814 }
1815 if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
1816 setAppliedRule(p2, "GB12-13 Regional_Indicator x Regional_Indicator");
1817 continue;
1818 }
1819
1820 setAppliedRule(p2, "GB999 Any <break> Any");
1821 break;
1822 }
1823
1824 breakPos = p2;
1825 return breakPos;
1826 }
1827
1828
1829
charClasses()1830 UVector *RBBICharMonkey::charClasses() {
1831 return fSets;
1832 }
1833
~RBBICharMonkey()1834 RBBICharMonkey::~RBBICharMonkey() {
1835 delete fSets;
1836 delete fCRLFSet;
1837 delete fControlSet;
1838 delete fExtendSet;
1839 delete fRegionalIndicatorSet;
1840 delete fPrependSet;
1841 delete fSpacingSet;
1842 delete fLSet;
1843 delete fVSet;
1844 delete fTSet;
1845 delete fLVSet;
1846 delete fLVTSet;
1847 delete fHangulSet;
1848 delete fAnySet;
1849 delete fZWJSet;
1850 delete fExtendedPictSet;
1851 delete fViramaSet;
1852 delete fLinkingConsonantSet;
1853 delete fExtCccZwjSet;
1854 }
1855
1856 //------------------------------------------------------------------------------------------
1857 //
1858 // class RBBIWordMonkey Word Break specific implementation
1859 // of RBBIMonkeyKind.
1860 //
1861 //------------------------------------------------------------------------------------------
1862 class RBBIWordMonkey: public RBBIMonkeyKind {
1863 public:
1864 RBBIWordMonkey();
1865 virtual ~RBBIWordMonkey();
1866 virtual UVector *charClasses() override;
1867 virtual void setText(const UnicodeString &s) override;
1868 virtual int32_t next(int32_t i) override;
1869 private:
1870 UVector *fSets;
1871
1872 UnicodeSet *fCRSet;
1873 UnicodeSet *fLFSet;
1874 UnicodeSet *fNewlineSet;
1875 UnicodeSet *fRegionalIndicatorSet;
1876 UnicodeSet *fKatakanaSet;
1877 UnicodeSet *fHebrew_LetterSet;
1878 UnicodeSet *fALetterSet;
1879 UnicodeSet *fSingle_QuoteSet;
1880 UnicodeSet *fDouble_QuoteSet;
1881 UnicodeSet *fMidNumLetSet;
1882 UnicodeSet *fMidLetterSet;
1883 UnicodeSet *fMidNumSet;
1884 UnicodeSet *fNumericSet;
1885 UnicodeSet *fFormatSet;
1886 UnicodeSet *fOtherSet = nullptr;
1887 UnicodeSet *fExtendSet;
1888 UnicodeSet *fExtendNumLetSet;
1889 UnicodeSet *fWSegSpaceSet;
1890 UnicodeSet *fDictionarySet = nullptr;
1891 UnicodeSet *fZWJSet;
1892 UnicodeSet *fExtendedPictSet;
1893
1894 const UnicodeString *fText;
1895 };
1896
1897
RBBIWordMonkey()1898 RBBIWordMonkey::RBBIWordMonkey()
1899 {
1900 UErrorCode status = U_ZERO_ERROR;
1901
1902 fSets = new UVector(status);
1903
1904 fCRSet = new UnicodeSet(u"[\\p{Word_Break = CR}]", status);
1905 fLFSet = new UnicodeSet(u"[\\p{Word_Break = LF}]", status);
1906 fNewlineSet = new UnicodeSet(u"[\\p{Word_Break = Newline}]", status);
1907 fKatakanaSet = new UnicodeSet(u"[\\p{Word_Break = Katakana}]", status);
1908 fRegionalIndicatorSet = new UnicodeSet(u"[\\p{Word_Break = Regional_Indicator}]", status);
1909 fHebrew_LetterSet = new UnicodeSet(u"[\\p{Word_Break = Hebrew_Letter}]", status);
1910 fALetterSet = new UnicodeSet(u"[\\p{Word_Break = ALetter}]", status);
1911 fSingle_QuoteSet = new UnicodeSet(u"[\\p{Word_Break = Single_Quote}]", status);
1912 fDouble_QuoteSet = new UnicodeSet(u"[\\p{Word_Break = Double_Quote}]", status);
1913 fMidNumLetSet = new UnicodeSet(u"[\\p{Word_Break = MidNumLet}]", status);
1914 fMidLetterSet = new UnicodeSet(u"[\\p{Word_Break = MidLetter}]", status);
1915 fMidNumSet = new UnicodeSet(u"[\\p{Word_Break = MidNum}]", status);
1916 fNumericSet = new UnicodeSet(u"[\\p{Word_Break = Numeric}]", status);
1917 fFormatSet = new UnicodeSet(u"[\\p{Word_Break = Format}]", status);
1918 fExtendNumLetSet = new UnicodeSet(u"[\\p{Word_Break = ExtendNumLet}]", status);
1919 // There are some sc=Hani characters with WB=Extend.
1920 // The break rules need to pick one or the other because
1921 // Extend overlapping with something else is messy.
1922 // For Unicode 13, we chose to keep U+16FF0 & U+16FF1
1923 // in $Han (for $dictionary) and out of $Extend.
1924 fExtendSet = new UnicodeSet(u"[\\p{Word_Break = Extend}-[:Hani:]]", status);
1925 fWSegSpaceSet = new UnicodeSet(u"[\\p{Word_Break = WSegSpace}]", status);
1926
1927 fZWJSet = new UnicodeSet(u"[\\p{Word_Break = ZWJ}]", status);
1928 fExtendedPictSet = new UnicodeSet(u"[:Extended_Pictographic:]", status);
1929 if(U_FAILURE(status)) {
1930 IntlTest::gTest->errln("%s:%d %s", __FILE__, __LINE__, u_errorName(status));
1931 deferredStatus = status;
1932 return;
1933 }
1934
1935 fDictionarySet = new UnicodeSet(u"[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]", status);
1936 fDictionarySet->addAll(*fKatakanaSet);
1937 fDictionarySet->addAll(UnicodeSet(u"[\\p{LineBreak = Complex_Context}]", status));
1938
1939 fALetterSet->removeAll(*fDictionarySet);
1940
1941 fOtherSet = new UnicodeSet();
1942 if(U_FAILURE(status)) {
1943 IntlTest::gTest->errln("%s:%d %s", __FILE__, __LINE__, u_errorName(status));
1944 deferredStatus = status;
1945 return;
1946 }
1947
1948 fOtherSet->complement();
1949 fOtherSet->removeAll(*fCRSet);
1950 fOtherSet->removeAll(*fLFSet);
1951 fOtherSet->removeAll(*fNewlineSet);
1952 fOtherSet->removeAll(*fKatakanaSet);
1953 fOtherSet->removeAll(*fHebrew_LetterSet);
1954 fOtherSet->removeAll(*fALetterSet);
1955 fOtherSet->removeAll(*fSingle_QuoteSet);
1956 fOtherSet->removeAll(*fDouble_QuoteSet);
1957 fOtherSet->removeAll(*fMidLetterSet);
1958 fOtherSet->removeAll(*fMidNumSet);
1959 fOtherSet->removeAll(*fNumericSet);
1960 fOtherSet->removeAll(*fExtendNumLetSet);
1961 fOtherSet->removeAll(*fWSegSpaceSet);
1962 fOtherSet->removeAll(*fFormatSet);
1963 fOtherSet->removeAll(*fExtendSet);
1964 fOtherSet->removeAll(*fRegionalIndicatorSet);
1965 fOtherSet->removeAll(*fZWJSet);
1966 fOtherSet->removeAll(*fExtendedPictSet);
1967
1968 // Inhibit dictionary characters from being tested at all.
1969 fOtherSet->removeAll(*fDictionarySet);
1970
1971 // Add classes and their names
1972 fSets->addElement(fCRSet, status); classNames.push_back("CR");
1973 fSets->addElement(fLFSet, status); classNames.push_back("LF");
1974 fSets->addElement(fNewlineSet, status); classNames.push_back("Newline");
1975 fSets->addElement(fRegionalIndicatorSet, status); classNames.push_back("RegionalIndicator");
1976 fSets->addElement(fHebrew_LetterSet, status); classNames.push_back("Hebrew");
1977 fSets->addElement(fALetterSet, status); classNames.push_back("ALetter");
1978 fSets->addElement(fSingle_QuoteSet, status); classNames.push_back("Single Quote");
1979 fSets->addElement(fDouble_QuoteSet, status); classNames.push_back("Double Quote");
1980 // Omit Katakana from fSets, which omits Katakana characters
1981 // from the test data. They are all in the dictionary set,
1982 // which this (old, to be retired) monkey test cannot handle.
1983 //fSets->addElement(fKatakanaSet, status);
1984
1985 fSets->addElement(fMidLetterSet, status); classNames.push_back("MidLetter");
1986 fSets->addElement(fMidNumLetSet, status); classNames.push_back("MidNumLet");
1987 fSets->addElement(fMidNumSet, status); classNames.push_back("MidNum");
1988 fSets->addElement(fNumericSet, status); classNames.push_back("Numeric");
1989 fSets->addElement(fFormatSet, status); classNames.push_back("Format");
1990 fSets->addElement(fExtendSet, status); classNames.push_back("Extend");
1991 fSets->addElement(fOtherSet, status); classNames.push_back("Other");
1992 fSets->addElement(fExtendNumLetSet, status); classNames.push_back("ExtendNumLet");
1993 fSets->addElement(fWSegSpaceSet, status); classNames.push_back("WSegSpace");
1994
1995 fSets->addElement(fZWJSet, status); classNames.push_back("ZWJ");
1996 fSets->addElement(fExtendedPictSet, status); classNames.push_back("ExtendedPict");
1997
1998 if (U_FAILURE(status)) {
1999 deferredStatus = status;
2000 }
2001 }
2002
setText(const UnicodeString & s)2003 void RBBIWordMonkey::setText(const UnicodeString &s) {
2004 fText = &s;
2005 prepareAppliedRules(s.length());
2006 }
2007
2008
next(int32_t prevPos)2009 int32_t RBBIWordMonkey::next(int32_t prevPos) {
2010 int p0, p1, p2, p3; // Indices of the significant code points around the
2011 // break position being tested. The candidate break
2012 // location is before p2.
2013
2014 int breakPos = -1;
2015
2016 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
2017
2018 if (U_FAILURE(deferredStatus)) {
2019 return -1;
2020 }
2021
2022 // Prev break at end of string. return DONE.
2023 if (prevPos >= fText->length()) {
2024 return -1;
2025 }
2026 p0 = p1 = p2 = p3 = prevPos;
2027 c3 = fText->char32At(prevPos);
2028 c0 = c1 = c2 = 0;
2029 (void)p0; // Suppress set but not used warning.
2030
2031 // Loop runs once per "significant" character position in the input text.
2032 for (;;) {
2033 // Move all of the positions forward in the input string.
2034 p0 = p1; c0 = c1;
2035 p1 = p2; c1 = c2;
2036 p2 = p3; c2 = c3;
2037
2038 // Advance p3 by X(Extend | Format)* Rule 4
2039 // But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
2040 do {
2041 p3 = fText->moveIndex32(p3, 1);
2042 c3 = fText->char32At(p3);
2043 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2044 break;
2045 }
2046 }
2047 while (fFormatSet->contains(c3) || fExtendSet->contains(c3) || fZWJSet->contains(c3));
2048
2049
2050 if (p1 == p2) {
2051 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2052 continue;
2053 }
2054
2055 if (p2 == fText->length()) {
2056 // Reached end of string. Always a break position.
2057 break;
2058 }
2059
2060 // No Extend or Format characters may appear between the CR and LF,
2061 // which requires the additional check for p2 immediately following p1.
2062 //
2063 if (c1==0x0D && c2==0x0A) {
2064 setAppliedRule(p2, "WB3 CR x LF");
2065 continue;
2066 }
2067
2068 if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
2069 setAppliedRule(p2, "WB3a Break before and after newlines (including CR and LF)");
2070 break;
2071 }
2072 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2073 setAppliedRule(p2, "WB3a Break before and after newlines (including CR and LF)");
2074 break;
2075 }
2076
2077 // Not ignoring extend chars, so peek into input text to
2078 // get the potential ZWJ, the character immediately preceding c2.
2079 // Sloppy UChar32 indexing: p2-1 may reference trail half
2080 // but char32At will get the full code point.
2081 if (fZWJSet->contains(fText->char32At(p2 - 1)) && fExtendedPictSet->contains(c2)){
2082 setAppliedRule(p2, "WB3c ZWJ x Extended_Pictographic");
2083 continue;
2084 }
2085
2086 if (fWSegSpaceSet->contains(fText->char32At(p2-1)) && fWSegSpaceSet->contains(c2)) {
2087 setAppliedRule(p2, "WB3d Keep horizontal whitespace together.");
2088 continue;
2089 }
2090
2091 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2092 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
2093 setAppliedRule(p2, "WB4 (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)");
2094 continue;
2095 }
2096
2097 if ( (fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2098 (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
2099 (fALetterSet->contains(c3) || fHebrew_LetterSet->contains(c3))) {
2100 setAppliedRule(p2,
2101 "WB6 (ALetter | Hebrew_Letter) x (MidLetter | MidNumLet | Single_Quote) (ALetter _Letter)");
2102 continue;
2103 }
2104
2105 if ((fALetterSet->contains(c0) || fHebrew_LetterSet->contains(c0)) &&
2106 (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
2107 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
2108 setAppliedRule(p2,
2109 "WB7 (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) x (ALetter | Hebrew_Letter)");
2110 continue;
2111 }
2112
2113 if (fHebrew_LetterSet->contains(c1) && fSingle_QuoteSet->contains(c2)) {
2114 setAppliedRule(p2, "WB7a Hebrew_Letter x Single_Quote");
2115 continue;
2116 }
2117
2118 if (fHebrew_LetterSet->contains(c1) && fDouble_QuoteSet->contains(c2) && fHebrew_LetterSet->contains(c3)) {
2119 setAppliedRule(p2, "WB7b Hebrew_Letter x Double_Quote Hebrew_Letter");
2120 continue;
2121 }
2122
2123 if (fHebrew_LetterSet->contains(c0) && fDouble_QuoteSet->contains(c1) && fHebrew_LetterSet->contains(c2)) {
2124 setAppliedRule(p2, "WB7c Hebrew_Letter Double_Quote x Hebrew_Letter");
2125 continue;
2126 }
2127
2128 if (fNumericSet->contains(c1) &&
2129 fNumericSet->contains(c2)) {
2130 setAppliedRule(p2, "WB8 Numeric x Numeric");
2131 continue;
2132 }
2133
2134 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2135 fNumericSet->contains(c2)) {
2136 setAppliedRule(p2, "WB9 (ALetter | Hebrew_Letter) x Numeric");
2137 continue;
2138 }
2139
2140 if (fNumericSet->contains(c1) &&
2141 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
2142 setAppliedRule(p2, "WB10 Numeric x (ALetter | Hebrew_Letter)");
2143 continue;
2144 }
2145
2146 if (fNumericSet->contains(c0) &&
2147 (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
2148 fNumericSet->contains(c2)) {
2149 setAppliedRule(p2, "WB11 Numeric (MidNum | MidNumLet | Single_Quote) x Numeric");
2150 continue;
2151 }
2152
2153 if (fNumericSet->contains(c1) &&
2154 (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
2155 fNumericSet->contains(c3)) {
2156 setAppliedRule(p2, "WB12 Numeric x (MidNum | MidNumLet | SingleQuote) Numeric");
2157 continue;
2158 }
2159
2160 // Note: matches UAX 29 rules, but doesn't come into play for ICU because
2161 // all Katakana are handled by the dictionary breaker.
2162 if (fKatakanaSet->contains(c1) &&
2163 fKatakanaSet->contains(c2)) {
2164 setAppliedRule(p2, "WB13 Katakana x Katakana");
2165 continue;
2166 }
2167
2168 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1) ||fNumericSet->contains(c1) ||
2169 fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
2170 fExtendNumLetSet->contains(c2)) {
2171 setAppliedRule(p2,
2172 "WB13a (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet");
2173 continue;
2174 }
2175
2176 if (fExtendNumLetSet->contains(c1) &&
2177 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2) ||
2178 fNumericSet->contains(c2) || fKatakanaSet->contains(c2))) {
2179 setAppliedRule(p2, "WB13b ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)");
2180 continue;
2181 }
2182
2183 if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)) {
2184 setAppliedRule(p2, "WB15 - WB17 Group pairs of Regional Indicators.");
2185 break;
2186 }
2187 if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
2188 setAppliedRule(p2, "WB15 - WB17 Group pairs of Regional Indicators.");
2189 continue;
2190 }
2191
2192 setAppliedRule(p2, "WB999");
2193 break;
2194 }
2195
2196 breakPos = p2;
2197 return breakPos;
2198 }
2199
2200
charClasses()2201 UVector *RBBIWordMonkey::charClasses() {
2202 return fSets;
2203 }
2204
~RBBIWordMonkey()2205 RBBIWordMonkey::~RBBIWordMonkey() {
2206 delete fSets;
2207 delete fCRSet;
2208 delete fLFSet;
2209 delete fNewlineSet;
2210 delete fKatakanaSet;
2211 delete fHebrew_LetterSet;
2212 delete fALetterSet;
2213 delete fSingle_QuoteSet;
2214 delete fDouble_QuoteSet;
2215 delete fMidNumLetSet;
2216 delete fMidLetterSet;
2217 delete fMidNumSet;
2218 delete fNumericSet;
2219 delete fFormatSet;
2220 delete fExtendSet;
2221 delete fExtendNumLetSet;
2222 delete fWSegSpaceSet;
2223 delete fRegionalIndicatorSet;
2224 delete fDictionarySet;
2225 delete fOtherSet;
2226 delete fZWJSet;
2227 delete fExtendedPictSet;
2228 }
2229
2230
2231
2232
2233 //------------------------------------------------------------------------------------------
2234 //
2235 // class RBBISentMonkey Sentence Break specific implementation
2236 // of RBBIMonkeyKind.
2237 //
2238 //------------------------------------------------------------------------------------------
2239 class RBBISentMonkey: public RBBIMonkeyKind {
2240 public:
2241 RBBISentMonkey();
2242 virtual ~RBBISentMonkey();
2243 virtual UVector *charClasses() override;
2244 virtual void setText(const UnicodeString &s) override;
2245 virtual int32_t next(int32_t i) override;
2246 private:
2247 int moveBack(int posFrom);
2248 int moveForward(int posFrom);
2249 UChar32 cAt(int pos);
2250
2251 UVector *fSets;
2252
2253 UnicodeSet *fSepSet;
2254 UnicodeSet *fFormatSet;
2255 UnicodeSet *fSpSet;
2256 UnicodeSet *fLowerSet;
2257 UnicodeSet *fUpperSet;
2258 UnicodeSet *fOLetterSet;
2259 UnicodeSet *fNumericSet;
2260 UnicodeSet *fATermSet;
2261 UnicodeSet *fSContinueSet;
2262 UnicodeSet *fSTermSet;
2263 UnicodeSet *fCloseSet;
2264 UnicodeSet *fOtherSet;
2265 UnicodeSet *fExtendSet;
2266
2267 const UnicodeString *fText;
2268 };
2269
RBBISentMonkey()2270 RBBISentMonkey::RBBISentMonkey()
2271 {
2272 UErrorCode status = U_ZERO_ERROR;
2273
2274 fSets = new UVector(status);
2275
2276 // Separator Set Note: Beginning with Unicode 5.1, CR and LF were removed from the separator
2277 // set and made into character classes of their own. For the monkey impl,
2278 // they remain in SEP, since Sep always appears with CR and LF in the rules.
2279 fSepSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"), status);
2280 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"), status);
2281 fSpSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"), status);
2282 fLowerSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"), status);
2283 fUpperSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"), status);
2284 fOLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"), status);
2285 fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"), status);
2286 fATermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"), status);
2287 fSContinueSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
2288 fSTermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"), status);
2289 fCloseSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"), status);
2290 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"), status);
2291 fOtherSet = new UnicodeSet();
2292
2293 if(U_FAILURE(status)) {
2294 deferredStatus = status;
2295 return;
2296 }
2297
2298 fOtherSet->complement();
2299 fOtherSet->removeAll(*fSepSet);
2300 fOtherSet->removeAll(*fFormatSet);
2301 fOtherSet->removeAll(*fSpSet);
2302 fOtherSet->removeAll(*fLowerSet);
2303 fOtherSet->removeAll(*fUpperSet);
2304 fOtherSet->removeAll(*fOLetterSet);
2305 fOtherSet->removeAll(*fNumericSet);
2306 fOtherSet->removeAll(*fATermSet);
2307 fOtherSet->removeAll(*fSContinueSet);
2308 fOtherSet->removeAll(*fSTermSet);
2309 fOtherSet->removeAll(*fCloseSet);
2310 fOtherSet->removeAll(*fExtendSet);
2311
2312 fSets->addElement(fSepSet, status); classNames.push_back("Sep");
2313 fSets->addElement(fFormatSet, status); classNames.push_back("Format");
2314 fSets->addElement(fSpSet, status); classNames.push_back("Sp");
2315 fSets->addElement(fLowerSet, status); classNames.push_back("Lower");
2316 fSets->addElement(fUpperSet, status); classNames.push_back("Upper");
2317 fSets->addElement(fOLetterSet, status); classNames.push_back("OLetter");
2318 fSets->addElement(fNumericSet, status); classNames.push_back("Numeric");
2319 fSets->addElement(fATermSet, status); classNames.push_back("ATerm");
2320 fSets->addElement(fSContinueSet, status); classNames.push_back("SContinue");
2321 fSets->addElement(fSTermSet, status); classNames.push_back("STerm");
2322 fSets->addElement(fCloseSet, status); classNames.push_back("Close");
2323 fSets->addElement(fOtherSet, status); classNames.push_back("Other");
2324 fSets->addElement(fExtendSet, status); classNames.push_back("Extend");
2325
2326 if (U_FAILURE(status)) {
2327 deferredStatus = status;
2328 }
2329 }
2330
2331
2332
setText(const UnicodeString & s)2333 void RBBISentMonkey::setText(const UnicodeString &s) {
2334 fText = &s;
2335 prepareAppliedRules(s.length());
2336 }
2337
charClasses()2338 UVector *RBBISentMonkey::charClasses() {
2339 return fSets;
2340 }
2341
2342 // moveBack() Find the "significant" code point preceding the index i.
2343 // Skips over ($Extend | $Format)* .
2344 //
moveBack(int i)2345 int RBBISentMonkey::moveBack(int i) {
2346 if (i <= 0) {
2347 return -1;
2348 }
2349 UChar32 c;
2350 int32_t j = i;
2351 do {
2352 j = fText->moveIndex32(j, -1);
2353 c = fText->char32At(j);
2354 }
2355 while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
2356 return j;
2357
2358 }
2359
2360
moveForward(int i)2361 int RBBISentMonkey::moveForward(int i) {
2362 if (i>=fText->length()) {
2363 return fText->length();
2364 }
2365 UChar32 c;
2366 int32_t j = i;
2367 do {
2368 j = fText->moveIndex32(j, 1);
2369 c = cAt(j);
2370 }
2371 while (fFormatSet->contains(c) || fExtendSet->contains(c));
2372 return j;
2373 }
2374
cAt(int pos)2375 UChar32 RBBISentMonkey::cAt(int pos) {
2376 if (pos<0 || pos>=fText->length()) {
2377 return -1;
2378 } else {
2379 return fText->char32At(pos);
2380 }
2381 }
2382
next(int32_t prevPos)2383 int32_t RBBISentMonkey::next(int32_t prevPos) {
2384 int p0, p1, p2, p3; // Indices of the significant code points around the
2385 // break position being tested. The candidate break
2386 // location is before p2.
2387
2388 int breakPos = -1;
2389
2390 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
2391 UChar32 c;
2392
2393 if (U_FAILURE(deferredStatus)) {
2394 return -1;
2395 }
2396
2397 // Prev break at end of string. return DONE.
2398 if (prevPos >= fText->length()) {
2399 return -1;
2400 }
2401 p0 = p1 = p2 = p3 = prevPos;
2402 c3 = fText->char32At(prevPos);
2403 c0 = c1 = c2 = 0;
2404 (void)p0; // Suppress set but not used warning.
2405
2406 // Loop runs once per "significant" character position in the input text.
2407 for (;;) {
2408 // Move all of the positions forward in the input string.
2409 p0 = p1; c0 = c1;
2410 p1 = p2; c1 = c2;
2411 p2 = p3; c2 = c3;
2412
2413 // Advance p3 by X(Extend | Format)* Rule 4
2414 p3 = moveForward(p3);
2415 c3 = cAt(p3);
2416
2417 if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
2418 setAppliedRule(p2, "SB3 CR x LF");
2419 continue;
2420 }
2421
2422 if (fSepSet->contains(c1)) {
2423 p2 = p1+1; // Separators don't combine with Extend or Format.
2424
2425 setAppliedRule(p2, "SB4 Sep <break>");
2426 break;
2427 }
2428
2429 if (p2 >= fText->length()) {
2430 // Reached end of string. Always a break position.
2431 setAppliedRule(p2, "SB4 Sep <break>");
2432 break;
2433 }
2434
2435 if (p2 == prevPos) {
2436 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2437 setAppliedRule(p2, "SB4 Sep <break>");
2438 continue;
2439 }
2440
2441 if (fATermSet->contains(c1) && fNumericSet->contains(c2)) {
2442 setAppliedRule(p2, "SB6 ATerm x Numeric");
2443 continue;
2444 }
2445
2446 if ((fUpperSet->contains(c0) || fLowerSet->contains(c0)) &&
2447 fATermSet->contains(c1) && fUpperSet->contains(c2)) {
2448 setAppliedRule(p2, "SB7 (Upper | Lower) ATerm x Uppper");
2449 continue;
2450 }
2451
2452 // Note: STerm | ATerm are added to the negated part of the expression by a
2453 // note to the Unicode 5.0 documents.
2454 int p8 = p1;
2455 while (fSpSet->contains(cAt(p8))) {
2456 p8 = moveBack(p8);
2457 }
2458 while (fCloseSet->contains(cAt(p8))) {
2459 p8 = moveBack(p8);
2460 }
2461 if (fATermSet->contains(cAt(p8))) {
2462 p8=p2;
2463 for (;;) {
2464 c = cAt(p8);
2465 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
2466 fLowerSet->contains(c) || fSepSet->contains(c) ||
2467 fATermSet->contains(c) || fSTermSet->contains(c)) {
2468
2469 setAppliedRule(p2,
2470 "SB8 ATerm Close* Sp* x (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* ");
2471 break;
2472 }
2473 p8 = moveForward(p8);
2474 }
2475 if (fLowerSet->contains(cAt(p8))) {
2476
2477 setAppliedRule(p2,
2478 "SB8 ATerm Close* Sp* x (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* ");
2479 continue;
2480 }
2481 }
2482
2483 if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
2484 p8 = p1;
2485 while (fSpSet->contains(cAt(p8))) {
2486 p8 = moveBack(p8);
2487 }
2488 while (fCloseSet->contains(cAt(p8))) {
2489 p8 = moveBack(p8);
2490 }
2491 c = cAt(p8);
2492 if (fSTermSet->contains(c) || fATermSet->contains(c)) {
2493 setAppliedRule(p2, "SB8a (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm)");
2494 continue;
2495 }
2496 }
2497
2498 int p9 = p1;
2499 while (fCloseSet->contains(cAt(p9))) {
2500 p9 = moveBack(p9);
2501 }
2502 c = cAt(p9);
2503 if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
2504 if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
2505
2506 setAppliedRule(p2, "SB9 (STerm | ATerm) Close* x (Close | Sp | Sep | CR | LF)");
2507 continue;
2508 }
2509 }
2510
2511 int p10 = p1;
2512 while (fSpSet->contains(cAt(p10))) {
2513 p10 = moveBack(p10);
2514 }
2515 while (fCloseSet->contains(cAt(p10))) {
2516 p10 = moveBack(p10);
2517 }
2518 if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
2519 if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
2520 setAppliedRule(p2, "SB10 (Sterm | ATerm) Close* Sp* x (Sp | Sep | CR | LF)");
2521 continue;
2522 }
2523 }
2524
2525 int p11 = p1;
2526 if (fSepSet->contains(cAt(p11))) {
2527 p11 = moveBack(p11);
2528 }
2529 while (fSpSet->contains(cAt(p11))) {
2530 p11 = moveBack(p11);
2531 }
2532 while (fCloseSet->contains(cAt(p11))) {
2533 p11 = moveBack(p11);
2534 }
2535 if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
2536 setAppliedRule(p2, "SB11 (STerm | ATerm) Close* Sp* (Sep | CR | LF)? <break>");
2537 break;
2538 }
2539
2540 setAppliedRule(p2, "SB12 Any x Any");
2541 continue;
2542 }
2543
2544 breakPos = p2;
2545 return breakPos;
2546 }
2547
~RBBISentMonkey()2548 RBBISentMonkey::~RBBISentMonkey() {
2549 delete fSets;
2550 delete fSepSet;
2551 delete fFormatSet;
2552 delete fSpSet;
2553 delete fLowerSet;
2554 delete fUpperSet;
2555 delete fOLetterSet;
2556 delete fNumericSet;
2557 delete fATermSet;
2558 delete fSContinueSet;
2559 delete fSTermSet;
2560 delete fCloseSet;
2561 delete fOtherSet;
2562 delete fExtendSet;
2563 }
2564
2565
2566
2567 //-------------------------------------------------------------------------------------------
2568 //
2569 // RBBILineMonkey
2570 //
2571 //-------------------------------------------------------------------------------------------
2572
2573 class RBBILineMonkey: public RBBIMonkeyKind {
2574 public:
2575 RBBILineMonkey();
2576 virtual ~RBBILineMonkey();
2577 virtual UVector *charClasses() override;
2578 virtual void setText(const UnicodeString &s) override;
2579 virtual int32_t next(int32_t i) override;
2580 virtual void rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
2581 private:
2582 UVector *fSets;
2583
2584 UnicodeSet *fBK;
2585 UnicodeSet *fCR;
2586 UnicodeSet *fLF;
2587 UnicodeSet *fCM;
2588 UnicodeSet *fNL;
2589 UnicodeSet *fSG;
2590 UnicodeSet *fWJ;
2591 UnicodeSet *fZW;
2592 UnicodeSet *fGL;
2593 UnicodeSet *fCB;
2594 UnicodeSet *fSP;
2595 UnicodeSet *fB2;
2596 UnicodeSet *fBA;
2597 UnicodeSet *fBB;
2598 UnicodeSet *fHH;
2599 UnicodeSet *fHY;
2600 UnicodeSet *fH2;
2601 UnicodeSet *fH3;
2602 UnicodeSet *fCL;
2603 UnicodeSet *fCP;
2604 UnicodeSet *fEX;
2605 UnicodeSet *fIN;
2606 UnicodeSet *fJL;
2607 UnicodeSet *fJV;
2608 UnicodeSet *fJT;
2609 UnicodeSet *fNS;
2610 UnicodeSet *fOP;
2611 UnicodeSet *fQU;
2612 UnicodeSet *fIS;
2613 UnicodeSet *fNU;
2614 UnicodeSet *fPO;
2615 UnicodeSet *fPR;
2616 UnicodeSet *fSY;
2617 UnicodeSet *fAI;
2618 UnicodeSet *fAL;
2619 UnicodeSet *fCJ;
2620 UnicodeSet *fHL;
2621 UnicodeSet *fID;
2622 UnicodeSet *fRI;
2623 UnicodeSet *fXX;
2624 UnicodeSet *fEB;
2625 UnicodeSet *fEM;
2626 UnicodeSet *fZWJ;
2627 UnicodeSet *fOP30;
2628 UnicodeSet *fCP30;
2629 UnicodeSet *fExtPictUnassigned;
2630
2631 BreakIterator *fCharBI;
2632 const UnicodeString *fText;
2633 RegexMatcher *fNumberMatcher;
2634 };
2635
RBBILineMonkey()2636 RBBILineMonkey::RBBILineMonkey() :
2637 RBBIMonkeyKind(),
2638 fSets(NULL),
2639
2640 fCharBI(NULL),
2641 fText(NULL),
2642 fNumberMatcher(NULL)
2643
2644 {
2645 if (U_FAILURE(deferredStatus)) {
2646 return;
2647 }
2648
2649 UErrorCode status = U_ZERO_ERROR;
2650
2651 fSets = new UVector(status);
2652
2653 fBK = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
2654 fCR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
2655 fLF = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
2656 fCM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
2657 fNL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
2658 fWJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
2659 fZW = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
2660 fGL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
2661 fCB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
2662 fSP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
2663 fB2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
2664 fBA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
2665 fBB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
2666 fHH = new UnicodeSet();
2667 fHY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
2668 fH2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
2669 fH3 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
2670 fCL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
2671 fCP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
2672 fEX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
2673 fIN = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
2674 fJL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
2675 fJV = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
2676 fJT = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
2677 fNS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
2678 fOP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
2679 fQU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
2680 fIS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
2681 fNU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
2682 fPO = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
2683 fPR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
2684 fSY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
2685 fAI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
2686 fAL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
2687 fCJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status);
2688 fHL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status);
2689 fID = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
2690 fRI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status);
2691 fSG = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
2692 fXX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
2693 fEB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EB}]"), status);
2694 fEM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EM}]"), status);
2695 fZWJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status);
2696 fOP30 = new UnicodeSet(u"[\\p{Line_break=OP}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]", status);
2697 fCP30 = new UnicodeSet(u"[\\p{Line_break=CP}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]", status);
2698 fExtPictUnassigned = new UnicodeSet(u"[\\p{Extended_Pictographic}&\\p{Cn}]", status);
2699
2700 if (U_FAILURE(status)) {
2701 deferredStatus = status;
2702 return;
2703 }
2704
2705 fAL->addAll(*fXX); // Default behavior for XX is identical to AL
2706 fAL->addAll(*fAI); // Default behavior for AI is identical to AL
2707 fAL->addAll(*fSG); // Default behavior for SG is identical to AL.
2708
2709 fNS->addAll(*fCJ); // Default behavior for CJ is identical to NS.
2710 fCM->addAll(*fZWJ); // ZWJ behaves as a CM.
2711
2712 fHH->add(u'\u2010'); // Hyphen, '‐'
2713
2714 // Sets and names.
2715 fSets->addElement(fBK, status); classNames.push_back("fBK");
2716 fSets->addElement(fCR, status); classNames.push_back("fCR");
2717 fSets->addElement(fLF, status); classNames.push_back("fLF");
2718 fSets->addElement(fCM, status); classNames.push_back("fCM");
2719 fSets->addElement(fNL, status); classNames.push_back("fNL");
2720 fSets->addElement(fWJ, status); classNames.push_back("fWJ");
2721 fSets->addElement(fZW, status); classNames.push_back("fZW");
2722 fSets->addElement(fGL, status); classNames.push_back("fGL");
2723 fSets->addElement(fCB, status); classNames.push_back("fCB");
2724 fSets->addElement(fSP, status); classNames.push_back("fSP");
2725 fSets->addElement(fB2, status); classNames.push_back("fB2");
2726 fSets->addElement(fBA, status); classNames.push_back("fBA");
2727 fSets->addElement(fBB, status); classNames.push_back("fBB");
2728 fSets->addElement(fHY, status); classNames.push_back("fHY");
2729 fSets->addElement(fH2, status); classNames.push_back("fH2");
2730 fSets->addElement(fH3, status); classNames.push_back("fH3");
2731 fSets->addElement(fCL, status); classNames.push_back("fCL");
2732 fSets->addElement(fCP, status); classNames.push_back("fCP");
2733 fSets->addElement(fEX, status); classNames.push_back("fEX");
2734 fSets->addElement(fIN, status); classNames.push_back("fIN");
2735 fSets->addElement(fJL, status); classNames.push_back("fJL");
2736 fSets->addElement(fJT, status); classNames.push_back("fJT");
2737 fSets->addElement(fJV, status); classNames.push_back("fJV");
2738 fSets->addElement(fNS, status); classNames.push_back("fNS");
2739 fSets->addElement(fOP, status); classNames.push_back("fOP");
2740 fSets->addElement(fQU, status); classNames.push_back("fQU");
2741 fSets->addElement(fIS, status); classNames.push_back("fIS");
2742 fSets->addElement(fNU, status); classNames.push_back("fNU");
2743 fSets->addElement(fPO, status); classNames.push_back("fPO");
2744 fSets->addElement(fPR, status); classNames.push_back("fPR");
2745 fSets->addElement(fSY, status); classNames.push_back("fSY");
2746 fSets->addElement(fAI, status); classNames.push_back("fAI");
2747 fSets->addElement(fAL, status); classNames.push_back("fAL");
2748 fSets->addElement(fHL, status); classNames.push_back("fHL");
2749 fSets->addElement(fID, status); classNames.push_back("fID");
2750 fSets->addElement(fRI, status); classNames.push_back("fRI");
2751 fSets->addElement(fSG, status); classNames.push_back("fSG");
2752 fSets->addElement(fEB, status); classNames.push_back("fEB");
2753 fSets->addElement(fEM, status); classNames.push_back("fEM");
2754 fSets->addElement(fZWJ, status); classNames.push_back("fZWJ");
2755 // TODO: fOP30 & fCP30 overlap with plain fOP. Probably OK, but fOP/CP chars will be over-represented.
2756 fSets->addElement(fOP30, status); classNames.push_back("fOP30");
2757 fSets->addElement(fCP30, status); classNames.push_back("fCP30");
2758 fSets->addElement(fExtPictUnassigned, status); classNames.push_back("fExtPictUnassigned");
2759
2760 const char *rules =
2761 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?"
2762 "((\\p{Line_Break=OP}|\\p{Line_Break=HY})(\\p{Line_Break=CM}|\\u200d)*)?"
2763 "((\\p{Line_Break=IS})(\\p{Line_Break=CM}|\\u200d)*)?"
2764 "\\p{Line_Break=NU}(\\p{Line_Break=CM}|\\u200d)*"
2765 "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})(\\p{Line_Break=CM}|\\u200d)*)*"
2766 "((\\p{Line_Break=CL}|\\p{Line_Break=CP})(\\p{Line_Break=CM}|\\u200d)*)?"
2767 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?";
2768
2769 fNumberMatcher = new RegexMatcher(
2770 UnicodeString(rules, -1, US_INV), 0, status);
2771
2772 fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
2773
2774 if (U_FAILURE(status)) {
2775 deferredStatus = status;
2776 }
2777
2778 }
2779
2780
setText(const UnicodeString & s)2781 void RBBILineMonkey::setText(const UnicodeString &s) {
2782 fText = &s;
2783 fCharBI->setText(s);
2784 prepareAppliedRules(s.length());
2785 fNumberMatcher->reset(s);
2786 }
2787
2788 //
2789 // rule9Adjust
2790 // Line Break TR rules 9 and 10 implementation.
2791 // This deals with combining marks and other sequences that
2792 // that must be treated as if they were something other than what they actually are.
2793 //
2794 // This is factored out into a separate function because it must be applied twice for
2795 // each potential break, once to the chars before the position being checked, then
2796 // again to the text following the possible break.
2797 //
rule9Adjust(int32_t pos,UChar32 * posChar,int32_t * nextPos,UChar32 * nextChar)2798 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
2799 if (pos == -1) {
2800 // Invalid initial position. Happens during the warmup iteration of the
2801 // main loop in next().
2802 return;
2803 }
2804
2805 int32_t nPos = *nextPos;
2806
2807 // LB 9 Keep combining sequences together.
2808 // advance over any CM class chars. Note that Line Break CM is different
2809 // from the normal Grapheme Extend property.
2810 if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
2811 *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
2812 for (;;) {
2813 *nextChar = fText->char32At(nPos);
2814 if (!fCM->contains(*nextChar)) {
2815 break;
2816 }
2817 nPos = fText->moveIndex32(nPos, 1);
2818 }
2819 }
2820
2821
2822 // LB 9 Treat X CM* as if it were x.
2823 // No explicit action required.
2824
2825 // LB 10 Treat any remaining combining mark as AL
2826 if (fCM->contains(*posChar)) {
2827 *posChar = u'A';
2828 }
2829
2830 // Push the updated nextPos and nextChar back to our caller.
2831 // This only makes a difference if posChar got bigger by consuming a
2832 // combining sequence.
2833 *nextPos = nPos;
2834 *nextChar = fText->char32At(nPos);
2835 }
2836
2837
2838
next(int32_t startPos)2839 int32_t RBBILineMonkey::next(int32_t startPos) {
2840 UErrorCode status = U_ZERO_ERROR;
2841 int32_t pos; // Index of the char following a potential break position
2842 UChar32 thisChar; // Character at above position "pos"
2843
2844 int32_t prevPos; // Index of the char preceding a potential break position
2845 UChar32 prevChar; // Character at above position. Note that prevChar
2846 // and thisChar may not be adjacent because combining
2847 // characters between them will be ignored.
2848
2849 int32_t prevPosX2; // Second previous character. Wider context for LB21a.
2850 UChar32 prevCharX2;
2851
2852 int32_t nextPos; // Index of the next character following pos.
2853 // Usually skips over combining marks.
2854 int32_t nextCPPos; // Index of the code point following "pos."
2855 // May point to a combining mark.
2856 int32_t tPos; // temp value.
2857 UChar32 c;
2858
2859 if (U_FAILURE(deferredStatus)) {
2860 return -1;
2861 }
2862
2863 if (startPos >= fText->length()) {
2864 return -1;
2865 }
2866
2867
2868 // Initial values for loop. Loop will run the first time without finding breaks,
2869 // while the invalid values shift out and the "this" and
2870 // "prev" positions are filled in with good values.
2871 pos = prevPos = prevPosX2 = -1; // Invalid value, serves as flag for initial loop iteration.
2872 thisChar = prevChar = prevCharX2 = 0;
2873 nextPos = nextCPPos = startPos;
2874
2875
2876 // Loop runs once per position in the test text, until a break position
2877 // is found.
2878 for (;;) {
2879 prevPosX2 = prevPos;
2880 prevCharX2 = prevChar;
2881
2882 prevPos = pos;
2883 prevChar = thisChar;
2884
2885 pos = nextPos;
2886 thisChar = fText->char32At(pos);
2887
2888 nextCPPos = fText->moveIndex32(pos, 1);
2889 nextPos = nextCPPos;
2890
2891
2892 if (pos >= fText->length()) {
2893 setAppliedRule(pos, "LB2 - Break at end of text.");
2894 break;
2895 }
2896
2897
2898 // We do this one out-of-order because the adjustment does not change anything
2899 // that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
2900 // be applied.
2901 rule9Adjust(prevPos, &prevChar, &pos, &thisChar);
2902 nextCPPos = nextPos = fText->moveIndex32(pos, 1);
2903 c = fText->char32At(nextPos);
2904 rule9Adjust(pos, &thisChar, &nextPos, &c);
2905
2906 // If the loop is still warming up - if we haven't shifted the initial
2907 // -1 positions out of prevPos yet - loop back to advance the
2908 // position in the input without any further looking for breaks.
2909 if (prevPos == -1) {
2910 setAppliedRule(pos, "LB 9 - adjust for combining sequences.");
2911 continue;
2912 }
2913
2914
2915 if (fBK->contains(prevChar)) {
2916 setAppliedRule(pos, "LB 4 Always break after hard line breaks");
2917 break;
2918 }
2919
2920
2921 if (prevChar == 0x0d && thisChar == 0x0a) {
2922 setAppliedRule(pos, "LB 5 Break after CR, LF, NL, but not inside CR LF");
2923 continue;
2924 }
2925 if (prevChar == 0x0d ||
2926 prevChar == 0x0a ||
2927 prevChar == 0x85) {
2928 setAppliedRule(pos, "LB 5 Break after CR, LF, NL, but not inside CR LF");
2929 break;
2930 }
2931
2932
2933 if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
2934 fBK->contains(thisChar)) {
2935 setAppliedRule(pos, "LB 6 Don't break before hard line breaks");
2936 continue;
2937 }
2938
2939
2940 if (fSP->contains(thisChar)) {
2941 setAppliedRule(pos, "LB 7 Don't break before spaces or zero-width space.");
2942 continue;
2943 }
2944
2945 // !!! ??? Is this the right text for the applied rule?
2946 if (fZW->contains(thisChar)) {
2947 setAppliedRule(pos, "LB 7 Don't break before spaces or zero-width space.");
2948 continue;
2949 }
2950
2951
2952 // ZW SP* ÷
2953 // Scan backwards from prevChar for SP* ZW
2954 tPos = prevPos;
2955 while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
2956 tPos = fText->moveIndex32(tPos, -1);
2957 }
2958 if (fZW->contains(fText->char32At(tPos))) {
2959 setAppliedRule(pos, "LB 8 Break after zero width space");
2960 break;
2961 }
2962
2963
2964 // Move this test up, before LB8a, because numbers can match a longer sequence that would
2965 // also match 8a. e.g. NU ZWJ IS PO (ZWJ acts like CM)
2966 if (fNumberMatcher->lookingAt(prevPos, status)) {
2967 if (U_FAILURE(status)) {
2968 setAppliedRule(pos, "LB 25 Numbers");
2969 break;
2970 }
2971 // Matched a number. But could have been just a single digit, which would
2972 // not represent a "no break here" between prevChar and thisChar
2973 int32_t numEndIdx = fNumberMatcher->end(status); // idx of first char following num
2974 if (numEndIdx > pos) {
2975 // Number match includes at least our two chars being checked
2976 if (numEndIdx > nextPos) {
2977 // Number match includes additional chars. Update pos and nextPos
2978 // so that next loop iteration will continue at the end of the number,
2979 // checking for breaks between last char in number & whatever follows.
2980 pos = nextPos = numEndIdx;
2981 do {
2982 pos = fText->moveIndex32(pos, -1);
2983 thisChar = fText->char32At(pos);
2984 } while (fCM->contains(thisChar));
2985 }
2986 setAppliedRule(pos, "LB 25 Numbers");
2987 continue;
2988 }
2989 }
2990
2991
2992 // The monkey test's way of ignoring combining characters doesn't work
2993 // for this rule. ZJ is also a CM. Need to get the actual character
2994 // preceding "thisChar", not ignoring combining marks, possibly ZJ.
2995 {
2996 int32_t prevIdx = fText->moveIndex32(pos, -1);
2997 UChar32 prevC = fText->char32At(prevIdx);
2998 if (fZWJ->contains(prevC)) {
2999 setAppliedRule(pos, "LB 8a ZWJ x");
3000 continue;
3001 }
3002 }
3003
3004
3005 // appliedRule: "LB 9, 10"; // Already done, at top of loop.";
3006 //
3007
3008
3009 // x WJ
3010 // WJ x
3011 //
3012 if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
3013 setAppliedRule(pos, "LB 11 Do not break before or after WORD JOINER and related characters.");
3014 continue;
3015 }
3016
3017
3018 if (fGL->contains(prevChar)) {
3019 setAppliedRule(pos, "LB 12 GL x");
3020 continue;
3021 }
3022
3023
3024 if (!(fSP->contains(prevChar) ||
3025 fBA->contains(prevChar) ||
3026 fHY->contains(prevChar) ) && fGL->contains(thisChar)) {
3027 setAppliedRule(pos, "LB 12a [^SP BA HY] x GL");
3028 continue;
3029 }
3030
3031
3032 if (fCL->contains(thisChar) ||
3033 fCP->contains(thisChar) ||
3034 fEX->contains(thisChar) ||
3035 fSY->contains(thisChar)) {
3036 setAppliedRule(pos, "LB 13 Don't break before closings.");
3037 continue;
3038 }
3039
3040
3041 // Scan backwards, checking for this sequence.
3042 // The OP char could include combining marks, so we actually check for
3043 // OP CM* SP*
3044 // Another Twist: The Rule 9 fixes may have changed a SP CM
3045 // sequence into a ID char, so before scanning back through spaces,
3046 // verify that prevChar is indeed a space. The prevChar variable
3047 // may differ from fText[prevPos]
3048 tPos = prevPos;
3049 if (fSP->contains(prevChar)) {
3050 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3051 tPos=fText->moveIndex32(tPos, -1);
3052 }
3053 }
3054 while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3055 tPos=fText->moveIndex32(tPos, -1);
3056 }
3057 if (fOP->contains(fText->char32At(tPos))) {
3058 setAppliedRule(pos, "LB 14 Don't break after OP SP*");
3059 continue;
3060 }
3061
3062
3063 if (nextPos < fText->length()) {
3064 // note: UnicodeString::char32At(length) returns ffff, not distinguishable
3065 // from a legit ffff character. So test length separately.
3066 UChar32 nextChar = fText->char32At(nextPos);
3067 if (fSP->contains(prevChar) && fIS->contains(thisChar) && fNU->contains(nextChar)) {
3068 setAppliedRule(pos, "LB 14a Break before an IS that begins a number and follows a space");
3069 break;
3070 }
3071 }
3072
3073
3074 if (fIS->contains(thisChar)) {
3075 setAppliedRule(pos, "LB 14b Do not break before numeric separators, even after spaces.");
3076 continue;
3077 }
3078
3079
3080 if (fOP->contains(thisChar)) {
3081 // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
3082 int tPos = prevPos;
3083 while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3084 tPos = fText->moveIndex32(tPos, -1);
3085 }
3086 while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3087 tPos = fText->moveIndex32(tPos, -1);
3088 }
3089 if (fQU->contains(fText->char32At(tPos))) {
3090 setAppliedRule(pos, "LB 15 QU SP* x OP");
3091 continue;
3092 }
3093 }
3094
3095
3096 // Scan backwards for SP* CM* (CL | CP)
3097 if (fNS->contains(thisChar)) {
3098 int tPos = prevPos;
3099 while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3100 tPos = fText->moveIndex32(tPos, -1);
3101 }
3102 while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3103 tPos = fText->moveIndex32(tPos, -1);
3104 }
3105 if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
3106 setAppliedRule(pos, "LB 16 (CL | CP) SP* x NS");
3107 continue;
3108 }
3109 }
3110
3111
3112 if (fB2->contains(thisChar)) {
3113 // Scan backwards, checking for the B2 CM* SP* sequence.
3114 tPos = prevPos;
3115 if (fSP->contains(prevChar)) {
3116 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3117 tPos=fText->moveIndex32(tPos, -1);
3118 }
3119 }
3120 while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3121 tPos=fText->moveIndex32(tPos, -1);
3122 }
3123 if (fB2->contains(fText->char32At(tPos))) {
3124 setAppliedRule(pos, "LB 17 B2 SP* x B2");
3125 continue;
3126 }
3127 }
3128
3129
3130 if (fSP->contains(prevChar)) {
3131 setAppliedRule(pos, "LB 18 break after space");
3132 break;
3133 }
3134
3135 // x QU
3136 // QU x
3137 if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
3138 setAppliedRule(pos, "LB 19");
3139 continue;
3140 }
3141
3142 if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
3143 setAppliedRule(pos, "LB 20 Break around a CB");
3144 break;
3145 }
3146
3147 // Don't break between Hyphens and letters if a break precedes the hyphen.
3148 // Formerly this was a Finnish tailoring.
3149 // Moved to root in ICU 63. This is an ICU customization, not in UAX-14.
3150 // ^($HY | $HH) $AL;
3151 if (fAL->contains(thisChar) && (fHY->contains(prevChar) || fHH->contains(prevChar)) &&
3152 prevPosX2 == -1) {
3153 setAppliedRule(pos, "LB 20.09");
3154 continue;
3155 }
3156
3157 if (fBA->contains(thisChar) ||
3158 fHY->contains(thisChar) ||
3159 fNS->contains(thisChar) ||
3160 fBB->contains(prevChar) ) {
3161 setAppliedRule(pos, "LB 21");
3162 continue;
3163 }
3164
3165 if (fHL->contains(prevCharX2) &&
3166 (fHY->contains(prevChar) || fBA->contains(prevChar))) {
3167 setAppliedRule(pos, "LB 21a HL (HY | BA) x");
3168 continue;
3169 }
3170
3171 if (fSY->contains(prevChar) && fHL->contains(thisChar)) {
3172 setAppliedRule(pos, "LB 21b SY x HL");
3173 continue;
3174 }
3175
3176 if (fIN->contains(thisChar)) {
3177 setAppliedRule(pos, "LB 22");
3178 continue;
3179 }
3180
3181
3182 // (AL | HL) x NU
3183 // NU x (AL | HL)
3184 if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && fNU->contains(thisChar)) {
3185 setAppliedRule(pos, "LB 23");
3186 continue;
3187 }
3188 if (fNU->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3189 setAppliedRule(pos, "LB 23");
3190 continue;
3191 }
3192
3193 // Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes.
3194 // PR x (ID | EB | EM)
3195 // (ID | EB | EM) x PO
3196 if (fPR->contains(prevChar) &&
3197 (fID->contains(thisChar) || fEB->contains(thisChar) || fEM->contains(thisChar))) {
3198 setAppliedRule(pos, "LB 23a");
3199 continue;
3200 }
3201 if ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) &&
3202 fPO->contains(thisChar)) {
3203 setAppliedRule(pos, "LB 23a");
3204 continue;
3205 }
3206
3207 // Do not break between prefix and letters or ideographs.
3208 // (PR | PO) x (AL | HL)
3209 // (AL | HL) x (PR | PO)
3210 if ((fPR->contains(prevChar) || fPO->contains(prevChar)) &&
3211 (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3212 setAppliedRule(pos, "LB 24 no break between prefix and letters or ideographs");
3213 continue;
3214 }
3215 if ((fAL->contains(prevChar) || fHL->contains(prevChar)) &&
3216 (fPR->contains(thisChar) || fPO->contains(thisChar))) {
3217 setAppliedRule(pos, "LB 24 no break between prefix and letters or ideographs");
3218 continue;
3219 }
3220
3221 // appliedRule: "LB 25 numbers match"; // moved up, before LB 8a,
3222
3223 if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
3224 fJV->contains(thisChar) ||
3225 fH2->contains(thisChar) ||
3226 fH3->contains(thisChar))) {
3227 setAppliedRule(pos, "LB 26 Do not break a Korean syllable.");
3228 continue;
3229 }
3230
3231 if ((fJV->contains(prevChar) || fH2->contains(prevChar)) &&
3232 (fJV->contains(thisChar) || fJT->contains(thisChar))) {
3233 setAppliedRule(pos, "LB 26 Do not break a Korean syllable.");
3234 continue;
3235 }
3236
3237 if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
3238 fJT->contains(thisChar)) {
3239 setAppliedRule(pos, "LB 26 Do not break a Korean syllable.");
3240 continue;
3241 }
3242
3243 if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3244 fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3245 fPO->contains(thisChar)) {
3246 setAppliedRule(pos, "LB 27 Treat a Korean Syllable Block the same as ID.");
3247 continue;
3248 }
3249 if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
3250 fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
3251 setAppliedRule(pos, "LB 27 Treat a Korean Syllable Block the same as ID.");
3252 continue;
3253 }
3254
3255
3256
3257 if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3258 setAppliedRule(pos, "LB 28 Do not break between alphabetics (\"at\").");
3259 continue;
3260 }
3261
3262 if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3263 setAppliedRule(pos, "LB 29 Do not break between numeric punctuation and alphabetics (\"e.g.\").");
3264 continue;
3265 }
3266
3267 // (AL | NU) x OP
3268 // CP x (AL | NU)
3269 if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP30->contains(thisChar)) {
3270 setAppliedRule(pos, "LB 30 No break in letters, numbers, or ordinary symbols, opening/closing punctuation.");
3271 continue;
3272 }
3273 if (fCP30->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) {
3274 setAppliedRule(pos, "LB 30 No break in letters, numbers, or ordinary symbols, opening/closing punctuation.");
3275 continue;
3276 }
3277
3278 // RI x RI
3279 if (fRI->contains(prevCharX2) && fRI->contains(prevChar) && fRI->contains(thisChar)) {
3280 setAppliedRule(pos, "LB30a RI RI ÷ RI");
3281 break;
3282 }
3283 if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
3284 // Two Regional Indicators have been paired.
3285 // Over-write the trailing one (thisChar) to prevent it from forming another pair with a
3286 // following RI. This is a hack.
3287 thisChar = -1;
3288 setAppliedRule(pos, "LB30a RI RI ÷ RI");
3289 continue;
3290 }
3291
3292 // LB30b Do not break between an emoji base (or potential emoji) and an emoji modifier.
3293 if (fEB->contains(prevChar) && fEM->contains(thisChar)) {
3294 setAppliedRule(pos, "LB30b Emoji Base x Emoji Modifier");
3295 continue;
3296 }
3297
3298 if (fExtPictUnassigned->contains(prevChar) && fEM->contains(thisChar)) {
3299 setAppliedRule(pos, "LB30b [\\p{Extended_Pictographic}&\\p{Cn}] × EM");
3300 continue;
3301 }
3302
3303 setAppliedRule(pos, "LB 31 Break everywhere else");
3304 break;
3305 }
3306
3307 return pos;
3308 }
3309
3310
charClasses()3311 UVector *RBBILineMonkey::charClasses() {
3312 return fSets;
3313 }
3314
3315
~RBBILineMonkey()3316 RBBILineMonkey::~RBBILineMonkey() {
3317 delete fSets;
3318
3319 delete fBK;
3320 delete fCR;
3321 delete fLF;
3322 delete fCM;
3323 delete fNL;
3324 delete fWJ;
3325 delete fZW;
3326 delete fGL;
3327 delete fCB;
3328 delete fSP;
3329 delete fB2;
3330 delete fBA;
3331 delete fBB;
3332 delete fHH;
3333 delete fHY;
3334 delete fH2;
3335 delete fH3;
3336 delete fCL;
3337 delete fCP;
3338 delete fEX;
3339 delete fIN;
3340 delete fJL;
3341 delete fJV;
3342 delete fJT;
3343 delete fNS;
3344 delete fOP;
3345 delete fQU;
3346 delete fIS;
3347 delete fNU;
3348 delete fPO;
3349 delete fPR;
3350 delete fSY;
3351 delete fAI;
3352 delete fAL;
3353 delete fCJ;
3354 delete fHL;
3355 delete fID;
3356 delete fRI;
3357 delete fSG;
3358 delete fXX;
3359 delete fEB;
3360 delete fEM;
3361 delete fZWJ;
3362 delete fOP30;
3363 delete fCP30;
3364 delete fExtPictUnassigned;
3365
3366 delete fCharBI;
3367 delete fNumberMatcher;
3368 }
3369
3370
3371 //-------------------------------------------------------------------------------------------
3372 //
3373 // TestMonkey
3374 //
3375 // params
3376 // seed=nnnnn Random number starting seed.
3377 // Setting the seed allows errors to be reproduced.
3378 // loop=nnn Looping count. Controls running time.
3379 // -1: run forever.
3380 // 0 or greater: run length.
3381 //
3382 // type = char | word | line | sent | title
3383 //
3384 // Example:
3385 // intltest rbbi/RBBITest/TestMonkey@"type=line loop=-1"
3386 //
3387 //-------------------------------------------------------------------------------------------
3388
getIntParam(UnicodeString name,UnicodeString & params,int32_t defaultVal)3389 static int32_t getIntParam(UnicodeString name, UnicodeString ¶ms, int32_t defaultVal) {
3390 int32_t val = defaultVal;
3391 name.append(" *= *(-?\\d+)");
3392 UErrorCode status = U_ZERO_ERROR;
3393 RegexMatcher m(name, params, 0, status);
3394 if (m.find()) {
3395 // The param exists. Convert the string to an int.
3396 char valString[100];
3397 int32_t paramLength = m.end(1, status) - m.start(1, status);
3398 if (paramLength >= (int32_t)(sizeof(valString)-1)) {
3399 paramLength = (int32_t)(sizeof(valString)-2);
3400 }
3401 params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
3402 val = strtol(valString, NULL, 10);
3403
3404 // Delete this parameter from the params string.
3405 m.reset();
3406 params = m.replaceFirst("", status);
3407 }
3408 U_ASSERT(U_SUCCESS(status));
3409 return val;
3410 }
3411 #endif
3412
3413 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
testBreakBoundPreceding(RBBITest * test,UnicodeString ustr,BreakIterator * bi,int expected[],int expectedcount)3414 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
3415 BreakIterator *bi,
3416 int expected[],
3417 int expectedcount)
3418 {
3419 int count = 0;
3420 int i = 0;
3421 int forward[50];
3422 bi->setText(ustr);
3423 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3424 forward[count] = i;
3425 if (count < expectedcount && expected[count] != i) {
3426 test->errln("%s:%d break forward test failed: expected %d but got %d",
3427 __FILE__, __LINE__, expected[count], i);
3428 break;
3429 }
3430 count ++;
3431 }
3432 if (count != expectedcount) {
3433 printStringBreaks(ustr, expected, expectedcount);
3434 test->errln("%s:%d break forward test failed: missed %d match",
3435 __FILE__, __LINE__, expectedcount - count);
3436 return;
3437 }
3438 // testing boundaries
3439 for (i = 1; i < expectedcount; i ++) {
3440 int j = expected[i - 1];
3441 if (!bi->isBoundary(j)) {
3442 printStringBreaks(ustr, expected, expectedcount);
3443 test->errln("%s:%d isBoundary() failed. Expected boundary at position %d",
3444 __FILE__, __LINE__, j);
3445 return;
3446 }
3447 for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
3448 if (bi->isBoundary(j)) {
3449 printStringBreaks(ustr, expected, expectedcount);
3450 test->errln("%s:%d isBoundary() failed. Not expecting boundary at position %d",
3451 __FILE__, __LINE__, j);
3452 return;
3453 }
3454 }
3455 }
3456
3457 for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
3458 count --;
3459 if (forward[count] != i) {
3460 printStringBreaks(ustr, expected, expectedcount);
3461 test->errln("%s:%d happy break test previous() failed: expected %d but got %d",
3462 __FILE__, __LINE__, forward[count], i);
3463 break;
3464 }
3465 }
3466 if (count != 0) {
3467 printStringBreaks(ustr, expected, expectedcount);
3468 test->errln("break test previous() failed: missed a match");
3469 return;
3470 }
3471
3472 // testing preceding
3473 for (i = 0; i < expectedcount - 1; i ++) {
3474 // int j = expected[i] + 1;
3475 int j = ustr.moveIndex32(expected[i], 1);
3476 for (; j <= expected[i + 1]; j ++) {
3477 int32_t expectedPreceding = expected[i];
3478 int32_t actualPreceding = bi->preceding(j);
3479 if (actualPreceding != expectedPreceding) {
3480 printStringBreaks(ustr, expected, expectedcount);
3481 test->errln("%s:%d preceding(%d): expected %d, got %d",
3482 __FILE__, __LINE__, j, expectedPreceding, actualPreceding);
3483 return;
3484 }
3485 }
3486 }
3487 }
3488 #endif
3489
TestWordBreaks(void)3490 void RBBITest::TestWordBreaks(void)
3491 {
3492 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3493
3494 Locale locale("en");
3495 UErrorCode status = U_ZERO_ERROR;
3496 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3497 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3498 // Replaced any C+J characters in a row with a random sequence of characters
3499 // of the same length to make our C+J segmentation not get in the way.
3500 static const char *strlist[] =
3501 {
3502 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3503 "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
3504 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
3505 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3506 "\\uac00\\u3588\\u009c\\u0953\\u194b",
3507 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3508 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3509 "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
3510 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3511 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3512 "\\u2027\\U000e0067\\u0a47\\u00b7",
3513 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3514 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3515 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3516 "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
3517 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3518 "\\u0027\\u11af\\U000e0057\\u0602",
3519 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3520 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3521 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3522 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3523 "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3524 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3525 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3526 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3527 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3528 "\\u18f4\\U000e0049\\u20e7\\u2027",
3529 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3530 "\\ua183\\u102d\\u0bec\\u003a",
3531 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3532 "\\u003a\\u0e57\\u0fad\\u002e",
3533 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3534 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3535 "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3536 "\\u003a\\u0664\\u00b7\\u1fba",
3537 "\\u003b\\u0027\\u00b7\\u47a3",
3538 "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
3539 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3540 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3541 };
3542 int loop;
3543 if (U_FAILURE(status)) {
3544 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3545 return;
3546 }
3547 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3548 // printf("looping %d\n", loop);
3549 UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
3550 // RBBICharMonkey monkey;
3551 RBBIWordMonkey monkey;
3552
3553 int expected[50];
3554 int expectedcount = 0;
3555
3556 monkey.setText(ustr);
3557 int i;
3558 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3559 expected[expectedcount ++] = i;
3560 }
3561
3562 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3563 }
3564 delete bi;
3565 #endif
3566 }
3567
TestWordBoundary(void)3568 void RBBITest::TestWordBoundary(void)
3569 {
3570 // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3571 Locale locale("en");
3572 UErrorCode status = U_ZERO_ERROR;
3573 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3574 LocalPointer<BreakIterator> bi(BreakIterator::createWordInstance(locale, status), status);
3575 if (U_FAILURE(status)) {
3576 errcheckln(status, "%s:%d Creation of break iterator failed %s",
3577 __FILE__, __LINE__, u_errorName(status));
3578 return;
3579 }
3580 UChar str[50];
3581 static const char *strlist[] =
3582 {
3583 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3584 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3585 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3586 "\\u2027\\U000e0067\\u0a47\\u00b7",
3587 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3588 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3589 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3590 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3591 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3592 "\\u0027\\u11af\\U000e0057\\u0602",
3593 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3594 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3595 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3596 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3597 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3598 "\\U000e0065\\u302c\\u09ee\\U000e0068",
3599 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3600 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3601 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3602 "\\u58f4\\U000e0049\\u20e7\\u2027",
3603 "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3604 "\\ua183\\u102d\\u0bec\\u003a",
3605 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3606 "\\u003a\\u0e57\\u0fad\\u002e",
3607 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3608 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3609 "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3610 "\\u003a\\u0664\\u00b7\\u1fba",
3611 "\\u003b\\u0027\\u00b7\\u47a3",
3612 };
3613 int loop;
3614 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3615 u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
3616 UnicodeString ustr(str);
3617 int forward[50];
3618 int count = 0;
3619
3620 bi->setText(ustr);
3621 int prev = -1;
3622 for (int32_t boundary = bi->first(); boundary != BreakIterator::DONE; boundary = bi->next()) {
3623 ++count;
3624 if (count >= UPRV_LENGTHOF(forward)) {
3625 errln("%s:%d too many breaks found. (loop, count, boundary) = (%d, %d, %d)",
3626 __FILE__, __LINE__, loop, count, boundary);
3627 return;
3628 }
3629 forward[count] = boundary;
3630 if (boundary <= prev) {
3631 errln("%s:%d bi::next() did not advance. (loop, prev, boundary) = (%d, %d, %d)\n",
3632 __FILE__, __LINE__, loop, prev, boundary);
3633 break;
3634 }
3635 for (int32_t nonBoundary = prev + 1; nonBoundary < boundary; nonBoundary ++) {
3636 if (bi->isBoundary(nonBoundary)) {
3637 printStringBreaks(ustr, forward, count);
3638 errln("%s:%d isBoundary(nonBoundary) failed. (loop, prev, nonBoundary, boundary) = (%d, %d, %d, %d)",
3639 __FILE__, __LINE__, loop, prev, nonBoundary, boundary);
3640 return;
3641 }
3642 }
3643 if (!bi->isBoundary(boundary)) {
3644 printStringBreaks(ustr, forward, count);
3645 errln("%s:%d happy boundary test failed: expected %d a boundary",
3646 __FILE__, __LINE__, boundary);
3647 return;
3648 }
3649 prev = boundary;
3650 }
3651 }
3652 }
3653
TestLineBreaks(void)3654 void RBBITest::TestLineBreaks(void)
3655 {
3656 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3657 Locale locale("en");
3658 UErrorCode status = U_ZERO_ERROR;
3659 BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3660 const int32_t STRSIZE = 50;
3661 UChar str[STRSIZE];
3662 static const char *strlist[] =
3663 {
3664 "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
3665 "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
3666 "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
3667 "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
3668 "u2014\\U000e0105\\u118c\\u000a\\u07f8",
3669 "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
3670 "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3671 "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
3672 "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3673 "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
3674 "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
3675 "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
3676 "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
3677 "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
3678 "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
3679 "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
3680 "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
3681 "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
3682 "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
3683 "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
3684 "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
3685 "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
3686 "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
3687 "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
3688 "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
3689 "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
3690 "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
3691 "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
3692 "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
3693 "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
3694 "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
3695 "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
3696 "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
3697 "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
3698 "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
3699 "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
3700 "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
3701 "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
3702 "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
3703 "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
3704 };
3705 int loop;
3706 TEST_ASSERT_SUCCESS(status);
3707 if (U_FAILURE(status)) {
3708 return;
3709 }
3710 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3711 // printf("looping %d\n", loop);
3712 int32_t t = u_unescape(strlist[loop], str, STRSIZE);
3713 if (t >= STRSIZE) {
3714 TEST_ASSERT(FALSE);
3715 continue;
3716 }
3717
3718
3719 UnicodeString ustr(str);
3720 RBBILineMonkey monkey;
3721 if (U_FAILURE(monkey.deferredStatus)) {
3722 continue;
3723 }
3724
3725 const int EXPECTEDSIZE = 50;
3726 int expected[EXPECTEDSIZE];
3727 int expectedcount = 0;
3728
3729 monkey.setText(ustr);
3730
3731 int i;
3732 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3733 if (expectedcount >= EXPECTEDSIZE) {
3734 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3735 return;
3736 }
3737 expected[expectedcount ++] = i;
3738 }
3739
3740 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3741 }
3742 delete bi;
3743 #endif
3744 }
3745
TestSentBreaks(void)3746 void RBBITest::TestSentBreaks(void)
3747 {
3748 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3749 Locale locale("en");
3750 UErrorCode status = U_ZERO_ERROR;
3751 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
3752 UChar str[200];
3753 static const char *strlist[] =
3754 {
3755 "Now\ris\nthe\r\ntime\n\rfor\r\r",
3756 "This\n",
3757 "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
3758 "\"Sentence ending with a quote.\" Bye.",
3759 " (This is it). Testing the sentence iterator. \"This isn't it.\"",
3760 "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
3761 "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
3762 "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
3763 "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
3764 "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
3765 "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
3766 "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
3767 "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
3768 "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
3769 "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
3770 "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
3771 "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
3772 "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
3773 "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
3774 "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
3775 };
3776 int loop;
3777 if (U_FAILURE(status)) {
3778 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3779 return;
3780 }
3781 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3782 u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
3783 UnicodeString ustr(str);
3784
3785 RBBISentMonkey monkey;
3786 if (U_FAILURE(monkey.deferredStatus)) {
3787 continue;
3788 }
3789
3790 const int EXPECTEDSIZE = 50;
3791 int expected[EXPECTEDSIZE];
3792 int expectedcount = 0;
3793
3794 monkey.setText(ustr);
3795
3796 int i;
3797 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3798 if (expectedcount >= EXPECTEDSIZE) {
3799 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3800 return;
3801 }
3802 expected[expectedcount ++] = i;
3803 }
3804
3805 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3806 }
3807 delete bi;
3808 #endif
3809 }
3810
TestMonkey()3811 void RBBITest::TestMonkey() {
3812 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3813
3814 UErrorCode status = U_ZERO_ERROR;
3815 int32_t loopCount = 500;
3816 int32_t seed = 1;
3817 UnicodeString breakType = "all";
3818 Locale locale("en");
3819 UBool useUText = FALSE;
3820
3821 if (quick == FALSE) {
3822 loopCount = 10000;
3823 }
3824
3825 if (fTestParams) {
3826 UnicodeString p(fTestParams);
3827 loopCount = getIntParam("loop", p, loopCount);
3828 seed = getIntParam("seed", p, seed);
3829
3830 RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
3831 if (m.find()) {
3832 breakType = m.group(1, status);
3833 m.reset();
3834 p = m.replaceFirst("", status);
3835 }
3836
3837 RegexMatcher u(" *utext", p, 0, status);
3838 if (u.find()) {
3839 useUText = TRUE;
3840 u.reset();
3841 p = u.replaceFirst("", status);
3842 }
3843
3844
3845 // m.reset(p);
3846 if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
3847 // Each option is stripped out of the option string as it is processed.
3848 // All options have been checked. The option string should have been completely emptied..
3849 char buf[100];
3850 p.extract(buf, sizeof(buf), NULL, status);
3851 buf[sizeof(buf)-1] = 0;
3852 errln("Unrecognized or extra parameter: %s\n", buf);
3853 return;
3854 }
3855
3856 }
3857
3858 if (breakType == "char" || breakType == "all") {
3859 RBBICharMonkey m;
3860 BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3861 if (U_SUCCESS(status)) {
3862 RunMonkey(bi, m, "char", seed, loopCount, useUText);
3863 if (breakType == "all" && useUText==FALSE) {
3864 // Also run a quick test with UText when "all" is specified
3865 RunMonkey(bi, m, "char", seed, loopCount, TRUE);
3866 }
3867 }
3868 else {
3869 errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
3870 }
3871 delete bi;
3872 }
3873
3874 if (breakType == "word" || breakType == "all") {
3875 logln("Word Break Monkey Test");
3876 RBBIWordMonkey m;
3877 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3878 if (U_SUCCESS(status)) {
3879 RunMonkey(bi, m, "word", seed, loopCount, useUText);
3880 }
3881 else {
3882 errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
3883 }
3884 delete bi;
3885 }
3886
3887 if (breakType == "line" || breakType == "all") {
3888 logln("Line Break Monkey Test");
3889 RBBILineMonkey m;
3890 BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3891 if (loopCount >= 10) {
3892 loopCount = loopCount / 5; // Line break runs slower than the others.
3893 }
3894 if (U_SUCCESS(status)) {
3895 RunMonkey(bi, m, "line", seed, loopCount, useUText);
3896 }
3897 else {
3898 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
3899 }
3900 delete bi;
3901 }
3902
3903 if (breakType == "sent" || breakType == "all" ) {
3904 logln("Sentence Break Monkey Test");
3905 RBBISentMonkey m;
3906 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
3907 if (loopCount >= 10) {
3908 loopCount = loopCount / 10; // Sentence runs slower than the other break types
3909 }
3910 if (U_SUCCESS(status)) {
3911 RunMonkey(bi, m, "sent", seed, loopCount, useUText);
3912 }
3913 else {
3914 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
3915 }
3916 delete bi;
3917 }
3918
3919 #endif
3920 }
3921
3922 //
3923 // Run a RBBI monkey test. Common routine, for all break iterator types.
3924 // Parameters:
3925 // bi - the break iterator to use
3926 // mk - MonkeyKind, abstraction for obtaining expected results
3927 // name - Name of test (char, word, etc.) for use in error messages
3928 // seed - Seed for starting random number generator (parameter from user)
3929 // numIterations
3930 //
RunMonkey(BreakIterator * bi,RBBIMonkeyKind & mk,const char * name,uint32_t seed,int32_t numIterations,UBool useUText)3931 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t seed,
3932 int32_t numIterations, UBool useUText) {
3933
3934 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3935
3936 const int32_t TESTSTRINGLEN = 500;
3937 UnicodeString testText;
3938 int32_t numCharClasses;
3939 UVector *chClasses;
3940 int expectedCount = 0;
3941 char expectedBreaks[TESTSTRINGLEN*2 + 1];
3942 char forwardBreaks[TESTSTRINGLEN*2 + 1];
3943 char reverseBreaks[TESTSTRINGLEN*2+1];
3944 char isBoundaryBreaks[TESTSTRINGLEN*2+1];
3945 char followingBreaks[TESTSTRINGLEN*2+1];
3946 char precedingBreaks[TESTSTRINGLEN*2+1];
3947 int i;
3948 int loopCount = 0;
3949
3950
3951 m_seed = seed;
3952
3953 numCharClasses = mk.charClasses()->size();
3954 chClasses = mk.charClasses();
3955
3956 // Check for errors that occurred during the construction of the MonkeyKind object.
3957 // Can't report them where they occurred because errln() is a method coming from intlTest,
3958 // and is not visible outside of RBBITest :-(
3959 if (U_FAILURE(mk.deferredStatus)) {
3960 errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
3961 return;
3962 }
3963
3964 // Verify that the character classes all have at least one member.
3965 for (i=0; i<numCharClasses; i++) {
3966 UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
3967 if (s == NULL || s->size() == 0) {
3968 errln("Character Class #%d is null or of zero size.", i);
3969 return;
3970 }
3971 }
3972
3973 // For minimizing width of class name output.
3974 int classNameSize = mk.maxClassNameSize();
3975
3976 while (loopCount < numIterations || numIterations == -1) {
3977 if (numIterations == -1 && loopCount % 10 == 0) {
3978 // If test is running in an infinite loop, display a periodic tic so
3979 // we can tell that it is making progress.
3980 fprintf(stderr, ".");
3981 }
3982 // Save current random number seed, so that we can recreate the random numbers
3983 // for this loop iteration in event of an error.
3984 seed = m_seed;
3985
3986 // Populate a test string with data.
3987 testText.truncate(0);
3988 for (i=0; i<TESTSTRINGLEN; i++) {
3989 int32_t aClassNum = m_rand() % numCharClasses;
3990 UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
3991 int32_t charIdx = m_rand() % classSet->size();
3992 UChar32 c = classSet->charAt(charIdx);
3993 if (c < 0) { // TODO: deal with sets containing strings.
3994 errln("%s:%d c < 0", __FILE__, __LINE__);
3995 break;
3996 }
3997 // Do not assemble a supplementary character from randomly generated separate surrogates.
3998 // (It could be a dictionary character)
3999 if (U16_IS_TRAIL(c) && testText.length() > 0 && U16_IS_LEAD(testText.charAt(testText.length()-1))) {
4000 continue;
4001 }
4002
4003 testText.append(c);
4004 }
4005
4006 // Calculate the expected results for this test string and reset applied rules.
4007 mk.setText(testText);
4008
4009 memset(expectedBreaks, 0, sizeof(expectedBreaks));
4010 expectedBreaks[0] = 1;
4011 int32_t breakPos = 0;
4012 expectedCount = 0;
4013 for (;;) {
4014 breakPos = mk.next(breakPos);
4015 if (breakPos == -1) {
4016 break;
4017 }
4018 if (breakPos > testText.length()) {
4019 errln("breakPos > testText.length()");
4020 }
4021 expectedBreaks[breakPos] = 1;
4022 expectedCount++;
4023 U_ASSERT(expectedCount<testText.length());
4024 }
4025
4026 // Find the break positions using forward iteration
4027 memset(forwardBreaks, 0, sizeof(forwardBreaks));
4028 if (useUText) {
4029 UErrorCode status = U_ZERO_ERROR;
4030 UText *testUText = utext_openReplaceable(NULL, &testText, &status);
4031 // testUText = utext_openUnicodeString(testUText, &testText, &status);
4032 bi->setText(testUText, status);
4033 TEST_ASSERT_SUCCESS(status);
4034 utext_close(testUText); // The break iterator does a shallow clone of the UText
4035 // This UText can be closed immediately, so long as the
4036 // testText string continues to exist.
4037 } else {
4038 bi->setText(testText);
4039 }
4040
4041 for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
4042 if (i < 0 || i > testText.length()) {
4043 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4044 break;
4045 }
4046 forwardBreaks[i] = 1;
4047 }
4048
4049 // Find the break positions using reverse iteration
4050 memset(reverseBreaks, 0, sizeof(reverseBreaks));
4051 for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
4052 if (i < 0 || i > testText.length()) {
4053 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4054 break;
4055 }
4056 reverseBreaks[i] = 1;
4057 }
4058
4059 // Find the break positions using isBoundary() tests.
4060 memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
4061 U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
4062 for (i=0; i<=testText.length(); i++) {
4063 isBoundaryBreaks[i] = bi->isBoundary(i);
4064 }
4065
4066
4067 // Find the break positions using the following() function.
4068 // printf(".");
4069 memset(followingBreaks, 0, sizeof(followingBreaks));
4070 int32_t lastBreakPos = 0;
4071 followingBreaks[0] = 1;
4072 for (i=0; i<testText.length(); i++) {
4073 breakPos = bi->following(i);
4074 if (breakPos <= i ||
4075 breakPos < lastBreakPos ||
4076 breakPos > testText.length() ||
4077 (breakPos > lastBreakPos && lastBreakPos > i)) {
4078 errln("%s break monkey test: "
4079 "Out of range value returned by BreakIterator::following().\n"
4080 "Random seed=%d index=%d; following returned %d; lastbreak=%d",
4081 name, seed, i, breakPos, lastBreakPos);
4082 break;
4083 }
4084 followingBreaks[breakPos] = 1;
4085 lastBreakPos = breakPos;
4086 }
4087
4088 // Find the break positions using the preceding() function.
4089 memset(precedingBreaks, 0, sizeof(precedingBreaks));
4090 lastBreakPos = testText.length();
4091 precedingBreaks[testText.length()] = 1;
4092 for (i=testText.length(); i>0; i--) {
4093 breakPos = bi->preceding(i);
4094 if (breakPos >= i ||
4095 breakPos > lastBreakPos ||
4096 (breakPos < 0 && testText.getChar32Start(i)>0) ||
4097 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
4098 errln("%s break monkey test: "
4099 "Out of range value returned by BreakIterator::preceding().\n"
4100 "index=%d; prev returned %d; lastBreak=%d" ,
4101 name, i, breakPos, lastBreakPos);
4102 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
4103 precedingBreaks[i] = 2; // Forces an error.
4104 }
4105 } else {
4106 if (breakPos >= 0) {
4107 precedingBreaks[breakPos] = 1;
4108 }
4109 lastBreakPos = breakPos;
4110 }
4111 }
4112
4113 // Compare the expected and actual results.
4114 for (i=0; i<=testText.length(); i++) {
4115 const char *errorType = NULL;
4116 const char* currentBreakData = NULL;
4117 if (forwardBreaks[i] != expectedBreaks[i]) {
4118 errorType = "next()";
4119 currentBreakData = forwardBreaks;
4120 } else if (reverseBreaks[i] != forwardBreaks[i]) {
4121 errorType = "previous()";
4122 currentBreakData = reverseBreaks;
4123 } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
4124 errorType = "isBoundary()";
4125 currentBreakData = isBoundaryBreaks;
4126 } else if (followingBreaks[i] != expectedBreaks[i]) {
4127 errorType = "following()";
4128 currentBreakData = followingBreaks;
4129 } else if (precedingBreaks[i] != expectedBreaks[i]) {
4130 errorType = "preceding()";
4131 currentBreakData = precedingBreaks;
4132 }
4133
4134 if (errorType != NULL) {
4135 // Format a range of the test text that includes the failure as
4136 // a data item that can be included in the rbbi test data file.
4137
4138 // Start of the range is the last point where expected and actual results
4139 // both agreed that there was a break position.
4140
4141 int startContext = i;
4142 int32_t count = 0;
4143 for (;;) {
4144 if (startContext==0) { break; }
4145 startContext --;
4146 if (expectedBreaks[startContext] != 0) {
4147 if (count == 2) break;
4148 count ++;
4149 }
4150 }
4151
4152 // End of range is two expected breaks past the start position.
4153 int endContext = i + 1;
4154 int ci;
4155 for (ci=0; ci<2; ci++) { // Number of items to include in error text.
4156 for (;;) {
4157 if (endContext >= testText.length()) {break;}
4158 if (expectedBreaks[endContext-1] != 0) {
4159 if (count == 0) break;
4160 count --;
4161 }
4162 endContext ++;
4163 }
4164 }
4165
4166 // Formatting of each line includes:
4167 // character code
4168 // reference break: '|' -> a break, '.' -> no break
4169 // actual break: '|' -> a break, '.' -> no break
4170 // (name of character clase)
4171 // Unicode name of character
4172 // '-->' indicates location of the difference.
4173
4174 MONKEY_ERROR(
4175 (expectedBreaks[i] ? "Break expected but not found" :
4176 "Break found but not expected"),
4177 name, i, seed);
4178
4179 for (ci=startContext; (ci = testText.moveIndex32(ci, 1));) {
4180 UChar32 c;
4181 c = testText.char32At(ci);
4182
4183 std::string currentLineFlag = " ";
4184 if (ci == i) {
4185 currentLineFlag = "-->"; // Error position
4186 }
4187
4188 // BMP or SMP character in hex
4189 char hexCodePoint[12];
4190 std::string format = " \\u%04x";
4191 if (c >= 0x10000) {
4192 format = "\\U%08x";
4193 }
4194 sprintf(hexCodePoint, format.c_str(), c);
4195
4196 // Get the class name and character name for the character.
4197 char cName[200];
4198 UErrorCode status = U_ZERO_ERROR;
4199 u_charName(c, U_EXTENDED_CHAR_NAME, cName, sizeof(cName), &status);
4200
4201 char buffer[200];
4202 auto ret = snprintf(buffer, UPRV_LENGTHOF(buffer),
4203 "%4s %3i : %1s %1s %10s %-*s %-40s %-40s",
4204 currentLineFlag.c_str(),
4205 ci,
4206 expectedBreaks[ci] == 0 ? "." : "|", // Reference break
4207 currentBreakData[ci] == 0 ? "." : "|", // Actual break
4208 hexCodePoint,
4209 classNameSize,
4210 mk.classNameFromCodepoint(c).c_str(),
4211 mk.getAppliedRule(ci).c_str(), cName);
4212 (void)ret;
4213 U_ASSERT(0 <= ret && ret < UPRV_LENGTHOF(buffer));
4214
4215 // Output the error
4216 if (ci == i) {
4217 errln(buffer);
4218 } else {
4219 infoln(buffer);
4220 }
4221
4222 if (ci >= endContext) { break; }
4223 }
4224 break;
4225 }
4226 }
4227
4228 loopCount++;
4229 }
4230 #endif
4231 }
4232
4233
4234 // Bug 5532. UTF-8 based UText fails in dictionary code.
4235 // This test checks the initial patch,
4236 // which is to just keep it from crashing. Correct word boundaries
4237 // await a proper fix to the dictionary code.
4238 //
TestBug5532(void)4239 void RBBITest::TestBug5532(void) {
4240 // Text includes a mixture of Thai and Latin.
4241 const unsigned char utf8Data[] = {
4242 0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
4243 0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
4244 0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
4245 0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
4246 0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
4247 0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
4248 0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
4249 0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
4250 0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
4251 0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
4252 0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
4253
4254 UErrorCode status = U_ZERO_ERROR;
4255 UText utext=UTEXT_INITIALIZER;
4256 utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);
4257 TEST_ASSERT_SUCCESS(status);
4258
4259 BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
4260 TEST_ASSERT_SUCCESS(status);
4261 if (U_SUCCESS(status)) {
4262 bi->setText(&utext, status);
4263 TEST_ASSERT_SUCCESS(status);
4264
4265 int32_t breakCount = 0;
4266 int32_t previousBreak = -1;
4267 for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
4268 // For now, just make sure that the break iterator doesn't hang.
4269 TEST_ASSERT(previousBreak < bi->current());
4270 previousBreak = bi->current();
4271 }
4272 TEST_ASSERT(breakCount > 0);
4273 }
4274 delete bi;
4275 utext_close(&utext);
4276 }
4277
4278
TestBug9983(void)4279 void RBBITest::TestBug9983(void) {
4280 UnicodeString text = UnicodeString("\\u002A" // * Other
4281 "\\uFF65" // Other
4282 "\\u309C" // Katakana
4283 "\\uFF9F" // Extend
4284 "\\uFF65" // Other
4285 "\\u0020" // Other
4286 "\\u0000").unescape();
4287
4288 UErrorCode status = U_ZERO_ERROR;
4289 LocalPointer<RuleBasedBreakIterator> brkiter(static_cast<RuleBasedBreakIterator *>(
4290 BreakIterator::createWordInstance(Locale::getRoot(), status)));
4291 TEST_ASSERT_SUCCESS(status);
4292 LocalPointer<RuleBasedBreakIterator> brkiterPOSIX(static_cast<RuleBasedBreakIterator *>(
4293 BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status)));
4294 TEST_ASSERT_SUCCESS(status);
4295 if (U_FAILURE(status)) {
4296 return;
4297 }
4298 int32_t offset, rstatus, iterationCount;
4299
4300 brkiter->setText(text);
4301 brkiter->last();
4302 iterationCount = 0;
4303 while ( (offset = brkiter->previous()) != UBRK_DONE ) {
4304 iterationCount++;
4305 rstatus = brkiter->getRuleStatus();
4306 (void)rstatus; // Suppress set but not used warning.
4307 if (iterationCount >= 10) {
4308 break;
4309 }
4310 }
4311 TEST_ASSERT(iterationCount == 6);
4312
4313 brkiterPOSIX->setText(text);
4314 brkiterPOSIX->last();
4315 iterationCount = 0;
4316 while ( (offset = brkiterPOSIX->previous()) != UBRK_DONE ) {
4317 iterationCount++;
4318 rstatus = brkiterPOSIX->getRuleStatus();
4319 (void)rstatus; // Suppress set but not used warning.
4320 if (iterationCount >= 10) {
4321 break;
4322 }
4323 }
4324 TEST_ASSERT(iterationCount == 6);
4325 }
4326
4327 // Bug 7547 - verify that building a break itereator from empty rules produces an error.
4328 //
TestBug7547()4329 void RBBITest::TestBug7547() {
4330 UnicodeString rules;
4331 UErrorCode status = U_ZERO_ERROR;
4332 UParseError parseError;
4333 RuleBasedBreakIterator breakIterator(rules, parseError, status);
4334 if (status != U_BRK_RULE_SYNTAX) {
4335 errln("%s:%d Expected U_BRK_RULE_SYNTAX, got %s", __FILE__, __LINE__, u_errorName(status));
4336 }
4337 if (parseError.line != 1 || parseError.offset != 0) {
4338 errln("parseError (line, offset) expected (1, 0), got (%d, %d)", parseError.line, parseError.offset);
4339 }
4340 }
4341
4342
TestBug12797()4343 void RBBITest::TestBug12797() {
4344 UnicodeString rules = "!!chain; !!forward; $v=b c; a b; $v; !!reverse; .*;";
4345 UErrorCode status = U_ZERO_ERROR;
4346 UParseError parseError;
4347 RuleBasedBreakIterator bi(rules, parseError, status);
4348 if (U_FAILURE(status)) {
4349 errln("%s:%s status = %s", __FILE__, __LINE__, u_errorName(status));
4350 return;
4351 }
4352 UnicodeString text = "abc";
4353 bi.setText(text);
4354 bi.first();
4355 int32_t boundary = bi.next();
4356 if (boundary != 3) {
4357 errln("%s:%d expected boundary==3, got %d", __FILE__, __LINE__, boundary);
4358 }
4359 }
4360
TestBug12918()4361 void RBBITest::TestBug12918() {
4362 // This test triggers an assertion failure in dictbe.cpp
4363 const UChar *crasherString = u"\u3325\u4a16";
4364 UErrorCode status = U_ZERO_ERROR;
4365 UBreakIterator* iter = ubrk_open(UBRK_WORD, NULL, crasherString, -1, &status);
4366 if (U_FAILURE(status)) {
4367 dataerrln("%s:%d status = %s", __FILE__, __LINE__, u_errorName(status));
4368 return;
4369 }
4370 ubrk_first(iter);
4371 int32_t pos = 0;
4372 int32_t lastPos = -1;
4373 while((pos = ubrk_next(iter)) != UBRK_DONE) {
4374 if (pos <= lastPos) {
4375 errln("%s:%d (pos, lastPos) = (%d, %d)", __FILE__, __LINE__, pos, lastPos);
4376 break;
4377 }
4378 }
4379 ubrk_close(iter);
4380 }
4381
TestBug12932()4382 void RBBITest::TestBug12932() {
4383 // Node Stack overflow in the RBBI rule parser caused a seg fault.
4384 UnicodeString ruleStr(
4385 "((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
4386 "((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
4387 "(((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((()))"
4388 ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
4389 ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
4390 ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))");
4391
4392 UErrorCode status = U_ZERO_ERROR;
4393 UParseError parseError;
4394 RuleBasedBreakIterator rbbi(ruleStr, parseError, status);
4395 if (status != U_BRK_RULE_SYNTAX) {
4396 errln("%s:%d expected U_BRK_RULE_SYNTAX, got %s",
4397 __FILE__, __LINE__, u_errorName(status));
4398 }
4399 }
4400
4401
// Emoji Test. Verify that the sequences defined in the Unicode data file emoji-test.txt
// remain undivided by ICU char, word and line break.
TestEmoji()4404 void RBBITest::TestEmoji() {
4405 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4406 UErrorCode status = U_ZERO_ERROR;
4407
4408 CharString testFileName;
4409 testFileName.append(IntlTest::getSourceTestData(status), status);
4410 testFileName.appendPathPart("emoji-test.txt", status);
4411 if (U_FAILURE(status)) {
4412 errln("%s:%s %s while opening emoji-test.txt", __FILE__, __LINE__, u_errorName(status));
4413 return;
4414 }
4415 logln("Opening data file %s\n", testFileName.data());
4416
4417 int len;
4418 UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
4419 if (U_FAILURE(status) || testFile == NULL) {
4420 errln("%s:%s %s while opening emoji-test.txt", __FILE__, __LINE__, u_errorName(status));
4421 return;
4422 }
4423 UnicodeString testFileAsString(testFile, len);
4424 delete [] testFile;
4425
4426 RegexMatcher lineMatcher(u"^.*?$", testFileAsString, UREGEX_MULTILINE, status);
4427 RegexMatcher hexMatcher(u"\\s*([a-f0-9]*)", UREGEX_CASE_INSENSITIVE, status);
4428 // hexMatcher group(1) is a hex number, or empty string if no hex number present.
4429 int32_t lineNumber = 0;
4430
4431 LocalPointer<BreakIterator> charBreaks(BreakIterator::createCharacterInstance(Locale::getEnglish(), status), status);
4432 LocalPointer<BreakIterator> wordBreaks(BreakIterator::createWordInstance(Locale::getEnglish(), status), status);
4433 LocalPointer<BreakIterator> lineBreaks(BreakIterator::createLineInstance(Locale::getEnglish(), status), status);
4434 if (U_FAILURE(status)) {
4435 dataerrln("%s:%d %s while opening break iterators", __FILE__, __LINE__, u_errorName(status));
4436 return;
4437 }
4438
4439 while (lineMatcher.find()) {
4440 ++lineNumber;
4441 UnicodeString line = lineMatcher.group(status);
4442 hexMatcher.reset(line);
4443 UnicodeString testString; // accumulates the emoji sequence.
4444 while (hexMatcher.find() && hexMatcher.group(1, status).length() > 0) {
4445 UnicodeString hex = hexMatcher.group(1, status);
4446 if (hex.length() > 8) {
4447 errln("%s:%d emoji-test.txt:%d invalid code point %s", __FILE__, __LINE__, lineNumber, CStr(hex)());
4448 break;
4449 }
4450 CharString hex8;
4451 hex8.appendInvariantChars(hex, status);
4452 UChar32 c = (UChar32)strtol(hex8.data(), NULL, 16);
4453 if (c<=0x10ffff) {
4454 testString.append(c);
4455 } else {
4456 errln("%s:%d emoji-test.txt:%d Error: Unicode Character %s value out of range.",
4457 __FILE__, __LINE__, lineNumber, hex8.data());
4458 break;
4459 }
4460 }
4461
4462 if (testString.length() > 1) {
4463 charBreaks->setText(testString);
4464 charBreaks->first();
4465 int32_t firstBreak = charBreaks->next();
4466 if (testString.length() != firstBreak) {
4467 errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d",
4468 __FILE__, __LINE__, lineNumber, firstBreak);
4469 }
4470 wordBreaks->setText(testString);
4471 wordBreaks->first();
4472 firstBreak = wordBreaks->next();
4473 if (testString.length() != firstBreak) {
4474 errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d",
4475 __FILE__, __LINE__, lineNumber, firstBreak);
4476 }
4477 lineBreaks->setText(testString);
4478 lineBreaks->first();
4479 firstBreak = lineBreaks->next();
4480 if (testString.length() != firstBreak) {
4481 errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d",
4482 __FILE__, __LINE__, lineNumber, firstBreak);
4483 }
4484 }
4485 }
4486 #endif
4487 }
4488
4489
4490 // TestBug12519 - Correct handling of Locales by assignment / copy / clone
4491
TestBug12519()4492 void RBBITest::TestBug12519() {
4493 UErrorCode status = U_ZERO_ERROR;
4494 LocalPointer<RuleBasedBreakIterator> biEn((RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status));
4495 LocalPointer<RuleBasedBreakIterator> biFr((RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getFrance(), status));
4496 if (!assertSuccess(WHERE, status)) {
4497 dataerrln("%s %d status = %s", __FILE__, __LINE__, u_errorName(status));
4498 return;
4499 }
4500 assertTrue(WHERE, Locale::getEnglish() == biEn->getLocale(ULOC_VALID_LOCALE, status));
4501
4502 assertTrue(WHERE, Locale::getFrench() == biFr->getLocale(ULOC_VALID_LOCALE, status));
4503 assertTrue(WHERE "Locales do not participate in BreakIterator equality.", *biEn == *biFr);
4504
4505 LocalPointer<RuleBasedBreakIterator>cloneEn(biEn->clone());
4506 assertTrue(WHERE, *biEn == *cloneEn);
4507 assertTrue(WHERE, Locale::getEnglish() == cloneEn->getLocale(ULOC_VALID_LOCALE, status));
4508
4509 LocalPointer<RuleBasedBreakIterator>cloneFr(biFr->clone());
4510 assertTrue(WHERE, *biFr == *cloneFr);
4511 assertTrue(WHERE, Locale::getFrench() == cloneFr->getLocale(ULOC_VALID_LOCALE, status));
4512
4513 LocalPointer<RuleBasedBreakIterator>biDe((RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getGerman(), status));
4514 UnicodeString text("Hallo Welt");
4515 biDe->setText(text);
4516 assertTrue(WHERE "before assignment of \"biDe = biFr\", they should be different, but are equal.", *biFr != *biDe);
4517 *biDe = *biFr;
4518 assertTrue(WHERE "after assignment of \"biDe = biFr\", they should be equal, but are not.", *biFr == *biDe);
4519 }
4520
TestBug12677()4521 void RBBITest::TestBug12677() {
4522 // Check that stripping of comments from rules for getRules() is not confused by
4523 // the presence of '#' characters in the rules that do not introduce comments.
4524 UnicodeString rules(u"!!forward; \n"
4525 "$x = [ab#]; # a set with a # literal. \n"
4526 " # .; # a comment that looks sort of like a rule. \n"
4527 " '#' '?'; # a rule with a quoted # \n"
4528 );
4529
4530 UErrorCode status = U_ZERO_ERROR;
4531 UParseError pe;
4532 RuleBasedBreakIterator bi(rules, pe, status);
4533 assertSuccess(WHERE, status);
4534 UnicodeString rtRules = bi.getRules();
4535 assertEquals(WHERE, UnicodeString(u"!!forward;$x=[ab#];'#''?';"), rtRules);
4536 }
4537
4538
TestTableRedundancies()4539 void RBBITest::TestTableRedundancies() {
4540 UErrorCode status = U_ZERO_ERROR;
4541
4542 LocalPointer<RuleBasedBreakIterator> bi (
4543 (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status));
4544 assertSuccess(WHERE, status);
4545 if (U_FAILURE(status)) return;
4546
4547 RBBIDataWrapper *dw = bi->fData;
4548 const RBBIStateTable *fwtbl = dw->fForwardTable;
4549 UBool in8Bits = fwtbl->fFlags & RBBI_8BITS_ROWS;
4550 int32_t numCharClasses = dw->fHeader->fCatCount;
4551 // printf("Char Classes: %d states: %d\n", numCharClasses, fwtbl->fNumStates);
4552
4553 // Check for duplicate columns (character categories)
4554
4555 std::vector<UnicodeString> columns;
4556 for (int32_t column = 0; column < numCharClasses; column++) {
4557 UnicodeString s;
4558 for (int32_t r = 1; r < (int32_t)fwtbl->fNumStates; r++) {
4559 RBBIStateTableRow *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r));
4560 s.append(in8Bits ? row->r8.fNextState[column] : row->r16.fNextState[column]);
4561 }
4562 columns.push_back(s);
4563 }
4564 // Ignore column (char class) 0 while checking; it's special, and may have duplicates.
4565 for (int c1=1; c1<numCharClasses; c1++) {
4566 int limit = c1 < (int)fwtbl->fDictCategoriesStart ? fwtbl->fDictCategoriesStart : numCharClasses;
4567 for (int c2 = c1+1; c2 < limit; c2++) {
4568 if (columns.at(c1) == columns.at(c2)) {
4569 errln("%s:%d Duplicate columns (%d, %d)\n", __FILE__, __LINE__, c1, c2);
4570 goto out;
4571 }
4572 }
4573 }
4574 out:
4575
4576 // Check for duplicate states
4577 std::vector<UnicodeString> rows;
4578 for (int32_t r=0; r < (int32_t)fwtbl->fNumStates; r++) {
4579 UnicodeString s;
4580 RBBIStateTableRow *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r));
4581 if (in8Bits) {
4582 s.append(row->r8.fAccepting);
4583 s.append(row->r8.fLookAhead);
4584 s.append(row->r8.fTagsIdx);
4585 for (int32_t column = 0; column < numCharClasses; column++) {
4586 s.append(row->r8.fNextState[column]);
4587 }
4588 } else {
4589 s.append(row->r16.fAccepting);
4590 s.append(row->r16.fLookAhead);
4591 s.append(row->r16.fTagsIdx);
4592 for (int32_t column = 0; column < numCharClasses; column++) {
4593 s.append(row->r16.fNextState[column]);
4594 }
4595 }
4596 rows.push_back(s);
4597 }
4598 for (int r1=0; r1 < (int32_t)fwtbl->fNumStates; r1++) {
4599 for (int r2 = r1+1; r2 < (int32_t)fwtbl->fNumStates; r2++) {
4600 if (rows.at(r1) == rows.at(r2)) {
4601 errln("%s:%d Duplicate rows (%d, %d)\n", __FILE__, __LINE__, r1, r2);
4602 return;
4603 }
4604 }
4605 }
4606 }
4607
4608 // Bug 13447: verify that getRuleStatus() returns the value corresponding to current(),
4609 // even after next() has returned DONE.
4610
TestBug13447()4611 void RBBITest::TestBug13447() {
4612 UErrorCode status = U_ZERO_ERROR;
4613 LocalPointer<RuleBasedBreakIterator> bi(
4614 (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status));
4615 assertSuccess(WHERE, status);
4616 if (U_FAILURE(status)) return;
4617 UnicodeString data(u"1234");
4618 bi->setText(data);
4619 assertEquals(WHERE, UBRK_WORD_NONE, bi->getRuleStatus());
4620 assertEquals(WHERE, 4, bi->next());
4621 assertEquals(WHERE, UBRK_WORD_NUMBER, bi->getRuleStatus());
4622 assertEquals(WHERE, UBRK_DONE, bi->next());
4623 assertEquals(WHERE, 4, bi->current());
4624 assertEquals(WHERE, UBRK_WORD_NUMBER, bi->getRuleStatus());
4625 }
4626
4627 // TestReverse exercises both the synthesized safe reverse rules and the logic
4628 // for filling the break iterator cache when starting from random positions
4629 // in the text.
4630 //
4631 // It's a monkey test, working on random data, with the expected data obtained
4632 // from forward iteration (no safe rules involved), comparing with results
4633 // when indexing into the interior of the string (safe rules needed).
4634
TestReverse()4635 void RBBITest::TestReverse() {
4636 UErrorCode status = U_ZERO_ERROR;
4637
4638 TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4639 BreakIterator::createCharacterInstance(Locale::getEnglish(), status)));
4640 assertSuccess(WHERE, status, true);
4641 status = U_ZERO_ERROR;
4642 TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4643 BreakIterator::createWordInstance(Locale::getEnglish(), status)));
4644 assertSuccess(WHERE, status, true);
4645 status = U_ZERO_ERROR;
4646 TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4647 BreakIterator::createLineInstance(Locale::getEnglish(), status)));
4648 assertSuccess(WHERE, status, true);
4649 status = U_ZERO_ERROR;
4650 TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4651 BreakIterator::createSentenceInstance(Locale::getEnglish(), status)));
4652 assertSuccess(WHERE, status, true);
4653 }
4654
TestReverse(std::unique_ptr<RuleBasedBreakIterator> bi)4655 void RBBITest::TestReverse(std::unique_ptr<RuleBasedBreakIterator>bi) {
4656 if (!bi) {
4657 return;
4658 }
4659
4660 // From the mapping trie in the break iterator's internal data, create a
4661 // vector of UnicodeStrings, one for each character category, containing
4662 // all of the code points that map to that category. Unicode planes 0 and 1 only,
4663 // to avoid an execess of unassigned code points.
4664
4665 RBBIDataWrapper *data = bi->fData;
4666 int32_t categoryCount = data->fHeader->fCatCount;
4667 UCPTrie *trie = data->fTrie;
4668 bool use8BitsTrie = ucptrie_getValueWidth(trie) == UCPTRIE_VALUE_BITS_8;
4669 uint32_t dictBit = use8BitsTrie ? 0x0080 : 0x4000;
4670
4671 std::vector<UnicodeString> strings(categoryCount, UnicodeString());
4672 for (int cp=0; cp<0x1fff0; ++cp) {
4673 int cat = ucptrie_get(trie, cp);
4674 cat &= ~dictBit; // And off the dictionary bit from the category.
4675 assertTrue(WHERE, cat < categoryCount && cat >= 0);
4676 if (cat < 0 || cat >= categoryCount) return;
4677 strings[cat].append(cp);
4678 }
4679
4680 icu_rand randomGen;
4681 const int testStringLength = 10000;
4682 UnicodeString testString;
4683
4684 for (int i=0; i<testStringLength; ++i) {
4685 int charClass = randomGen() % categoryCount;
4686 if (strings[charClass].length() > 0) {
4687 int cp = strings[charClass].char32At(randomGen() % strings[charClass].length());
4688 testString.append(cp);
4689 }
4690 }
4691
4692 typedef std::pair<UBool, int32_t> Result;
4693 std::vector<Result> expectedResults;
4694 bi->setText(testString);
4695 for (int i=0; i<testString.length(); ++i) {
4696 bool isboundary = bi->isBoundary(i);
4697 int ruleStatus = bi->getRuleStatus();
4698 expectedResults.push_back(std::make_pair(isboundary, ruleStatus));
4699 }
4700
4701 for (int i=testString.length()-1; i>=0; --i) {
4702 bi->setText(testString); // clears the internal break cache
4703 Result expected = expectedResults[i];
4704 assertEquals(WHERE, expected.first, bi->isBoundary(i));
4705 assertEquals(WHERE, expected.second, bi->getRuleStatus());
4706 }
4707 }
4708
4709
// Ticket 13692 - finding word boundaries in very large numbers or words could
//                be very time consuming. When the problem was present, this test
//                would run more than fifteen minutes, which is to say, the failure was noticeable.
4713
TestBug13692()4714 void RBBITest::TestBug13692() {
4715 UErrorCode status = U_ZERO_ERROR;
4716 LocalPointer<RuleBasedBreakIterator> bi ((RuleBasedBreakIterator *)
4717 BreakIterator::createWordInstance(Locale::getEnglish(), status), status);
4718 if (!assertSuccess(WHERE, status, true)) {
4719 return;
4720 }
4721 constexpr int32_t LENGTH = 1000000;
4722 UnicodeString longNumber(LENGTH, (UChar32)u'3', LENGTH);
4723 for (int i=0; i<20; i+=2) {
4724 longNumber.setCharAt(i, u' ');
4725 }
4726 bi->setText(longNumber);
4727 assertFalse(WHERE, bi->isBoundary(LENGTH-5));
4728 assertSuccess(WHERE, status);
4729 }
4730
4731
TestProperties()4732 void RBBITest::TestProperties() {
4733 UErrorCode errorCode = U_ZERO_ERROR;
4734 UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode);
4735 if (!prependSet.isEmpty()) {
4736 errln(
4737 "[:GCB=Prepend:] is not empty any more. "
4738 "Uncomment relevant lines in source/data/brkitr/char.txt and "
4739 "change this test to the opposite condition.");
4740 }
4741 }
4742
4743
4744 //
4745 // TestDebug - A place-holder test for debugging purposes.
4746 // For putting in fragments of other tests that can be invoked
4747 // for tracing without a lot of unwanted extra stuff happening.
4748 //
TestDebug(void)4749 void RBBITest::TestDebug(void) {
4750 UErrorCode status = U_ZERO_ERROR;
4751 LocalPointer<RuleBasedBreakIterator> bi ((RuleBasedBreakIterator *)
4752 BreakIterator::createCharacterInstance(Locale::getEnglish(), status), status);
4753 if (!assertSuccess(WHERE, status, true)) {
4754 return;
4755 }
4756 const UnicodeString &rules = bi->getRules();
4757 UParseError pe;
4758 LocalPointer<RuleBasedBreakIterator> newbi(new RuleBasedBreakIterator(rules, pe, status));
4759 assertSuccess(WHERE, status);
4760 }
4761
4762
4763 //
4764 // TestDebugRules A stub test for use in debugging rule compilation problems.
4765 // Can be freely altered as needed or convenient.
//     Leave disabled - #ifdef'ed out - when not actively debugging. The rule source
4767 // data files may not be available in all environments.
4768 // Any permanent test cases should be moved to rbbitst.txt
4769 // (see Bug 20303 in that file, for example), or to another test function in this file.
4770 //
// Scratch test for debugging rule compilation. The entire body is compiled out
// with "#if 0", so as written this function does nothing.
void RBBITest::TestDebugRules() {
#if 0
    // Inline rule source to compile. Lines commented out inside the literal are
    // alternatives kept around for experimentation.
    const char16_t *rules = u""
        "!!quoted_literals_only; \n"
        "!!chain; \n"
        "!!lookAheadHardBreak; \n"
        " \n"
        // "[a] / ; \n"
        "[a] [b] / [c] [d]; \n"
        "[a] [b] / [c] [d] {100}; \n"
        "[x] [a] [b] / [c] [d] {100}; \n"
        "[a] [b] [c] / [d] {100}; \n"
        //" [c] [d] / [e] [f]; \n"
        //"[a] [b] / [c]; \n"
        ;

    // Read <data directory>/brkitr/rules/line.txt. The contents are only used if
    // the "rules = testFile.get()" line below is re-enabled.
    UErrorCode status = U_ZERO_ERROR;
    CharString path(pathToDataDirectory(), status);
    path.appendPathPart("brkitr", status);
    path.appendPathPart("rules", status);
    path.appendPathPart("line.txt", status);
    int len;
    std::unique_ptr<UChar []> testFile(ReadAndConvertFile(path.data(), len, "UTF-8", status));
    if (!assertSuccess(WHERE, status)) {
        return;
    }

    UParseError pe;
    // rules = testFile.get();
    // Compile the selected rules; a failure is reported through assertSuccess().
    RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rules, pe, status);

    if (!assertSuccess(WHERE, status)) {
        delete bi;
        return;
    }
    // bi->dumpTables();

    delete bi;
#endif
}
4811
testTrieStateTable(int32_t numChar,bool expectedTrieWidthIn8Bits,bool expectedStateRowIn8Bits)4812 void RBBITest::testTrieStateTable(int32_t numChar, bool expectedTrieWidthIn8Bits, bool expectedStateRowIn8Bits) {
4813 UCPTrieValueWidth expectedTrieWidth = expectedTrieWidthIn8Bits ? UCPTRIE_VALUE_BITS_8 : UCPTRIE_VALUE_BITS_16;
4814 int32_t expectedStateRowBits = expectedStateRowIn8Bits ? RBBI_8BITS_ROWS : 0;
4815 // Text are duplicate characters from U+4E00 to U+4FFF
4816 UnicodeString text;
4817 for (UChar c = 0x4e00; c < 0x5000; c++) {
4818 text.append(c).append(c);
4819 }
4820 // Generate rule which will caused length+4 character classes and
4821 // length+3 states
4822 UnicodeString rules(u"!!quoted_literals_only;");
4823 for (UChar c = 0x4e00; c < 0x4e00 + numChar; c++) {
4824 rules.append(u'\'').append(c).append(c).append(u"';");
4825 }
4826 rules.append(u".;");
4827 UErrorCode status = U_ZERO_ERROR;
4828 UParseError parseError;
4829 RuleBasedBreakIterator bi(rules, parseError, status);
4830
4831 assertEquals(WHERE, numChar + 4, bi.fData->fHeader->fCatCount);
4832 assertEquals(WHERE, numChar + 3, bi.fData->fForwardTable->fNumStates);
4833 assertEquals(WHERE, expectedTrieWidth, ucptrie_getValueWidth(bi.fData->fTrie));
4834 assertEquals(WHERE, expectedStateRowBits, bi.fData->fForwardTable->fFlags & RBBI_8BITS_ROWS);
4835 assertEquals(WHERE, expectedStateRowBits, bi.fData->fReverseTable->fFlags & RBBI_8BITS_ROWS);
4836
4837 bi.setText(text);
4838
4839 int32_t pos;
4840 int32_t i = 0;
4841 while ((pos = bi.next()) > 0) {
4842 // The first numChar should not break between the pair
4843 if (i++ < numChar) {
4844 assertEquals(WHERE, i * 2, pos);
4845 } else {
4846 // After the first numChar next(), break on each character.
4847 assertEquals(WHERE, i + numChar, pos);
4848 }
4849 }
4850 while ((pos = bi.previous()) > 0) {
4851 // The first numChar should not break between the pair
4852 if (--i < numChar) {
4853 assertEquals(WHERE, i * 2, pos);
4854 } else {
4855 // After the first numChar next(), break on each character.
4856 assertEquals(WHERE, i + numChar, pos);
4857 }
4858 }
4859 }
4860
Test8BitsTrieWith8BitStateTable()4861 void RBBITest::Test8BitsTrieWith8BitStateTable() {
4862 testTrieStateTable(251, true /* expectedTrieWidthIn8Bits */, true /* expectedStateRowIn8Bits */);
4863 }
4864
Test16BitsTrieWith8BitStateTable()4865 void RBBITest::Test16BitsTrieWith8BitStateTable() {
4866 testTrieStateTable(252, false /* expectedTrieWidthIn8Bits */, true /* expectedStateRowIn8Bits */);
4867 }
4868
Test16BitsTrieWith16BitStateTable()4869 void RBBITest::Test16BitsTrieWith16BitStateTable() {
4870 testTrieStateTable(253, false /* expectedTrieWidthIn8Bits */, false /* expectedStateRowIn8Bits */);
4871 }
4872
Test8BitsTrieWith16BitStateTable()4873 void RBBITest::Test8BitsTrieWith16BitStateTable() {
4874 // Test UCPTRIE_VALUE_BITS_8 with 16 bits rows. Use a different approach to
4875 // create state table in 16 bits.
4876
4877 // Generate 510 'a' as text
4878 UnicodeString text;
4879 for (int32_t i = 0; i < 510; i++) {
4880 text.append(u'a');
4881 }
4882
4883 UnicodeString rules(u"!!quoted_literals_only;'");
4884 // 254 'a' in the rule will cause 256 states
4885 for (int32_t i = 0; i < 254; i++) {
4886 rules.append(u'a');
4887 }
4888 rules.append(u"';.;");
4889
4890 UErrorCode status = U_ZERO_ERROR;
4891 UParseError parseError;
4892 LocalPointer<RuleBasedBreakIterator> bi(new RuleBasedBreakIterator(rules, parseError, status));
4893
4894 assertEquals(WHERE, 256, bi->fData->fForwardTable->fNumStates);
4895 assertEquals(WHERE, UCPTRIE_VALUE_BITS_8, ucptrie_getValueWidth(bi->fData->fTrie));
4896 assertEquals(WHERE,
4897 false, RBBI_8BITS_ROWS == (bi->fData->fForwardTable->fFlags & RBBI_8BITS_ROWS));
4898 bi->setText(text);
4899
4900 // break positions:
4901 // 254, 508, 509, ... 510
4902 assertEquals("next()", 254, bi->next());
4903 int32_t i = 0;
4904 int32_t pos;
4905 while ((pos = bi->next()) > 0) {
4906 assertEquals(WHERE, 508 + i , pos);
4907 i++;
4908 }
4909 i = 0;
4910 while ((pos = bi->previous()) > 0) {
4911 i++;
4912 if (pos >= 508) {
4913 assertEquals(WHERE, 510 - i , pos);
4914 } else {
4915 assertEquals(WHERE, 254 , pos);
4916 }
4917 }
4918 }
4919
4920 // Test that both compact (8 bit) and full sized (16 bit) rbbi tables work, and
4921 // that there are no problems with rules at the size that transitions between the two.
4922 //
4923 // A rule that matches a literal string, like 'abcdefghij', will require one state and
4924 // one character class per character in the string. So we can make a rule to tickle the
4925 // boundaries by using literal strings of various lengths.
4926 //
4927 // For both the number of states and the number of character classes, the eight bit format
4928 // only has 7 bits available, allowing for 128 values. For both, a few values are reserved,
// leaving 120 something available. This test runs the string over the range of 120 - 260,
4930 // which allows some margin for changes to the number of values reserved by the rule builder
4931 // without breaking the test.
4932
TestTable_8_16_Bits()4933 void RBBITest::TestTable_8_16_Bits() {
4934
4935 // testStr serves as both the source of the rule string (truncated to the desired length)
4936 // and as test data to check matching behavior. A break rule consisting of the first 120
4937 // characters of testStr will match the first 120 chars of the full-length testStr.
4938 UnicodeString testStr;
4939 for (UChar c=0x3000; c<0x3200; ++c) {
4940 testStr.append(c);
4941 }
4942
4943 const int32_t startLength = 120; // The shortest rule string to test.
4944 const int32_t endLength = 260; // The longest rule string to test
4945 const int32_t increment = this->quick ? endLength - startLength : 1;
4946
4947 for (int32_t ruleLen=startLength; ruleLen <= endLength; ruleLen += increment) {
4948 UParseError parseError;
4949 UErrorCode status = U_ZERO_ERROR;
4950
4951 UnicodeString ruleString{u"!!quoted_literals_only; '#';"};
4952 ruleString.findAndReplace(UnicodeString(u"#"), UnicodeString(testStr, 0, ruleLen));
4953 RuleBasedBreakIterator bi(ruleString, parseError, status);
4954 if (!assertSuccess(WHERE, status)) {
4955 errln(ruleString);
4956 break;
4957 }
4958 // bi.dumpTables();
4959
4960 // Verify that the break iterator is functioning - that the first boundary found
4961 // in testStr is at the length of the rule string.
4962 bi.setText(testStr);
4963 assertEquals(WHERE, ruleLen, bi.next());
4964
4965 // Reverse iteration. Do a setText() first, to flush the break iterator's internal cache
4966 // of previously detected boundaries, thus forcing the engine to run the safe reverse rules.
4967 bi.setText(testStr);
4968 int32_t result = bi.preceding(ruleLen);
4969 assertEquals(WHERE, 0, result);
4970
4971 // Verify that the range of rule lengths being tested cover the translations
4972 // from 8 to 16 bit data.
4973 bool has8BitRowData = bi.fData->fForwardTable->fFlags & RBBI_8BITS_ROWS;
4974 bool has8BitsTrie = ucptrie_getValueWidth(bi.fData->fTrie) == UCPTRIE_VALUE_BITS_8;
4975
4976 if (ruleLen == startLength) {
4977 assertEquals(WHERE, true, has8BitRowData);
4978 assertEquals(WHERE, true, has8BitsTrie);
4979 }
4980 if (ruleLen == endLength) {
4981 assertEquals(WHERE, false, has8BitRowData);
4982 assertEquals(WHERE, false, has8BitsTrie);
4983 }
4984 }
4985 }
4986
4987 /* Test handling of a large number of look-ahead rules.
4988 * The number of rules in the test exceeds the implementation limits prior to the
4989 * improvements introduced with #13590.
4990 *
4991 * The test look-ahead rules have the form "AB / CE"; "CD / EG"; ...
4992 * The text being matched is sequential, "ABCDEFGHI..."
4993 *
4994 * The upshot is that the look-ahead rules all match on their preceding context,
4995 * and consequently must save a potential result, but then fail to match on their
4996 * trailing context, so that they don't actually cause a boundary.
4997 *
4998 * Additionally, add a ".*" rule, so there are no boundaries unless a
4999 * look-ahead hard-break rule forces one.
5000 */
TestBug13590()5001 void RBBITest::TestBug13590() {
5002 UnicodeString rules {u"!!quoted_literals_only; !!chain; .*;\n"};
5003
5004 const int NUM_LOOKAHEAD_RULES = 50;
5005 const char16_t STARTING_CHAR = u'\u5000';
5006 char16_t firstChar;
5007 for (int ruleNum = 0; ruleNum < NUM_LOOKAHEAD_RULES; ++ruleNum) {
5008 firstChar = STARTING_CHAR + ruleNum*2;
5009 rules.append(u'\'') .append(firstChar) .append(firstChar+1) .append(u'\'')
5010 .append(u' ') .append(u'/') .append(u' ')
5011 .append(u'\'') .append(firstChar+2) .append(firstChar+4) .append(u'\'')
5012 .append(u';') .append(u'\n');
5013 }
5014
5015 // Change the last rule added from the form "UV / WY" to "UV / WX".
5016 // Changes the rule so that it will match - all 4 chars are in ascending sequence.
5017 rules.findAndReplace(UnicodeString(firstChar+4), UnicodeString(firstChar+3));
5018
5019 UErrorCode status = U_ZERO_ERROR;
5020 UParseError parseError;
5021 RuleBasedBreakIterator bi(rules, parseError, status);
5022 if (!assertSuccess(WHERE, status)) {
5023 errln(rules);
5024 return;
5025 }
5026 // bi.dumpTables();
5027
5028 UnicodeString testString;
5029 for (char16_t c = STARTING_CHAR-200; c < STARTING_CHAR + NUM_LOOKAHEAD_RULES*4; ++c) {
5030 testString.append(c);
5031 }
5032 bi.setText(testString);
5033
5034 int breaksFound = 0;
5035 while (bi.next() != UBRK_DONE) {
5036 ++breaksFound;
5037 }
5038
5039 // Two matches are expected, one from the last rule that was explicitly modified,
5040 // and one at the end of the text.
5041 assertEquals(WHERE, 2, breaksFound);
5042 }
5043
5044
5045 #if U_ENABLE_TRACING
// Recorders filled in by the trace callbacks traceData(), traceEntry() and
// traceExit() below, and emptied by SetupTestTrace().

// String argument captured by each traceData() call.
static std::vector<std::string> gData;
// Function number passed to each traceEntry() call.
static std::vector<int32_t> gEntryFn;
// Function number passed to each traceExit() call.
static std::vector<int32_t> gExitFn;
// Function number passed to each traceData() call; grows in step with gData.
static std::vector<int32_t> gDataFn;
5050
traceData(const void *,int32_t fnNumber,int32_t,const char *,va_list args)5051 static void U_CALLCONV traceData(
5052 const void*,
5053 int32_t fnNumber,
5054 int32_t,
5055 const char *,
5056 va_list args) {
5057 if (UTRACE_UBRK_START <= fnNumber && fnNumber <= UTRACE_UBRK_LIMIT) {
5058 const char* data = va_arg(args, const char*);
5059 gDataFn.push_back(fnNumber);
5060 gData.push_back(data);
5061 }
5062 }
5063
traceEntry(const void *,int32_t fnNumber)5064 static void traceEntry(const void *, int32_t fnNumber) {
5065 if (UTRACE_UBRK_START <= fnNumber && fnNumber <= UTRACE_UBRK_LIMIT) {
5066 gEntryFn.push_back(fnNumber);
5067 }
5068 }
5069
traceExit(const void *,int32_t fnNumber,const char *,va_list)5070 static void traceExit(const void *, int32_t fnNumber, const char *, va_list) {
5071 if (UTRACE_UBRK_START <= fnNumber && fnNumber <= UTRACE_UBRK_LIMIT) {
5072 gExitFn.push_back(fnNumber);
5073 }
5074 }
5075
5076
assertTestTraceResult(int32_t fnNumber,const char * expectedData)5077 void RBBITest::assertTestTraceResult(int32_t fnNumber, const char* expectedData) {
5078 assertEquals("utrace_entry should be called ", 1, gEntryFn.size());
5079 assertEquals("utrace_entry should be called with ", fnNumber, gEntryFn[0]);
5080 assertEquals("utrace_exit should be called ", 1, gExitFn.size());
5081 assertEquals("utrace_exit should be called with ", fnNumber, gExitFn[0]);
5082
5083 if (expectedData == nullptr) {
5084 assertEquals("utrace_data should not be called ", 0, gDataFn.size());
5085 assertEquals("utrace_data should not be called ", 0, gData.size());
5086 } else {
5087 assertEquals("utrace_data should be called ", 1, gDataFn.size());
5088 assertEquals("utrace_data should be called with ", fnNumber, gDataFn[0]);
5089 assertEquals("utrace_data should be called ", 1, gData.size());
5090 assertEquals("utrace_data should pass in ", expectedData, gData[0].c_str());
5091 }
5092 }
5093
SetupTestTrace()5094 void SetupTestTrace() {
5095 gEntryFn.clear();
5096 gExitFn.clear();
5097 gDataFn.clear();
5098 gData.clear();
5099
5100 const void* context = nullptr;
5101 utrace_setFunctions(context, traceEntry, traceExit, traceData);
5102 utrace_setLevel(UTRACE_INFO);
5103 }
5104
TestTraceCreateCharacter(void)5105 void RBBITest::TestTraceCreateCharacter(void) {
5106 SetupTestTrace();
5107 IcuTestErrorCode status(*this, "TestTraceCreateCharacter");
5108 LocalPointer<BreakIterator> brkitr(
5109 BreakIterator::createCharacterInstance("zh-CN", status));
5110 status.errIfFailureAndReset();
5111 assertTestTraceResult(UTRACE_UBRK_CREATE_CHARACTER, nullptr);
5112 }
5113
TestTraceCreateTitle(void)5114 void RBBITest::TestTraceCreateTitle(void) {
5115 SetupTestTrace();
5116 IcuTestErrorCode status(*this, "TestTraceCreateTitle");
5117 LocalPointer<BreakIterator> brkitr(
5118 BreakIterator::createTitleInstance("zh-CN", status));
5119 status.errIfFailureAndReset();
5120 assertTestTraceResult(UTRACE_UBRK_CREATE_TITLE, nullptr);
5121 }
5122
TestTraceCreateSentence(void)5123 void RBBITest::TestTraceCreateSentence(void) {
5124 SetupTestTrace();
5125 IcuTestErrorCode status(*this, "TestTraceCreateSentence");
5126 LocalPointer<BreakIterator> brkitr(
5127 BreakIterator::createSentenceInstance("zh-CN", status));
5128 status.errIfFailureAndReset();
5129 assertTestTraceResult(UTRACE_UBRK_CREATE_SENTENCE, nullptr);
5130 }
5131
TestTraceCreateWord(void)5132 void RBBITest::TestTraceCreateWord(void) {
5133 SetupTestTrace();
5134 IcuTestErrorCode status(*this, "TestTraceCreateWord");
5135 LocalPointer<BreakIterator> brkitr(
5136 BreakIterator::createWordInstance("zh-CN", status));
5137 status.errIfFailureAndReset();
5138 assertTestTraceResult(UTRACE_UBRK_CREATE_WORD, nullptr);
5139 }
5140
TestTraceCreateLine(void)5141 void RBBITest::TestTraceCreateLine(void) {
5142 SetupTestTrace();
5143 IcuTestErrorCode status(*this, "TestTraceCreateLine");
5144 LocalPointer<BreakIterator> brkitr(
5145 BreakIterator::createLineInstance("zh-CN", status));
5146 status.errIfFailureAndReset();
5147 assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "");
5148 }
5149
TestTraceCreateLineStrict(void)5150 void RBBITest::TestTraceCreateLineStrict(void) {
5151 SetupTestTrace();
5152 IcuTestErrorCode status(*this, "TestTraceCreateLineStrict");
5153 LocalPointer<BreakIterator> brkitr(
5154 BreakIterator::createLineInstance("zh-CN-u-lb-strict", status));
5155 status.errIfFailureAndReset();
5156 assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "strict");
5157 }
5158
TestTraceCreateLineNormal(void)5159 void RBBITest::TestTraceCreateLineNormal(void) {
5160 SetupTestTrace();
5161 IcuTestErrorCode status(*this, "TestTraceCreateLineNormal");
5162 LocalPointer<BreakIterator> brkitr(
5163 BreakIterator::createLineInstance("zh-CN-u-lb-normal", status));
5164 status.errIfFailureAndReset();
5165 assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "normal");
5166 }
5167
TestTraceCreateLineLoose(void)5168 void RBBITest::TestTraceCreateLineLoose(void) {
5169 SetupTestTrace();
5170 IcuTestErrorCode status(*this, "TestTraceCreateLineLoose");
5171 LocalPointer<BreakIterator> brkitr(
5172 BreakIterator::createLineInstance("zh-CN-u-lb-loose", status));
5173 status.errIfFailureAndReset();
5174 assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "loose");
5175 }
5176
TestTraceCreateBreakEngine(void)5177 void RBBITest::TestTraceCreateBreakEngine(void) {
5178 rbbi_cleanup();
5179 SetupTestTrace();
5180 IcuTestErrorCode status(*this, "TestTraceCreateBreakEngine");
5181 LocalPointer<BreakIterator> brkitr(
5182 BreakIterator::createWordInstance("zh-CN", status));
5183 status.errIfFailureAndReset();
5184 assertTestTraceResult(UTRACE_UBRK_CREATE_WORD, nullptr);
5185
5186 // To word break the following text, BreakIterator will create 5 dictionary
5187 // break engine internally.
5188 brkitr->setText(
5189 u"test "
5190 u"測試 " // Hani
5191 u"សាកល្បង " // Khmr
5192 u"ທົດສອບ " // Laoo
5193 u"စမ်းသပ်မှု " // Mymr
5194 u"ทดสอบ " // Thai
5195 u"test "
5196 );
5197
5198 // Loop through all the text.
5199 while (brkitr->next() > 0) ;
5200
5201 assertEquals("utrace_entry should be called ", 6, gEntryFn.size());
5202 assertEquals("utrace_exit should be called ", 6, gExitFn.size());
5203 assertEquals("utrace_data should be called ", 5, gDataFn.size());
5204
5205 for (std::vector<int>::size_type i = 0; i < gDataFn.size(); i++) {
5206 assertEquals("utrace_entry should be called ",
5207 UTRACE_UBRK_CREATE_BREAK_ENGINE, gEntryFn[i+1]);
5208 assertEquals("utrace_exit should be called ",
5209 UTRACE_UBRK_CREATE_BREAK_ENGINE, gExitFn[i+1]);
5210 assertEquals("utrace_data should be called ",
5211 UTRACE_UBRK_CREATE_BREAK_ENGINE, gDataFn[i]);
5212 }
5213
5214 assertEquals("utrace_data should pass ", "Hani", gData[0].c_str());
5215 assertEquals("utrace_data should pass ", "Khmr", gData[1].c_str());
5216 assertEquals("utrace_data should pass ", "Laoo", gData[2].c_str());
5217 assertEquals("utrace_data should pass ", "Mymr", gData[3].c_str());
5218 assertEquals("utrace_data should pass ", "Thai", gData[4].c_str());
5219
5220 }
5221 #endif
5222
TestUnpairedSurrogate()5223 void RBBITest::TestUnpairedSurrogate() {
5224 UnicodeString rules(u"ab;");
5225
5226 UErrorCode status = U_ZERO_ERROR;
5227 UParseError pe;
5228 RuleBasedBreakIterator bi1(rules, pe, status);
5229 assertSuccess(WHERE, status);
5230 UnicodeString rtRules = bi1.getRules();
5231 // make sure the simple one work first.
5232 assertEquals(WHERE, rules, rtRules);
5233
5234
5235 rules = UnicodeString(u"a\\ud800b;").unescape();
5236 pe.line = 0;
5237 pe.offset = 0;
5238 RuleBasedBreakIterator bi2(rules, pe, status);
5239 assertEquals(WHERE "unpaired lead surrogate", U_ILLEGAL_CHAR_FOUND , status);
5240 if (pe.line != 1 || pe.offset != 1) {
5241 errln("pe (line, offset) expected (1, 1), got (%d, %d)", pe.line, pe.offset);
5242 }
5243
5244 status = U_ZERO_ERROR;
5245 rules = UnicodeString(u"a\\ude00b;").unescape();
5246 pe.line = 0;
5247 pe.offset = 0;
5248 RuleBasedBreakIterator bi3(rules, pe, status);
5249 assertEquals(WHERE "unpaired tail surrogate", U_ILLEGAL_CHAR_FOUND , status);
5250 if (pe.line != 1 || pe.offset != 1) {
5251 errln("pe (line, offset) expected (1, 1), got (%d, %d)", pe.line, pe.offset);
5252 }
5253
5254 // make sure the surrogate one work too.
5255 status = U_ZERO_ERROR;
5256 rules = UnicodeString(u"ab;");
5257 RuleBasedBreakIterator bi4(rules, pe, status);
5258 rtRules = bi4.getRules();
5259 assertEquals(WHERE, rules, rtRules);
5260 }
5261
// Reads a file generated by
// https://github.com/unicode-org/lstm_word_segmentation/blob/master/segment_text.py
// and uses it as test cases: each Input line is segmented and the resulting
// break positions are compared with the following Output line.
5265 // Format of the file
5266 // Model:\t[Model Name (such as 'Thai_graphclust_model4_heavy')]
5267 // Embedding:\t[Embedding type (such as 'grapheme_clusters_tf')]
5268 // Input:\t[source text]
5269 // Output:\t[expected output separated by | ]
5270 // Input: ...
5271 // Output: ...
5272
runLSTMTestFromFile(const char * filename,UScriptCode script)5273 void RBBITest::runLSTMTestFromFile(const char* filename, UScriptCode script) {
5274 // The expectation in this test depends on LSTM, skip the test if the
5275 // configuration is not build with LSTM data.
5276 if (skipLSTMTest()) {
5277 return;
5278 }
5279 UErrorCode status = U_ZERO_ERROR;
5280 LocalPointer<BreakIterator> iterator(BreakIterator::createWordInstance(Locale(), status));
5281 if (U_FAILURE(status)) {
5282 errln("%s:%d Error %s Cannot create Word BreakIterator", __FILE__, __LINE__, u_errorName(status));
5283 return;
5284 }
5285 // Open and read the test data file.
5286 const char *testDataDirectory = IntlTest::getSourceTestData(status);
5287 CharString testFileName(testDataDirectory, -1, status);
5288 testFileName.append(filename, -1, status);
5289
5290 int len;
5291 UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
5292 if (U_FAILURE(status)) {
5293 errln("%s:%d Error %s opening test file %s", __FILE__, __LINE__, u_errorName(status), filename);
5294 return;
5295 }
5296
5297 // Put the test data into a UnicodeString
5298 UnicodeString testString(FALSE, testFile, len);
5299
5300 int32_t start = 0;
5301
5302 UnicodeString line;
5303 int32_t end;
5304 std::string actual_sep_str;
5305 int32_t caseNum = 0;
5306 // Iterate through all the lines in the test file.
5307 do {
5308 int32_t cr = testString.indexOf(u'\r', start);
5309 int32_t lf = testString.indexOf(u'\n', start);
5310 end = cr >= 0 ? (lf >= 0 ? std::min(cr, lf) : cr) : lf;
5311 line = testString.tempSubString(start, end < 0 ? INT32_MAX : end - start);
5312 if (line.length() > 0) {
5313 // Separate each line to key and value by TAB.
5314 int32_t tab = line.indexOf(u'\t');
5315 UnicodeString key = line.tempSubString(0, tab);
5316 const UnicodeString value = line.tempSubString(tab+1);
5317
5318 if (key == "Model:") {
5319 // Verify the expectation in the test file match the LSTM model
5320 // we are using now.
5321 const LSTMData* data = CreateLSTMDataForScript(script, status);
5322 if (U_FAILURE(status)) {
5323 dataerrln("%s:%d Error %s Cannot create LSTM data for script %s",
5324 __FILE__, __LINE__, u_errorName(status), uscript_getName(script));
5325 return;
5326 }
5327 UnicodeString name(LSTMDataName(data));
5328 DeleteLSTMData(data);
5329 if (value != name) {
5330 std::string utf8Name, utf8Value;
5331 dataerrln("%s:%d Error %s The LSTM data for script %s is %s instead of %s",
5332 __FILE__, __LINE__, u_errorName(status), uscript_getName(script),
5333 name.toUTF8String<std::string>(utf8Name).c_str(),
5334 value.toUTF8String<std::string>(utf8Value).c_str());
5335 return;
5336 }
5337 } else if (key == "Input:") {
5338 UnicodeString input("prefix ");
5339 input += value + " suffix";
5340 std::stringstream ss;
5341
5342 // Construct the UText which is expected by the the engine as
5343 // input from the UnicodeString.
5344 UText ut = UTEXT_INITIALIZER;
5345 utext_openConstUnicodeString(&ut, &input, &status);
5346 if (U_FAILURE(status)) {
5347 dataerrln("Could not utext_openConstUnicodeString for " + value + UnicodeString(u_errorName(status)));
5348 return;
5349 }
5350
5351 iterator->setText(&ut, status);
5352 if (U_FAILURE(status)) {
5353 errln("%s:%d Error %s Could not setText to BreakIterator", __FILE__, __LINE__, u_errorName(status));
5354 return;
5355 }
5356
5357 int32_t bp;
5358 for (bp = iterator->first(); bp != BreakIterator::DONE; bp = iterator->next()) {
5359 ss << bp;
5360 if (bp != input.length()) {
5361 ss << ", ";
5362 }
5363 }
5364
5365 utext_close(&ut);
5366 // Turn the break points into a string for easy comparison
5367 // output.
5368 actual_sep_str = "{" + ss.str() + "}";
5369 } else if (key == "Output:" && !actual_sep_str.empty()) {
5370 UnicodeString input("prefix| |");
5371 input += value + "| |suffix";
5372 std::string d;
5373 int32_t sep;
5374 int32_t start = 0;
5375 int32_t curr = 0;
5376 std::stringstream ss;
5377 // Include 0 as the break point.
5378 ss << "0, ";
5379 while ((sep = input.indexOf(u'|', start)) >= 0) {
5380 int32_t len = sep - start;
5381 if (len > 0) {
5382 if (curr > 0) {
5383 ss << ", ";
5384 }
5385 curr += len;
5386 ss << curr;
5387 }
5388 start = sep + 1;
5389 }
5390 // Include end of the string as break point.
5391 ss << ", " << curr + input.length() - start;
5392 // Turn the break points into a string for easy comparison
5393 // output.
5394 std::string expected = "{" + ss.str() + "}";
5395 std::string utf8;
5396
5397 assertEquals((input + " Test Case#" + caseNum).toUTF8String<std::string>(utf8).c_str(),
5398 expected.c_str(), actual_sep_str.c_str());
5399 actual_sep_str.clear();
5400 }
5401 }
5402 start = std::max(cr, lf) + 1;
5403 } while (end >= 0);
5404
5405 delete [] testFile;
5406 }
5407
TestLSTMThai()5408 void RBBITest::TestLSTMThai() {
5409 runLSTMTestFromFile("Thai_graphclust_model4_heavy_Test.txt", USCRIPT_THAI);
5410 }
5411
TestLSTMBurmese()5412 void RBBITest::TestLSTMBurmese() {
5413 runLSTMTestFromFile("Burmese_graphclust_model5_heavy_Test.txt", USCRIPT_MYANMAR);
5414 }
5415
5416 #endif // #if !UCONFIG_NO_BREAK_ITERATION
5417