1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /**
4  *******************************************************************************
5  * Copyright (C) 2006-2016, International Business Machines Corporation
6  * and others. All Rights Reserved.
7  *******************************************************************************
8  */
9 
10 #include "unicode/utypes.h"
11 
12 #if !UCONFIG_NO_BREAK_ITERATION
13 
14 #include "brkeng.h"
15 #include "dictbe.h"
16 #include "unicode/uniset.h"
17 #include "unicode/chariter.h"
18 #include "unicode/ubrk.h"
19 #include "uvectr32.h"
20 #include "uvector.h"
21 #include "uassert.h"
22 #include "unicode/normlzr.h"
23 #include "cmemory.h"
24 #include "dictionarydata.h"
25 
26 U_NAMESPACE_BEGIN
27 
28 /*
29  ******************************************************************
30  */
31 
DictionaryBreakEngine(uint32_t breakTypes)32 DictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes) {
33     fTypes = breakTypes;
34 }
35 
~DictionaryBreakEngine()36 DictionaryBreakEngine::~DictionaryBreakEngine() {
37 }
38 
39 UBool
handles(UChar32 c,int32_t breakType) const40 DictionaryBreakEngine::handles(UChar32 c, int32_t breakType) const {
41     return (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)
42             && fSet.contains(c));
43 }
44 
45 int32_t
findBreaks(UText * text,int32_t startPos,int32_t endPos,int32_t breakType,UVector32 & foundBreaks) const46 DictionaryBreakEngine::findBreaks( UText *text,
47                                  int32_t startPos,
48                                  int32_t endPos,
49                                  int32_t breakType,
50                                  UVector32 &foundBreaks ) const {
51     (void)startPos;            // TODO: remove this param?
52     int32_t result = 0;
53 
54     // Find the span of characters included in the set.
55     //   The span to break begins at the current position in the text, and
56     //   extends towards the start or end of the text, depending on 'reverse'.
57 
58     int32_t start = (int32_t)utext_getNativeIndex(text);
59     int32_t current;
60     int32_t rangeStart;
61     int32_t rangeEnd;
62     UChar32 c = utext_current32(text);
63     while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) {
64         utext_next32(text);         // TODO:  recast loop for postincrement
65         c = utext_current32(text);
66     }
67     rangeStart = start;
68     rangeEnd = current;
69     if (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)) {
70         result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
71         utext_setNativeIndex(text, current);
72     }
73 
74     return result;
75 }
76 
77 void
setCharacters(const UnicodeSet & set)78 DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) {
79     fSet = set;
80     // Compact for caching
81     fSet.compact();
82 }
83 
84 /*
85  ******************************************************************
86  * PossibleWord
87  */
88 
89 // Helper class for improving readability of the Thai/Lao/Khmer word break
90 // algorithm. The implementation is completely inline.
91 
92 // List size, limited by the maximum number of words in the dictionary
93 // that form a nested sequence.
94 static const int32_t POSSIBLE_WORD_LIST_MAX = 20;
95 
96 class PossibleWord {
97 private:
98     // list of word candidate lengths, in increasing length order
99     // TODO: bytes would be sufficient for word lengths.
100     int32_t   count;      // Count of candidates
101     int32_t   prefix;     // The longest match with a dictionary word
102     int32_t   offset;     // Offset in the text of these candidates
103     int32_t   mark;       // The preferred candidate's offset
104     int32_t   current;    // The candidate we're currently looking at
105     int32_t   cuLengths[POSSIBLE_WORD_LIST_MAX];   // Word Lengths, in code units.
106     int32_t   cpLengths[POSSIBLE_WORD_LIST_MAX];   // Word Lengths, in code points.
107 
108 public:
PossibleWord()109     PossibleWord() : count(0), prefix(0), offset(-1), mark(0), current(0) {};
~PossibleWord()110     ~PossibleWord() {};
111 
112     // Fill the list of candidates if needed, select the longest, and return the number found
113     int32_t   candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd );
114 
115     // Select the currently marked candidate, point after it in the text, and invalidate self
116     int32_t   acceptMarked( UText *text );
117 
118     // Back up from the current candidate to the next shorter one; return TRUE if that exists
119     // and point the text after it
120     UBool     backUp( UText *text );
121 
122     // Return the longest prefix this candidate location shares with a dictionary word
123     // Return value is in code points.
longestPrefix()124     int32_t   longestPrefix() { return prefix; };
125 
126     // Mark the current candidate as the one we like
markCurrent()127     void      markCurrent() { mark = current; };
128 
129     // Get length in code points of the marked word.
markedCPLength()130     int32_t   markedCPLength() { return cpLengths[mark]; };
131 };
132 
133 
candidates(UText * text,DictionaryMatcher * dict,int32_t rangeEnd)134 int32_t PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ) {
135     // TODO: If getIndex is too slow, use offset < 0 and add discardAll()
136     int32_t start = (int32_t)utext_getNativeIndex(text);
137     if (start != offset) {
138         offset = start;
139         count = dict->matches(text, rangeEnd-start, UPRV_LENGTHOF(cuLengths), cuLengths, cpLengths, NULL, &prefix);
140         // Dictionary leaves text after longest prefix, not longest word. Back up.
141         if (count <= 0) {
142             utext_setNativeIndex(text, start);
143         }
144     }
145     if (count > 0) {
146         utext_setNativeIndex(text, start+cuLengths[count-1]);
147     }
148     current = count-1;
149     mark = current;
150     return count;
151 }
152 
153 int32_t
acceptMarked(UText * text)154 PossibleWord::acceptMarked( UText *text ) {
155     utext_setNativeIndex(text, offset + cuLengths[mark]);
156     return cuLengths[mark];
157 }
158 
159 
160 UBool
backUp(UText * text)161 PossibleWord::backUp( UText *text ) {
162     if (current > 0) {
163         utext_setNativeIndex(text, offset + cuLengths[--current]);
164         return TRUE;
165     }
166     return FALSE;
167 }
168 
169 /*
170  ******************************************************************
171  * ThaiBreakEngine
172  */
173 
174 // How many words in a row are "good enough"?
175 static const int32_t THAI_LOOKAHEAD = 3;
176 
177 // Will not combine a non-word with a preceding dictionary word longer than this
178 static const int32_t THAI_ROOT_COMBINE_THRESHOLD = 3;
179 
180 // Will not combine a non-word that shares at least this much prefix with a
181 // dictionary word, with a preceding word
182 static const int32_t THAI_PREFIX_COMBINE_THRESHOLD = 3;
183 
184 // Ellision character
185 static const int32_t THAI_PAIYANNOI = 0x0E2F;
186 
187 // Repeat character
188 static const int32_t THAI_MAIYAMOK = 0x0E46;
189 
190 // Minimum word size
191 static const int32_t THAI_MIN_WORD = 2;
192 
193 // Minimum number of characters for two words
194 static const int32_t THAI_MIN_WORD_SPAN = THAI_MIN_WORD * 2;
195 
ThaiBreakEngine(DictionaryMatcher * adoptDictionary,UErrorCode & status)196 ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
197     : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
198       fDictionary(adoptDictionary)
199 {
200     fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status);
201     if (U_SUCCESS(status)) {
202         setCharacters(fThaiWordSet);
203     }
204     fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status);
205     fMarkSet.add(0x0020);
206     fEndWordSet = fThaiWordSet;
207     fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
208     fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
209     fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
210     fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
211     fSuffixSet.add(THAI_PAIYANNOI);
212     fSuffixSet.add(THAI_MAIYAMOK);
213 
214     // Compact for caching.
215     fMarkSet.compact();
216     fEndWordSet.compact();
217     fBeginWordSet.compact();
218     fSuffixSet.compact();
219 }
220 
~ThaiBreakEngine()221 ThaiBreakEngine::~ThaiBreakEngine() {
222     delete fDictionary;
223 }
224 
225 int32_t
divideUpDictionaryRange(UText * text,int32_t rangeStart,int32_t rangeEnd,UVector32 & foundBreaks) const226 ThaiBreakEngine::divideUpDictionaryRange( UText *text,
227                                                 int32_t rangeStart,
228                                                 int32_t rangeEnd,
229                                                 UVector32 &foundBreaks ) const {
230     utext_setNativeIndex(text, rangeStart);
231     utext_moveIndex32(text, THAI_MIN_WORD_SPAN);
232     if (utext_getNativeIndex(text) >= rangeEnd) {
233         return 0;       // Not enough characters for two words
234     }
235     utext_setNativeIndex(text, rangeStart);
236 
237 
238     uint32_t wordsFound = 0;
239     int32_t cpWordLength = 0;    // Word Length in Code Points.
240     int32_t cuWordLength = 0;    // Word length in code units (UText native indexing)
241     int32_t current;
242     UErrorCode status = U_ZERO_ERROR;
243     PossibleWord words[THAI_LOOKAHEAD];
244 
245     utext_setNativeIndex(text, rangeStart);
246 
247     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
248         cpWordLength = 0;
249         cuWordLength = 0;
250 
251         // Look for candidate words at the current position
252         int32_t candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
253 
254         // If we found exactly one, use that
255         if (candidates == 1) {
256             cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
257             cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();
258             wordsFound += 1;
259         }
260         // If there was more than one, see which one can take us forward the most words
261         else if (candidates > 1) {
262             // If we're already at the end of the range, we're done
263             if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
264                 goto foundBest;
265             }
266             do {
267                 int32_t wordsMatched = 1;
268                 if (words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
269                     if (wordsMatched < 2) {
270                         // Followed by another dictionary word; mark first word as a good candidate
271                         words[wordsFound%THAI_LOOKAHEAD].markCurrent();
272                         wordsMatched = 2;
273                     }
274 
275                     // If we're already at the end of the range, we're done
276                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
277                         goto foundBest;
278                     }
279 
280                     // See if any of the possible second words is followed by a third word
281                     do {
282                         // If we find a third word, stop right away
283                         if (words[(wordsFound + 2) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
284                             words[wordsFound % THAI_LOOKAHEAD].markCurrent();
285                             goto foundBest;
286                         }
287                     }
288                     while (words[(wordsFound + 1) % THAI_LOOKAHEAD].backUp(text));
289                 }
290             }
291             while (words[wordsFound % THAI_LOOKAHEAD].backUp(text));
292 foundBest:
293             // Set UText position to after the accepted word.
294             cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
295             cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();
296             wordsFound += 1;
297         }
298 
299         // We come here after having either found a word or not. We look ahead to the
300         // next word. If it's not a dictionary word, we will combine it with the word we
301         // just found (if there is one), but only if the preceding word does not exceed
302         // the threshold.
303         // The text iterator should now be positioned at the end of the word we found.
304 
305         UChar32 uc = 0;
306         if ((int32_t)utext_getNativeIndex(text) < rangeEnd &&  cpWordLength < THAI_ROOT_COMBINE_THRESHOLD) {
307             // if it is a dictionary word, do nothing. If it isn't, then if there is
308             // no preceding word, or the non-word shares less than the minimum threshold
309             // of characters with a dictionary word, then scan to resynchronize
310             if (words[wordsFound % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
311                   && (cuWordLength == 0
312                       || words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) {
313                 // Look for a plausible word boundary
314                 int32_t remaining = rangeEnd - (current+cuWordLength);
315                 UChar32 pc;
316                 int32_t chars = 0;
317                 for (;;) {
318                     int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
319                     pc = utext_next32(text);
320                     int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
321                     chars += pcSize;
322                     remaining -= pcSize;
323                     if (remaining <= 0) {
324                         break;
325                     }
326                     uc = utext_current32(text);
327                     if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
328                         // Maybe. See if it's in the dictionary.
329                         // NOTE: In the original Apple code, checked that the next
330                         // two characters after uc were not 0x0E4C THANTHAKHAT before
331                         // checking the dictionary. That is just a performance filter,
332                         // but it's not clear it's faster than checking the trie.
333                         int32_t candidates = words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
334                         utext_setNativeIndex(text, current + cuWordLength + chars);
335                         if (candidates > 0) {
336                             break;
337                         }
338                     }
339                 }
340 
341                 // Bump the word count if there wasn't already one
342                 if (cuWordLength <= 0) {
343                     wordsFound += 1;
344                 }
345 
346                 // Update the length with the passed-over characters
347                 cuWordLength += chars;
348             }
349             else {
350                 // Back up to where we were for next iteration
351                 utext_setNativeIndex(text, current+cuWordLength);
352             }
353         }
354 
355         // Never stop before a combining mark.
356         int32_t currPos;
357         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
358             utext_next32(text);
359             cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
360         }
361 
362         // Look ahead for possible suffixes if a dictionary word does not follow.
363         // We do this in code rather than using a rule so that the heuristic
364         // resynch continues to function. For example, one of the suffix characters
365         // could be a typo in the middle of a word.
366         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cuWordLength > 0) {
367             if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
368                 && fSuffixSet.contains(uc = utext_current32(text))) {
369                 if (uc == THAI_PAIYANNOI) {
370                     if (!fSuffixSet.contains(utext_previous32(text))) {
371                         // Skip over previous end and PAIYANNOI
372                         utext_next32(text);
373                         int32_t paiyannoiIndex = (int32_t)utext_getNativeIndex(text);
374                         utext_next32(text);
375                         cuWordLength += (int32_t)utext_getNativeIndex(text) - paiyannoiIndex;    // Add PAIYANNOI to word
376                         uc = utext_current32(text);     // Fetch next character
377                     }
378                     else {
379                         // Restore prior position
380                         utext_next32(text);
381                     }
382                 }
383                 if (uc == THAI_MAIYAMOK) {
384                     if (utext_previous32(text) != THAI_MAIYAMOK) {
385                         // Skip over previous end and MAIYAMOK
386                         utext_next32(text);
387                         int32_t maiyamokIndex = (int32_t)utext_getNativeIndex(text);
388                         utext_next32(text);
389                         cuWordLength += (int32_t)utext_getNativeIndex(text) - maiyamokIndex;    // Add MAIYAMOK to word
390                     }
391                     else {
392                         // Restore prior position
393                         utext_next32(text);
394                     }
395                 }
396             }
397             else {
398                 utext_setNativeIndex(text, current+cuWordLength);
399             }
400         }
401 
402         // Did we find a word on this iteration? If so, push it on the break stack
403         if (cuWordLength > 0) {
404             foundBreaks.push((current+cuWordLength), status);
405         }
406     }
407 
408     // Don't return a break for the end of the dictionary range if there is one there.
409     if (foundBreaks.peeki() >= rangeEnd) {
410         (void) foundBreaks.popi();
411         wordsFound -= 1;
412     }
413 
414     return wordsFound;
415 }
416 
417 /*
418  ******************************************************************
419  * LaoBreakEngine
420  */
421 
422 // How many words in a row are "good enough"?
423 static const int32_t LAO_LOOKAHEAD = 3;
424 
425 // Will not combine a non-word with a preceding dictionary word longer than this
426 static const int32_t LAO_ROOT_COMBINE_THRESHOLD = 3;
427 
428 // Will not combine a non-word that shares at least this much prefix with a
429 // dictionary word, with a preceding word
430 static const int32_t LAO_PREFIX_COMBINE_THRESHOLD = 3;
431 
432 // Minimum word size
433 static const int32_t LAO_MIN_WORD = 2;
434 
435 // Minimum number of characters for two words
436 static const int32_t LAO_MIN_WORD_SPAN = LAO_MIN_WORD * 2;
437 
LaoBreakEngine(DictionaryMatcher * adoptDictionary,UErrorCode & status)438 LaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
439     : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
440       fDictionary(adoptDictionary)
441 {
442     fLaoWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]]"), status);
443     if (U_SUCCESS(status)) {
444         setCharacters(fLaoWordSet);
445     }
446     fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status);
447     fMarkSet.add(0x0020);
448     fEndWordSet = fLaoWordSet;
449     fEndWordSet.remove(0x0EC0, 0x0EC4);     // prefix vowels
450     fBeginWordSet.add(0x0E81, 0x0EAE);      // basic consonants (including holes for corresponding Thai characters)
451     fBeginWordSet.add(0x0EDC, 0x0EDD);      // digraph consonants (no Thai equivalent)
452     fBeginWordSet.add(0x0EC0, 0x0EC4);      // prefix vowels
453 
454     // Compact for caching.
455     fMarkSet.compact();
456     fEndWordSet.compact();
457     fBeginWordSet.compact();
458 }
459 
~LaoBreakEngine()460 LaoBreakEngine::~LaoBreakEngine() {
461     delete fDictionary;
462 }
463 
464 int32_t
divideUpDictionaryRange(UText * text,int32_t rangeStart,int32_t rangeEnd,UVector32 & foundBreaks) const465 LaoBreakEngine::divideUpDictionaryRange( UText *text,
466                                                 int32_t rangeStart,
467                                                 int32_t rangeEnd,
468                                                 UVector32 &foundBreaks ) const {
469     if ((rangeEnd - rangeStart) < LAO_MIN_WORD_SPAN) {
470         return 0;       // Not enough characters for two words
471     }
472 
473     uint32_t wordsFound = 0;
474     int32_t cpWordLength = 0;
475     int32_t cuWordLength = 0;
476     int32_t current;
477     UErrorCode status = U_ZERO_ERROR;
478     PossibleWord words[LAO_LOOKAHEAD];
479 
480     utext_setNativeIndex(text, rangeStart);
481 
482     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
483         cuWordLength = 0;
484         cpWordLength = 0;
485 
486         // Look for candidate words at the current position
487         int32_t candidates = words[wordsFound%LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
488 
489         // If we found exactly one, use that
490         if (candidates == 1) {
491             cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
492             cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();
493             wordsFound += 1;
494         }
495         // If there was more than one, see which one can take us forward the most words
496         else if (candidates > 1) {
497             // If we're already at the end of the range, we're done
498             if (utext_getNativeIndex(text) >= rangeEnd) {
499                 goto foundBest;
500             }
501             do {
502                 int32_t wordsMatched = 1;
503                 if (words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
504                     if (wordsMatched < 2) {
505                         // Followed by another dictionary word; mark first word as a good candidate
506                         words[wordsFound%LAO_LOOKAHEAD].markCurrent();
507                         wordsMatched = 2;
508                     }
509 
510                     // If we're already at the end of the range, we're done
511                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
512                         goto foundBest;
513                     }
514 
515                     // See if any of the possible second words is followed by a third word
516                     do {
517                         // If we find a third word, stop right away
518                         if (words[(wordsFound + 2) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
519                             words[wordsFound % LAO_LOOKAHEAD].markCurrent();
520                             goto foundBest;
521                         }
522                     }
523                     while (words[(wordsFound + 1) % LAO_LOOKAHEAD].backUp(text));
524                 }
525             }
526             while (words[wordsFound % LAO_LOOKAHEAD].backUp(text));
527 foundBest:
528             cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
529             cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();
530             wordsFound += 1;
531         }
532 
533         // We come here after having either found a word or not. We look ahead to the
534         // next word. If it's not a dictionary word, we will combine it withe the word we
535         // just found (if there is one), but only if the preceding word does not exceed
536         // the threshold.
537         // The text iterator should now be positioned at the end of the word we found.
538         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < LAO_ROOT_COMBINE_THRESHOLD) {
539             // if it is a dictionary word, do nothing. If it isn't, then if there is
540             // no preceding word, or the non-word shares less than the minimum threshold
541             // of characters with a dictionary word, then scan to resynchronize
542             if (words[wordsFound % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
543                   && (cuWordLength == 0
544                       || words[wordsFound%LAO_LOOKAHEAD].longestPrefix() < LAO_PREFIX_COMBINE_THRESHOLD)) {
545                 // Look for a plausible word boundary
546                 int32_t remaining = rangeEnd - (current + cuWordLength);
547                 UChar32 pc;
548                 UChar32 uc;
549                 int32_t chars = 0;
550                 for (;;) {
551                     int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
552                     pc = utext_next32(text);
553                     int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
554                     chars += pcSize;
555                     remaining -= pcSize;
556                     if (remaining <= 0) {
557                         break;
558                     }
559                     uc = utext_current32(text);
560                     if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
561                         // Maybe. See if it's in the dictionary.
562                         // TODO: this looks iffy; compare with old code.
563                         int32_t candidates = words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
564                         utext_setNativeIndex(text, current + cuWordLength + chars);
565                         if (candidates > 0) {
566                             break;
567                         }
568                     }
569                 }
570 
571                 // Bump the word count if there wasn't already one
572                 if (cuWordLength <= 0) {
573                     wordsFound += 1;
574                 }
575 
576                 // Update the length with the passed-over characters
577                 cuWordLength += chars;
578             }
579             else {
580                 // Back up to where we were for next iteration
581                 utext_setNativeIndex(text, current + cuWordLength);
582             }
583         }
584 
585         // Never stop before a combining mark.
586         int32_t currPos;
587         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
588             utext_next32(text);
589             cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
590         }
591 
592         // Look ahead for possible suffixes if a dictionary word does not follow.
593         // We do this in code rather than using a rule so that the heuristic
594         // resynch continues to function. For example, one of the suffix characters
595         // could be a typo in the middle of a word.
596         // NOT CURRENTLY APPLICABLE TO LAO
597 
598         // Did we find a word on this iteration? If so, push it on the break stack
599         if (cuWordLength > 0) {
600             foundBreaks.push((current+cuWordLength), status);
601         }
602     }
603 
604     // Don't return a break for the end of the dictionary range if there is one there.
605     if (foundBreaks.peeki() >= rangeEnd) {
606         (void) foundBreaks.popi();
607         wordsFound -= 1;
608     }
609 
610     return wordsFound;
611 }
612 
613 /*
614  ******************************************************************
615  * BurmeseBreakEngine
616  */
617 
618 // How many words in a row are "good enough"?
619 static const int32_t BURMESE_LOOKAHEAD = 3;
620 
621 // Will not combine a non-word with a preceding dictionary word longer than this
622 static const int32_t BURMESE_ROOT_COMBINE_THRESHOLD = 3;
623 
624 // Will not combine a non-word that shares at least this much prefix with a
625 // dictionary word, with a preceding word
626 static const int32_t BURMESE_PREFIX_COMBINE_THRESHOLD = 3;
627 
628 // Minimum word size
629 static const int32_t BURMESE_MIN_WORD = 2;
630 
631 // Minimum number of characters for two words
632 static const int32_t BURMESE_MIN_WORD_SPAN = BURMESE_MIN_WORD * 2;
633 
BurmeseBreakEngine(DictionaryMatcher * adoptDictionary,UErrorCode & status)634 BurmeseBreakEngine::BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
635     : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
636       fDictionary(adoptDictionary)
637 {
638     fBurmeseWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]]"), status);
639     if (U_SUCCESS(status)) {
640         setCharacters(fBurmeseWordSet);
641     }
642     fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]&[:M:]]"), status);
643     fMarkSet.add(0x0020);
644     fEndWordSet = fBurmeseWordSet;
645     fBeginWordSet.add(0x1000, 0x102A);      // basic consonants and independent vowels
646 
647     // Compact for caching.
648     fMarkSet.compact();
649     fEndWordSet.compact();
650     fBeginWordSet.compact();
651 }
652 
~BurmeseBreakEngine()653 BurmeseBreakEngine::~BurmeseBreakEngine() {
654     delete fDictionary;
655 }
656 
657 int32_t
divideUpDictionaryRange(UText * text,int32_t rangeStart,int32_t rangeEnd,UVector32 & foundBreaks) const658 BurmeseBreakEngine::divideUpDictionaryRange( UText *text,
659                                                 int32_t rangeStart,
660                                                 int32_t rangeEnd,
661                                                 UVector32 &foundBreaks ) const {
662     if ((rangeEnd - rangeStart) < BURMESE_MIN_WORD_SPAN) {
663         return 0;       // Not enough characters for two words
664     }
665 
666     uint32_t wordsFound = 0;
667     int32_t cpWordLength = 0;
668     int32_t cuWordLength = 0;
669     int32_t current;
670     UErrorCode status = U_ZERO_ERROR;
671     PossibleWord words[BURMESE_LOOKAHEAD];
672 
673     utext_setNativeIndex(text, rangeStart);
674 
675     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
676         cuWordLength = 0;
677         cpWordLength = 0;
678 
679         // Look for candidate words at the current position
680         int32_t candidates = words[wordsFound%BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
681 
682         // If we found exactly one, use that
683         if (candidates == 1) {
684             cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);
685             cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();
686             wordsFound += 1;
687         }
688         // If there was more than one, see which one can take us forward the most words
689         else if (candidates > 1) {
690             // If we're already at the end of the range, we're done
691             if (utext_getNativeIndex(text) >= rangeEnd) {
692                 goto foundBest;
693             }
694             do {
695                 int32_t wordsMatched = 1;
696                 if (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
697                     if (wordsMatched < 2) {
698                         // Followed by another dictionary word; mark first word as a good candidate
699                         words[wordsFound%BURMESE_LOOKAHEAD].markCurrent();
700                         wordsMatched = 2;
701                     }
702 
703                     // If we're already at the end of the range, we're done
704                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
705                         goto foundBest;
706                     }
707 
708                     // See if any of the possible second words is followed by a third word
709                     do {
710                         // If we find a third word, stop right away
711                         if (words[(wordsFound + 2) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
712                             words[wordsFound % BURMESE_LOOKAHEAD].markCurrent();
713                             goto foundBest;
714                         }
715                     }
716                     while (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].backUp(text));
717                 }
718             }
719             while (words[wordsFound % BURMESE_LOOKAHEAD].backUp(text));
720 foundBest:
721             cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);
722             cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();
723             wordsFound += 1;
724         }
725 
726         // We come here after having either found a word or not. We look ahead to the
727         // next word. If it's not a dictionary word, we will combine it withe the word we
728         // just found (if there is one), but only if the preceding word does not exceed
729         // the threshold.
730         // The text iterator should now be positioned at the end of the word we found.
731         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < BURMESE_ROOT_COMBINE_THRESHOLD) {
732             // if it is a dictionary word, do nothing. If it isn't, then if there is
733             // no preceding word, or the non-word shares less than the minimum threshold
734             // of characters with a dictionary word, then scan to resynchronize
735             if (words[wordsFound % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
736                   && (cuWordLength == 0
737                       || words[wordsFound%BURMESE_LOOKAHEAD].longestPrefix() < BURMESE_PREFIX_COMBINE_THRESHOLD)) {
738                 // Look for a plausible word boundary
739                 int32_t remaining = rangeEnd - (current + cuWordLength);
740                 UChar32 pc;
741                 UChar32 uc;
742                 int32_t chars = 0;
743                 for (;;) {
744                     int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
745                     pc = utext_next32(text);
746                     int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
747                     chars += pcSize;
748                     remaining -= pcSize;
749                     if (remaining <= 0) {
750                         break;
751                     }
752                     uc = utext_current32(text);
753                     if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
754                         // Maybe. See if it's in the dictionary.
755                         // TODO: this looks iffy; compare with old code.
756                         int32_t candidates = words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
757                         utext_setNativeIndex(text, current + cuWordLength + chars);
758                         if (candidates > 0) {
759                             break;
760                         }
761                     }
762                 }
763 
764                 // Bump the word count if there wasn't already one
765                 if (cuWordLength <= 0) {
766                     wordsFound += 1;
767                 }
768 
769                 // Update the length with the passed-over characters
770                 cuWordLength += chars;
771             }
772             else {
773                 // Back up to where we were for next iteration
774                 utext_setNativeIndex(text, current + cuWordLength);
775             }
776         }
777 
778         // Never stop before a combining mark.
779         int32_t currPos;
780         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
781             utext_next32(text);
782             cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
783         }
784 
785         // Look ahead for possible suffixes if a dictionary word does not follow.
786         // We do this in code rather than using a rule so that the heuristic
787         // resynch continues to function. For example, one of the suffix characters
788         // could be a typo in the middle of a word.
789         // NOT CURRENTLY APPLICABLE TO BURMESE
790 
791         // Did we find a word on this iteration? If so, push it on the break stack
792         if (cuWordLength > 0) {
793             foundBreaks.push((current+cuWordLength), status);
794         }
795     }
796 
797     // Don't return a break for the end of the dictionary range if there is one there.
798     if (foundBreaks.peeki() >= rangeEnd) {
799         (void) foundBreaks.popi();
800         wordsFound -= 1;
801     }
802 
803     return wordsFound;
804 }
805 
806 /*
807  ******************************************************************
808  * KhmerBreakEngine
809  */
810 
811 // How many words in a row are "good enough"?
812 static const int32_t KHMER_LOOKAHEAD = 3;
813 
814 // Will not combine a non-word with a preceding dictionary word longer than this
815 static const int32_t KHMER_ROOT_COMBINE_THRESHOLD = 3;
816 
817 // Will not combine a non-word that shares at least this much prefix with a
818 // dictionary word, with a preceding word
819 static const int32_t KHMER_PREFIX_COMBINE_THRESHOLD = 3;
820 
821 // Minimum word size
822 static const int32_t KHMER_MIN_WORD = 2;
823 
824 // Minimum number of characters for two words
825 static const int32_t KHMER_MIN_WORD_SPAN = KHMER_MIN_WORD * 2;
826 
KhmerBreakEngine(DictionaryMatcher * adoptDictionary,UErrorCode & status)827 KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
828     : DictionaryBreakEngine((1 << UBRK_WORD) | (1 << UBRK_LINE)),
829       fDictionary(adoptDictionary)
830 {
831     fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status);
832     if (U_SUCCESS(status)) {
833         setCharacters(fKhmerWordSet);
834     }
835     fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status);
836     fMarkSet.add(0x0020);
837     fEndWordSet = fKhmerWordSet;
838     fBeginWordSet.add(0x1780, 0x17B3);
839     //fBeginWordSet.add(0x17A3, 0x17A4);      // deprecated vowels
840     //fEndWordSet.remove(0x17A5, 0x17A9);     // Khmer independent vowels that can't end a word
841     //fEndWordSet.remove(0x17B2);             // Khmer independent vowel that can't end a word
842     fEndWordSet.remove(0x17D2);             // KHMER SIGN COENG that combines some following characters
843     //fEndWordSet.remove(0x17B6, 0x17C5);     // Remove dependent vowels
844 //    fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
845 //    fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
846 //    fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
847 //    fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
848 //    fSuffixSet.add(THAI_PAIYANNOI);
849 //    fSuffixSet.add(THAI_MAIYAMOK);
850 
851     // Compact for caching.
852     fMarkSet.compact();
853     fEndWordSet.compact();
854     fBeginWordSet.compact();
855 //    fSuffixSet.compact();
856 }
857 
~KhmerBreakEngine()858 KhmerBreakEngine::~KhmerBreakEngine() {
859     delete fDictionary;
860 }
861 
862 int32_t
divideUpDictionaryRange(UText * text,int32_t rangeStart,int32_t rangeEnd,UVector32 & foundBreaks) const863 KhmerBreakEngine::divideUpDictionaryRange( UText *text,
864                                                 int32_t rangeStart,
865                                                 int32_t rangeEnd,
866                                                 UVector32 &foundBreaks ) const {
867     if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
868         return 0;       // Not enough characters for two words
869     }
870 
871     uint32_t wordsFound = 0;
872     int32_t cpWordLength = 0;
873     int32_t cuWordLength = 0;
874     int32_t current;
875     UErrorCode status = U_ZERO_ERROR;
876     PossibleWord words[KHMER_LOOKAHEAD];
877 
878     utext_setNativeIndex(text, rangeStart);
879 
880     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
881         cuWordLength = 0;
882         cpWordLength = 0;
883 
884         // Look for candidate words at the current position
885         int32_t candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
886 
887         // If we found exactly one, use that
888         if (candidates == 1) {
889             cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
890             cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
891             wordsFound += 1;
892         }
893 
894         // If there was more than one, see which one can take us forward the most words
895         else if (candidates > 1) {
896             // If we're already at the end of the range, we're done
897             if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
898                 goto foundBest;
899             }
900             do {
901                 int32_t wordsMatched = 1;
902                 if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
903                     if (wordsMatched < 2) {
904                         // Followed by another dictionary word; mark first word as a good candidate
905                         words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
906                         wordsMatched = 2;
907                     }
908 
909                     // If we're already at the end of the range, we're done
910                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
911                         goto foundBest;
912                     }
913 
914                     // See if any of the possible second words is followed by a third word
915                     do {
916                         // If we find a third word, stop right away
917                         if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
918                             words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
919                             goto foundBest;
920                         }
921                     }
922                     while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text));
923                 }
924             }
925             while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text));
926 foundBest:
927             cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
928             cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
929             wordsFound += 1;
930         }
931 
932         // We come here after having either found a word or not. We look ahead to the
933         // next word. If it's not a dictionary word, we will combine it with the word we
934         // just found (if there is one), but only if the preceding word does not exceed
935         // the threshold.
936         // The text iterator should now be positioned at the end of the word we found.
937         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < KHMER_ROOT_COMBINE_THRESHOLD) {
938             // if it is a dictionary word, do nothing. If it isn't, then if there is
939             // no preceding word, or the non-word shares less than the minimum threshold
940             // of characters with a dictionary word, then scan to resynchronize
941             if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
942                   && (cuWordLength == 0
943                       || words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) {
944                 // Look for a plausible word boundary
945                 int32_t remaining = rangeEnd - (current+cuWordLength);
946                 UChar32 pc;
947                 UChar32 uc;
948                 int32_t chars = 0;
949                 for (;;) {
950                     int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
951                     pc = utext_next32(text);
952                     int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
953                     chars += pcSize;
954                     remaining -= pcSize;
955                     if (remaining <= 0) {
956                         break;
957                     }
958                     uc = utext_current32(text);
959                     if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
960                         // Maybe. See if it's in the dictionary.
961                         int32_t candidates = words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
962                         utext_setNativeIndex(text, current+cuWordLength+chars);
963                         if (candidates > 0) {
964                             break;
965                         }
966                     }
967                 }
968 
969                 // Bump the word count if there wasn't already one
970                 if (cuWordLength <= 0) {
971                     wordsFound += 1;
972                 }
973 
974                 // Update the length with the passed-over characters
975                 cuWordLength += chars;
976             }
977             else {
978                 // Back up to where we were for next iteration
979                 utext_setNativeIndex(text, current+cuWordLength);
980             }
981         }
982 
983         // Never stop before a combining mark.
984         int32_t currPos;
985         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
986             utext_next32(text);
987             cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
988         }
989 
990         // Look ahead for possible suffixes if a dictionary word does not follow.
991         // We do this in code rather than using a rule so that the heuristic
992         // resynch continues to function. For example, one of the suffix characters
993         // could be a typo in the middle of a word.
994 //        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
995 //            if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
996 //                && fSuffixSet.contains(uc = utext_current32(text))) {
997 //                if (uc == KHMER_PAIYANNOI) {
998 //                    if (!fSuffixSet.contains(utext_previous32(text))) {
999 //                        // Skip over previous end and PAIYANNOI
1000 //                        utext_next32(text);
1001 //                        utext_next32(text);
1002 //                        wordLength += 1;            // Add PAIYANNOI to word
1003 //                        uc = utext_current32(text);     // Fetch next character
1004 //                    }
1005 //                    else {
1006 //                        // Restore prior position
1007 //                        utext_next32(text);
1008 //                    }
1009 //                }
1010 //                if (uc == KHMER_MAIYAMOK) {
1011 //                    if (utext_previous32(text) != KHMER_MAIYAMOK) {
1012 //                        // Skip over previous end and MAIYAMOK
1013 //                        utext_next32(text);
1014 //                        utext_next32(text);
1015 //                        wordLength += 1;            // Add MAIYAMOK to word
1016 //                    }
1017 //                    else {
1018 //                        // Restore prior position
1019 //                        utext_next32(text);
1020 //                    }
1021 //                }
1022 //            }
1023 //            else {
1024 //                utext_setNativeIndex(text, current+wordLength);
1025 //            }
1026 //        }
1027 
1028         // Did we find a word on this iteration? If so, push it on the break stack
1029         if (cuWordLength > 0) {
1030             foundBreaks.push((current+cuWordLength), status);
1031         }
1032     }
1033 
1034     // Don't return a break for the end of the dictionary range if there is one there.
1035     if (foundBreaks.peeki() >= rangeEnd) {
1036         (void) foundBreaks.popi();
1037         wordsFound -= 1;
1038     }
1039 
1040     return wordsFound;
1041 }
1042 
1043 #if !UCONFIG_NO_NORMALIZATION
1044 /*
1045  ******************************************************************
1046  * CjkBreakEngine
1047  */
1048 static const uint32_t kuint32max = 0xFFFFFFFF;
CjkBreakEngine(DictionaryMatcher * adoptDictionary,LanguageType type,UErrorCode & status)1049 CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status)
1050 : DictionaryBreakEngine(1 << UBRK_WORD), fDictionary(adoptDictionary) {
1051     // Korean dictionary only includes Hangul syllables
1052     fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status);
1053     fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status);
1054     fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status);
1055     fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status);
1056     nfkcNorm2 = Normalizer2::getNFKCInstance(status);
1057 
1058     if (U_SUCCESS(status)) {
1059         // handle Korean and Japanese/Chinese using different dictionaries
1060         if (type == kKorean) {
1061             setCharacters(fHangulWordSet);
1062         } else { //Chinese and Japanese
1063             UnicodeSet cjSet;
1064             cjSet.addAll(fHanWordSet);
1065             cjSet.addAll(fKatakanaWordSet);
1066             cjSet.addAll(fHiraganaWordSet);
1067             cjSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
1068             cjSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK
1069             setCharacters(cjSet);
1070         }
1071     }
1072 }
1073 
~CjkBreakEngine()1074 CjkBreakEngine::~CjkBreakEngine(){
1075     delete fDictionary;
1076 }
1077 
1078 // The katakanaCost values below are based on the length frequencies of all
1079 // katakana phrases in the dictionary
1080 static const int32_t kMaxKatakanaLength = 8;
1081 static const int32_t kMaxKatakanaGroupLength = 20;
1082 static const uint32_t maxSnlp = 255;
1083 
getKatakanaCost(int32_t wordLength)1084 static inline uint32_t getKatakanaCost(int32_t wordLength){
1085     //TODO: fill array with actual values from dictionary!
1086     static const uint32_t katakanaCost[kMaxKatakanaLength + 1]
1087                                        = {8192, 984, 408, 240, 204, 252, 300, 372, 480};
1088     return (wordLength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordLength];
1089 }
1090 
isKatakana(UChar32 value)1091 static inline bool isKatakana(UChar32 value) {
1092     return (value >= 0x30A1 && value <= 0x30FE && value != 0x30FB) ||
1093             (value >= 0xFF66 && value <= 0xFF9f);
1094 }
1095 
1096 
1097 // Function for accessing internal utext flags.
1098 //   Replicates an internal UText function.
1099 
utext_i32_flag(int32_t bitIndex)1100 static inline int32_t utext_i32_flag(int32_t bitIndex) {
1101     return (int32_t)1 << bitIndex;
1102 }
1103 
1104 
1105 /*
1106  * @param text A UText representing the text
1107  * @param rangeStart The start of the range of dictionary characters
1108  * @param rangeEnd The end of the range of dictionary characters
1109  * @param foundBreaks vector<int32> to receive the break positions
1110  * @return The number of breaks found
1111  */
1112 int32_t
divideUpDictionaryRange(UText * inText,int32_t rangeStart,int32_t rangeEnd,UVector32 & foundBreaks) const1113 CjkBreakEngine::divideUpDictionaryRange( UText *inText,
1114         int32_t rangeStart,
1115         int32_t rangeEnd,
1116         UVector32 &foundBreaks ) const {
1117     if (rangeStart >= rangeEnd) {
1118         return 0;
1119     }
1120 
1121     // UnicodeString version of input UText, NFKC normalized if necessary.
1122     UnicodeString inString;
1123 
1124     // inputMap[inStringIndex] = corresponding native index from UText inText.
1125     // If NULL then mapping is 1:1
1126     LocalPointer<UVector32>     inputMap;
1127 
1128     UErrorCode     status      = U_ZERO_ERROR;
1129 
1130 
1131     // if UText has the input string as one contiguous UTF-16 chunk
1132     if ((inText->providerProperties & utext_i32_flag(UTEXT_PROVIDER_STABLE_CHUNKS)) &&
1133          inText->chunkNativeStart <= rangeStart &&
1134          inText->chunkNativeLimit >= rangeEnd   &&
1135          inText->nativeIndexingLimit >= rangeEnd - inText->chunkNativeStart) {
1136 
1137         // Input UText is in one contiguous UTF-16 chunk.
1138         // Use Read-only aliasing UnicodeString.
1139         inString.setTo(FALSE,
1140                        inText->chunkContents + rangeStart - inText->chunkNativeStart,
1141                        rangeEnd - rangeStart);
1142     } else {
1143         // Copy the text from the original inText (UText) to inString (UnicodeString).
1144         // Create a map from UnicodeString indices -> UText offsets.
1145         utext_setNativeIndex(inText, rangeStart);
1146         int32_t limit = rangeEnd;
1147         U_ASSERT(limit <= utext_nativeLength(inText));
1148         if (limit > utext_nativeLength(inText)) {
1149             limit = (int32_t)utext_nativeLength(inText);
1150         }
1151         inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status);
1152         if (U_FAILURE(status)) {
1153             return 0;
1154         }
1155         while (utext_getNativeIndex(inText) < limit) {
1156             int32_t nativePosition = (int32_t)utext_getNativeIndex(inText);
1157             UChar32 c = utext_next32(inText);
1158             U_ASSERT(c != U_SENTINEL);
1159             inString.append(c);
1160             while (inputMap->size() < inString.length()) {
1161                 inputMap->addElement(nativePosition, status);
1162             }
1163         }
1164         inputMap->addElement(limit, status);
1165     }
1166 
1167 
1168     if (!nfkcNorm2->isNormalized(inString, status)) {
1169         UnicodeString normalizedInput;
1170         //  normalizedMap[normalizedInput position] ==  original UText position.
1171         LocalPointer<UVector32> normalizedMap(new UVector32(status), status);
1172         if (U_FAILURE(status)) {
1173             return 0;
1174         }
1175 
1176         UnicodeString fragment;
1177         UnicodeString normalizedFragment;
1178         for (int32_t srcI = 0; srcI < inString.length();) {  // Once per normalization chunk
1179             fragment.remove();
1180             int32_t fragmentStartI = srcI;
1181             UChar32 c = inString.char32At(srcI);
1182             for (;;) {
1183                 fragment.append(c);
1184                 srcI = inString.moveIndex32(srcI, 1);
1185                 if (srcI == inString.length()) {
1186                     break;
1187                 }
1188                 c = inString.char32At(srcI);
1189                 if (nfkcNorm2->hasBoundaryBefore(c)) {
1190                     break;
1191                 }
1192             }
1193             nfkcNorm2->normalize(fragment, normalizedFragment, status);
1194             normalizedInput.append(normalizedFragment);
1195 
1196             // Map every position in the normalized chunk to the start of the chunk
1197             //   in the original input.
1198             int32_t fragmentOriginalStart = inputMap.isValid() ?
1199                     inputMap->elementAti(fragmentStartI) : fragmentStartI+rangeStart;
1200             while (normalizedMap->size() < normalizedInput.length()) {
1201                 normalizedMap->addElement(fragmentOriginalStart, status);
1202                 if (U_FAILURE(status)) {
1203                     break;
1204                 }
1205             }
1206         }
1207         U_ASSERT(normalizedMap->size() == normalizedInput.length());
1208         int32_t nativeEnd = inputMap.isValid() ?
1209                 inputMap->elementAti(inString.length()) : inString.length()+rangeStart;
1210         normalizedMap->addElement(nativeEnd, status);
1211 
1212         inputMap.moveFrom(normalizedMap);
1213         inString.moveFrom(normalizedInput);
1214     }
1215 
1216     int32_t numCodePts = inString.countChar32();
1217     if (numCodePts != inString.length()) {
1218         // There are supplementary characters in the input.
1219         // The dictionary will produce boundary positions in terms of code point indexes,
1220         //   not in terms of code unit string indexes.
1221         // Use the inputMap mechanism to take care of this in addition to indexing differences
1222         //    from normalization and/or UTF-8 input.
1223         UBool hadExistingMap = inputMap.isValid();
1224         if (!hadExistingMap) {
1225             inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status);
1226             if (U_FAILURE(status)) {
1227                 return 0;
1228             }
1229         }
1230         int32_t cpIdx = 0;
1231         for (int32_t cuIdx = 0; ; cuIdx = inString.moveIndex32(cuIdx, 1)) {
1232             U_ASSERT(cuIdx >= cpIdx);
1233             if (hadExistingMap) {
1234                 inputMap->setElementAt(inputMap->elementAti(cuIdx), cpIdx);
1235             } else {
1236                 inputMap->addElement(cuIdx+rangeStart, status);
1237             }
1238             cpIdx++;
1239             if (cuIdx == inString.length()) {
1240                break;
1241             }
1242         }
1243     }
1244 
1245     // bestSnlp[i] is the snlp of the best segmentation of the first i
1246     // code points in the range to be matched.
1247     UVector32 bestSnlp(numCodePts + 1, status);
1248     bestSnlp.addElement(0, status);
1249     for(int32_t i = 1; i <= numCodePts; i++) {
1250         bestSnlp.addElement(kuint32max, status);
1251     }
1252 
1253 
1254     // prev[i] is the index of the last CJK code point in the previous word in
1255     // the best segmentation of the first i characters.
1256     UVector32 prev(numCodePts + 1, status);
1257     for(int32_t i = 0; i <= numCodePts; i++){
1258         prev.addElement(-1, status);
1259     }
1260 
1261     const int32_t maxWordSize = 20;
1262     UVector32 values(numCodePts, status);
1263     values.setSize(numCodePts);
1264     UVector32 lengths(numCodePts, status);
1265     lengths.setSize(numCodePts);
1266 
1267     UText fu = UTEXT_INITIALIZER;
1268     utext_openUnicodeString(&fu, &inString, &status);
1269 
1270     // Dynamic programming to find the best segmentation.
1271 
1272     // In outer loop, i  is the code point index,
1273     //                ix is the corresponding string (code unit) index.
1274     //    They differ when the string contains supplementary characters.
1275     int32_t ix = 0;
1276     bool is_prev_katakana = false;
1277     for (int32_t i = 0;  i < numCodePts;  ++i, ix = inString.moveIndex32(ix, 1)) {
1278         if ((uint32_t)bestSnlp.elementAti(i) == kuint32max) {
1279             continue;
1280         }
1281 
1282         int32_t count;
1283         utext_setNativeIndex(&fu, ix);
1284         count = fDictionary->matches(&fu, maxWordSize, numCodePts,
1285                              NULL, lengths.getBuffer(), values.getBuffer(), NULL);
1286                              // Note: lengths is filled with code point lengths
1287                              //       The NULL parameter is the ignored code unit lengths.
1288 
1289         // if there are no single character matches found in the dictionary
1290         // starting with this character, treat character as a 1-character word
1291         // with the highest value possible, i.e. the least likely to occur.
1292         // Exclude Korean characters from this treatment, as they should be left
1293         // together by default.
1294         if ((count == 0 || lengths.elementAti(0) != 1) &&
1295                 !fHangulWordSet.contains(inString.char32At(ix))) {
1296             values.setElementAt(maxSnlp, count);   // 255
1297             lengths.setElementAt(1, count++);
1298         }
1299 
1300         for (int32_t j = 0; j < count; j++) {
1301             uint32_t newSnlp = (uint32_t)bestSnlp.elementAti(i) + (uint32_t)values.elementAti(j);
1302             int32_t ln_j_i = lengths.elementAti(j) + i;
1303             if (newSnlp < (uint32_t)bestSnlp.elementAti(ln_j_i)) {
1304                 bestSnlp.setElementAt(newSnlp, ln_j_i);
1305                 prev.setElementAt(i, ln_j_i);
1306             }
1307         }
1308 
1309         // In Japanese,
1310         // Katakana word in single character is pretty rare. So we apply
1311         // the following heuristic to Katakana: any continuous run of Katakana
1312         // characters is considered a candidate word with a default cost
1313         // specified in the katakanaCost table according to its length.
1314 
1315         bool is_katakana = isKatakana(inString.char32At(ix));
1316         int32_t katakanaRunLength = 1;
1317         if (!is_prev_katakana && is_katakana) {
1318             int32_t j = inString.moveIndex32(ix, 1);
1319             // Find the end of the continuous run of Katakana characters
1320             while (j < inString.length() && katakanaRunLength < kMaxKatakanaGroupLength &&
1321                     isKatakana(inString.char32At(j))) {
1322                 j = inString.moveIndex32(j, 1);
1323                 katakanaRunLength++;
1324             }
1325             if (katakanaRunLength < kMaxKatakanaGroupLength) {
1326                 uint32_t newSnlp = bestSnlp.elementAti(i) + getKatakanaCost(katakanaRunLength);
1327                 if (newSnlp < (uint32_t)bestSnlp.elementAti(j)) {
1328                     bestSnlp.setElementAt(newSnlp, j);
1329                     prev.setElementAt(i, i+katakanaRunLength);  // prev[j] = i;
1330                 }
1331             }
1332         }
1333         is_prev_katakana = is_katakana;
1334     }
1335     utext_close(&fu);
1336 
1337     // Start pushing the optimal offset index into t_boundary (t for tentative).
1338     // prev[numCodePts] is guaranteed to be meaningful.
1339     // We'll first push in the reverse order, i.e.,
1340     // t_boundary[0] = numCodePts, and afterwards do a swap.
1341     UVector32 t_boundary(numCodePts+1, status);
1342 
1343     int32_t numBreaks = 0;
1344     // No segmentation found, set boundary to end of range
1345     if ((uint32_t)bestSnlp.elementAti(numCodePts) == kuint32max) {
1346         t_boundary.addElement(numCodePts, status);
1347         numBreaks++;
1348     } else {
1349         for (int32_t i = numCodePts; i > 0; i = prev.elementAti(i)) {
1350             t_boundary.addElement(i, status);
1351             numBreaks++;
1352         }
1353         U_ASSERT(prev.elementAti(t_boundary.elementAti(numBreaks - 1)) == 0);
1354     }
1355 
1356     // Add a break for the start of the dictionary range if there is not one
1357     // there already.
1358     if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) {
1359         t_boundary.addElement(0, status);
1360         numBreaks++;
1361     }
1362 
1363     // Now that we're done, convert positions in t_boundary[] (indices in
1364     // the normalized input string) back to indices in the original input UText
1365     // while reversing t_boundary and pushing values to foundBreaks.
1366     int32_t prevCPPos = -1;
1367     int32_t prevUTextPos = -1;
1368     for (int32_t i = numBreaks-1; i >= 0; i--) {
1369         int32_t cpPos = t_boundary.elementAti(i);
1370         U_ASSERT(cpPos > prevCPPos);
1371         int32_t utextPos =  inputMap.isValid() ? inputMap->elementAti(cpPos) : cpPos + rangeStart;
1372         U_ASSERT(utextPos >= prevUTextPos);
1373         if (utextPos > prevUTextPos) {
1374             // Boundaries are added to foundBreaks output in ascending order.
1375             U_ASSERT(foundBreaks.size() == 0 || foundBreaks.peeki() < utextPos);
1376             foundBreaks.push(utextPos, status);
1377         } else {
1378             // Normalization expanded the input text, the dictionary found a boundary
1379             // within the expansion, giving two boundaries with the same index in the
1380             // original text. Ignore the second. See ticket #12918.
1381             --numBreaks;
1382         }
1383         prevCPPos = cpPos;
1384         prevUTextPos = utextPos;
1385     }
1386     (void)prevCPPos; // suppress compiler warnings about unused variable
1387 
1388     // inString goes out of scope
1389     // inputMap goes out of scope
1390     return numBreaks;
1391 }
1392 #endif
1393 
1394 U_NAMESPACE_END
1395 
1396 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
1397 
1398