1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /**
4  *******************************************************************************
5  * Copyright (C) 2006-2016, International Business Machines Corporation
6  * and others. All Rights Reserved.
7  *******************************************************************************
8  */
9 
10 #include <utility>
11 
12 #include "unicode/utypes.h"
13 
14 #if !UCONFIG_NO_BREAK_ITERATION
15 
16 #include "brkeng.h"
17 #include "dictbe.h"
18 #include "unicode/uniset.h"
19 #include "unicode/chariter.h"
20 #include "unicode/ubrk.h"
21 #include "utracimp.h"
22 #include "uvectr32.h"
23 #include "uvector.h"
24 #include "uassert.h"
25 #include "unicode/normlzr.h"
26 #include "cmemory.h"
27 #include "dictionarydata.h"
28 
29 U_NAMESPACE_BEGIN
30 
31 /*
32  ******************************************************************
33  */
34 
DictionaryBreakEngine()35 DictionaryBreakEngine::DictionaryBreakEngine() {
36 }
37 
~DictionaryBreakEngine()38 DictionaryBreakEngine::~DictionaryBreakEngine() {
39 }
40 
41 UBool
handles(UChar32 c) const42 DictionaryBreakEngine::handles(UChar32 c) const {
43     return fSet.contains(c);
44 }
45 
46 int32_t
findBreaks(UText * text,int32_t startPos,int32_t endPos,UVector32 & foundBreaks,UErrorCode & status) const47 DictionaryBreakEngine::findBreaks( UText *text,
48                                  int32_t startPos,
49                                  int32_t endPos,
50                                  UVector32 &foundBreaks,
51                                  UErrorCode& status) const {
52     if (U_FAILURE(status)) return 0;
53     (void)startPos;            // TODO: remove this param?
54     int32_t result = 0;
55 
56     // Find the span of characters included in the set.
57     //   The span to break begins at the current position in the text, and
58     //   extends towards the start or end of the text, depending on 'reverse'.
59 
60     int32_t start = (int32_t)utext_getNativeIndex(text);
61     int32_t current;
62     int32_t rangeStart;
63     int32_t rangeEnd;
64     UChar32 c = utext_current32(text);
65     while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) {
66         utext_next32(text);         // TODO:  recast loop for postincrement
67         c = utext_current32(text);
68     }
69     rangeStart = start;
70     rangeEnd = current;
71     result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks, status);
72     utext_setNativeIndex(text, current);
73 
74     return result;
75 }
76 
77 void
setCharacters(const UnicodeSet & set)78 DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) {
79     fSet = set;
80     // Compact for caching
81     fSet.compact();
82 }
83 
84 /*
85  ******************************************************************
86  * PossibleWord
87  */
88 
89 // Helper class for improving readability of the Thai/Lao/Khmer word break
90 // algorithm. The implementation is completely inline.
91 
92 // List size, limited by the maximum number of words in the dictionary
93 // that form a nested sequence.
94 static const int32_t POSSIBLE_WORD_LIST_MAX = 20;
95 
96 class PossibleWord {
97 private:
98     // list of word candidate lengths, in increasing length order
99     // TODO: bytes would be sufficient for word lengths.
100     int32_t   count;      // Count of candidates
101     int32_t   prefix;     // The longest match with a dictionary word
102     int32_t   offset;     // Offset in the text of these candidates
103     int32_t   mark;       // The preferred candidate's offset
104     int32_t   current;    // The candidate we're currently looking at
105     int32_t   cuLengths[POSSIBLE_WORD_LIST_MAX];   // Word Lengths, in code units.
106     int32_t   cpLengths[POSSIBLE_WORD_LIST_MAX];   // Word Lengths, in code points.
107 
108 public:
PossibleWord()109     PossibleWord() : count(0), prefix(0), offset(-1), mark(0), current(0) {}
~PossibleWord()110     ~PossibleWord() {}
111 
112     // Fill the list of candidates if needed, select the longest, and return the number found
113     int32_t   candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd );
114 
115     // Select the currently marked candidate, point after it in the text, and invalidate self
116     int32_t   acceptMarked( UText *text );
117 
118     // Back up from the current candidate to the next shorter one; return TRUE if that exists
119     // and point the text after it
120     UBool     backUp( UText *text );
121 
122     // Return the longest prefix this candidate location shares with a dictionary word
123     // Return value is in code points.
longestPrefix()124     int32_t   longestPrefix() { return prefix; }
125 
126     // Mark the current candidate as the one we like
markCurrent()127     void      markCurrent() { mark = current; }
128 
129     // Get length in code points of the marked word.
markedCPLength()130     int32_t   markedCPLength() { return cpLengths[mark]; }
131 };
132 
133 
candidates(UText * text,DictionaryMatcher * dict,int32_t rangeEnd)134 int32_t PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ) {
135     // TODO: If getIndex is too slow, use offset < 0 and add discardAll()
136     int32_t start = (int32_t)utext_getNativeIndex(text);
137     if (start != offset) {
138         offset = start;
139         count = dict->matches(text, rangeEnd-start, UPRV_LENGTHOF(cuLengths), cuLengths, cpLengths, NULL, &prefix);
140         // Dictionary leaves text after longest prefix, not longest word. Back up.
141         if (count <= 0) {
142             utext_setNativeIndex(text, start);
143         }
144     }
145     if (count > 0) {
146         utext_setNativeIndex(text, start+cuLengths[count-1]);
147     }
148     current = count-1;
149     mark = current;
150     return count;
151 }
152 
153 int32_t
acceptMarked(UText * text)154 PossibleWord::acceptMarked( UText *text ) {
155     utext_setNativeIndex(text, offset + cuLengths[mark]);
156     return cuLengths[mark];
157 }
158 
159 
160 UBool
backUp(UText * text)161 PossibleWord::backUp( UText *text ) {
162     if (current > 0) {
163         utext_setNativeIndex(text, offset + cuLengths[--current]);
164         return TRUE;
165     }
166     return FALSE;
167 }
168 
169 /*
170  ******************************************************************
171  * ThaiBreakEngine
172  */
173 
174 // How many words in a row are "good enough"?
175 static const int32_t THAI_LOOKAHEAD = 3;
176 
177 // Will not combine a non-word with a preceding dictionary word longer than this
178 static const int32_t THAI_ROOT_COMBINE_THRESHOLD = 3;
179 
180 // Will not combine a non-word that shares at least this much prefix with a
181 // dictionary word, with a preceding word
182 static const int32_t THAI_PREFIX_COMBINE_THRESHOLD = 3;
183 
184 // Elision character
185 static const int32_t THAI_PAIYANNOI = 0x0E2F;
186 
187 // Repeat character
188 static const int32_t THAI_MAIYAMOK = 0x0E46;
189 
190 // Minimum word size
191 static const int32_t THAI_MIN_WORD = 2;
192 
193 // Minimum number of characters for two words
194 static const int32_t THAI_MIN_WORD_SPAN = THAI_MIN_WORD * 2;
195 
ThaiBreakEngine(DictionaryMatcher * adoptDictionary,UErrorCode & status)196 ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
197     : DictionaryBreakEngine(),
198       fDictionary(adoptDictionary)
199 {
200     UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
201     UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Thai");
202     fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status);
203     if (U_SUCCESS(status)) {
204         setCharacters(fThaiWordSet);
205     }
206     fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status);
207     fMarkSet.add(0x0020);
208     fEndWordSet = fThaiWordSet;
209     fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
210     fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
211     fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
212     fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
213     fSuffixSet.add(THAI_PAIYANNOI);
214     fSuffixSet.add(THAI_MAIYAMOK);
215 
216     // Compact for caching.
217     fMarkSet.compact();
218     fEndWordSet.compact();
219     fBeginWordSet.compact();
220     fSuffixSet.compact();
221     UTRACE_EXIT_STATUS(status);
222 }
223 
~ThaiBreakEngine()224 ThaiBreakEngine::~ThaiBreakEngine() {
225     delete fDictionary;
226 }
227 
228 int32_t
divideUpDictionaryRange(UText * text,int32_t rangeStart,int32_t rangeEnd,UVector32 & foundBreaks,UErrorCode & status) const229 ThaiBreakEngine::divideUpDictionaryRange( UText *text,
230                                                 int32_t rangeStart,
231                                                 int32_t rangeEnd,
232                                                 UVector32 &foundBreaks,
233                                                 UErrorCode& status) const {
234     if (U_FAILURE(status)) return 0;
235     utext_setNativeIndex(text, rangeStart);
236     utext_moveIndex32(text, THAI_MIN_WORD_SPAN);
237     if (utext_getNativeIndex(text) >= rangeEnd) {
238         return 0;       // Not enough characters for two words
239     }
240     utext_setNativeIndex(text, rangeStart);
241 
242 
243     uint32_t wordsFound = 0;
244     int32_t cpWordLength = 0;    // Word Length in Code Points.
245     int32_t cuWordLength = 0;    // Word length in code units (UText native indexing)
246     int32_t current;
247     PossibleWord words[THAI_LOOKAHEAD];
248 
249     utext_setNativeIndex(text, rangeStart);
250 
251     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
252         cpWordLength = 0;
253         cuWordLength = 0;
254 
255         // Look for candidate words at the current position
256         int32_t candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
257 
258         // If we found exactly one, use that
259         if (candidates == 1) {
260             cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
261             cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();
262             wordsFound += 1;
263         }
264         // If there was more than one, see which one can take us forward the most words
265         else if (candidates > 1) {
266             // If we're already at the end of the range, we're done
267             if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
268                 goto foundBest;
269             }
270             do {
271                 if (words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
272                     // Followed by another dictionary word; mark first word as a good candidate
273                     words[wordsFound%THAI_LOOKAHEAD].markCurrent();
274 
275                     // If we're already at the end of the range, we're done
276                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
277                         goto foundBest;
278                     }
279 
280                     // See if any of the possible second words is followed by a third word
281                     do {
282                         // If we find a third word, stop right away
283                         if (words[(wordsFound + 2) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
284                             words[wordsFound % THAI_LOOKAHEAD].markCurrent();
285                             goto foundBest;
286                         }
287                     }
288                     while (words[(wordsFound + 1) % THAI_LOOKAHEAD].backUp(text));
289                 }
290             }
291             while (words[wordsFound % THAI_LOOKAHEAD].backUp(text));
292 foundBest:
293             // Set UText position to after the accepted word.
294             cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
295             cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();
296             wordsFound += 1;
297         }
298 
299         // We come here after having either found a word or not. We look ahead to the
300         // next word. If it's not a dictionary word, we will combine it with the word we
301         // just found (if there is one), but only if the preceding word does not exceed
302         // the threshold.
303         // The text iterator should now be positioned at the end of the word we found.
304 
305         UChar32 uc = 0;
306         if ((int32_t)utext_getNativeIndex(text) < rangeEnd &&  cpWordLength < THAI_ROOT_COMBINE_THRESHOLD) {
307             // if it is a dictionary word, do nothing. If it isn't, then if there is
308             // no preceding word, or the non-word shares less than the minimum threshold
309             // of characters with a dictionary word, then scan to resynchronize
310             if (words[wordsFound % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
311                   && (cuWordLength == 0
312                       || words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) {
313                 // Look for a plausible word boundary
314                 int32_t remaining = rangeEnd - (current+cuWordLength);
315                 UChar32 pc;
316                 int32_t chars = 0;
317                 for (;;) {
318                     int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
319                     pc = utext_next32(text);
320                     int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
321                     chars += pcSize;
322                     remaining -= pcSize;
323                     if (remaining <= 0) {
324                         break;
325                     }
326                     uc = utext_current32(text);
327                     if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
328                         // Maybe. See if it's in the dictionary.
329                         // NOTE: In the original Apple code, checked that the next
330                         // two characters after uc were not 0x0E4C THANTHAKHAT before
331                         // checking the dictionary. That is just a performance filter,
332                         // but it's not clear it's faster than checking the trie.
333                         int32_t num_candidates = words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
334                         utext_setNativeIndex(text, current + cuWordLength + chars);
335                         if (num_candidates > 0) {
336                             break;
337                         }
338                     }
339                 }
340 
341                 // Bump the word count if there wasn't already one
342                 if (cuWordLength <= 0) {
343                     wordsFound += 1;
344                 }
345 
346                 // Update the length with the passed-over characters
347                 cuWordLength += chars;
348             }
349             else {
350                 // Back up to where we were for next iteration
351                 utext_setNativeIndex(text, current+cuWordLength);
352             }
353         }
354 
355         // Never stop before a combining mark.
356         int32_t currPos;
357         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
358             utext_next32(text);
359             cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
360         }
361 
362         // Look ahead for possible suffixes if a dictionary word does not follow.
363         // We do this in code rather than using a rule so that the heuristic
364         // resynch continues to function. For example, one of the suffix characters
365         // could be a typo in the middle of a word.
366         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cuWordLength > 0) {
367             if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
368                 && fSuffixSet.contains(uc = utext_current32(text))) {
369                 if (uc == THAI_PAIYANNOI) {
370                     if (!fSuffixSet.contains(utext_previous32(text))) {
371                         // Skip over previous end and PAIYANNOI
372                         utext_next32(text);
373                         int32_t paiyannoiIndex = (int32_t)utext_getNativeIndex(text);
374                         utext_next32(text);
375                         cuWordLength += (int32_t)utext_getNativeIndex(text) - paiyannoiIndex;    // Add PAIYANNOI to word
376                         uc = utext_current32(text);     // Fetch next character
377                     }
378                     else {
379                         // Restore prior position
380                         utext_next32(text);
381                     }
382                 }
383                 if (uc == THAI_MAIYAMOK) {
384                     if (utext_previous32(text) != THAI_MAIYAMOK) {
385                         // Skip over previous end and MAIYAMOK
386                         utext_next32(text);
387                         int32_t maiyamokIndex = (int32_t)utext_getNativeIndex(text);
388                         utext_next32(text);
389                         cuWordLength += (int32_t)utext_getNativeIndex(text) - maiyamokIndex;    // Add MAIYAMOK to word
390                     }
391                     else {
392                         // Restore prior position
393                         utext_next32(text);
394                     }
395                 }
396             }
397             else {
398                 utext_setNativeIndex(text, current+cuWordLength);
399             }
400         }
401 
402         // Did we find a word on this iteration? If so, push it on the break stack
403         if (cuWordLength > 0) {
404             foundBreaks.push((current+cuWordLength), status);
405         }
406     }
407 
408     // Don't return a break for the end of the dictionary range if there is one there.
409     if (foundBreaks.peeki() >= rangeEnd) {
410         (void) foundBreaks.popi();
411         wordsFound -= 1;
412     }
413 
414     return wordsFound;
415 }
416 
417 /*
418  ******************************************************************
419  * LaoBreakEngine
420  */
421 
422 // How many words in a row are "good enough"?
423 static const int32_t LAO_LOOKAHEAD = 3;
424 
425 // Will not combine a non-word with a preceding dictionary word longer than this
426 static const int32_t LAO_ROOT_COMBINE_THRESHOLD = 3;
427 
428 // Will not combine a non-word that shares at least this much prefix with a
429 // dictionary word, with a preceding word
430 static const int32_t LAO_PREFIX_COMBINE_THRESHOLD = 3;
431 
432 // Minimum word size
433 static const int32_t LAO_MIN_WORD = 2;
434 
435 // Minimum number of characters for two words
436 static const int32_t LAO_MIN_WORD_SPAN = LAO_MIN_WORD * 2;
437 
LaoBreakEngine(DictionaryMatcher * adoptDictionary,UErrorCode & status)438 LaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
439     : DictionaryBreakEngine(),
440       fDictionary(adoptDictionary)
441 {
442     UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
443     UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Laoo");
444     fLaoWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]]"), status);
445     if (U_SUCCESS(status)) {
446         setCharacters(fLaoWordSet);
447     }
448     fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status);
449     fMarkSet.add(0x0020);
450     fEndWordSet = fLaoWordSet;
451     fEndWordSet.remove(0x0EC0, 0x0EC4);     // prefix vowels
452     fBeginWordSet.add(0x0E81, 0x0EAE);      // basic consonants (including holes for corresponding Thai characters)
453     fBeginWordSet.add(0x0EDC, 0x0EDD);      // digraph consonants (no Thai equivalent)
454     fBeginWordSet.add(0x0EC0, 0x0EC4);      // prefix vowels
455 
456     // Compact for caching.
457     fMarkSet.compact();
458     fEndWordSet.compact();
459     fBeginWordSet.compact();
460     UTRACE_EXIT_STATUS(status);
461 }
462 
~LaoBreakEngine()463 LaoBreakEngine::~LaoBreakEngine() {
464     delete fDictionary;
465 }
466 
467 int32_t
divideUpDictionaryRange(UText * text,int32_t rangeStart,int32_t rangeEnd,UVector32 & foundBreaks,UErrorCode & status) const468 LaoBreakEngine::divideUpDictionaryRange( UText *text,
469                                                 int32_t rangeStart,
470                                                 int32_t rangeEnd,
471                                                 UVector32 &foundBreaks,
472                                                 UErrorCode& status) const {
473     if (U_FAILURE(status)) return 0;
474     if ((rangeEnd - rangeStart) < LAO_MIN_WORD_SPAN) {
475         return 0;       // Not enough characters for two words
476     }
477 
478     uint32_t wordsFound = 0;
479     int32_t cpWordLength = 0;
480     int32_t cuWordLength = 0;
481     int32_t current;
482     PossibleWord words[LAO_LOOKAHEAD];
483 
484     utext_setNativeIndex(text, rangeStart);
485 
486     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
487         cuWordLength = 0;
488         cpWordLength = 0;
489 
490         // Look for candidate words at the current position
491         int32_t candidates = words[wordsFound%LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
492 
493         // If we found exactly one, use that
494         if (candidates == 1) {
495             cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
496             cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();
497             wordsFound += 1;
498         }
499         // If there was more than one, see which one can take us forward the most words
500         else if (candidates > 1) {
501             // If we're already at the end of the range, we're done
502             if (utext_getNativeIndex(text) >= rangeEnd) {
503                 goto foundBest;
504             }
505             do {
506                 if (words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
507                     // Followed by another dictionary word; mark first word as a good candidate
508                     words[wordsFound%LAO_LOOKAHEAD].markCurrent();
509 
510                     // If we're already at the end of the range, we're done
511                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
512                         goto foundBest;
513                     }
514 
515                     // See if any of the possible second words is followed by a third word
516                     do {
517                         // If we find a third word, stop right away
518                         if (words[(wordsFound + 2) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
519                             words[wordsFound % LAO_LOOKAHEAD].markCurrent();
520                             goto foundBest;
521                         }
522                     }
523                     while (words[(wordsFound + 1) % LAO_LOOKAHEAD].backUp(text));
524                 }
525             }
526             while (words[wordsFound % LAO_LOOKAHEAD].backUp(text));
527 foundBest:
528             cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
529             cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();
530             wordsFound += 1;
531         }
532 
533         // We come here after having either found a word or not. We look ahead to the
534         // next word. If it's not a dictionary word, we will combine it with the word we
535         // just found (if there is one), but only if the preceding word does not exceed
536         // the threshold.
537         // The text iterator should now be positioned at the end of the word we found.
538         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < LAO_ROOT_COMBINE_THRESHOLD) {
539             // if it is a dictionary word, do nothing. If it isn't, then if there is
540             // no preceding word, or the non-word shares less than the minimum threshold
541             // of characters with a dictionary word, then scan to resynchronize
542             if (words[wordsFound % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
543                   && (cuWordLength == 0
544                       || words[wordsFound%LAO_LOOKAHEAD].longestPrefix() < LAO_PREFIX_COMBINE_THRESHOLD)) {
545                 // Look for a plausible word boundary
546                 int32_t remaining = rangeEnd - (current + cuWordLength);
547                 UChar32 pc;
548                 UChar32 uc;
549                 int32_t chars = 0;
550                 for (;;) {
551                     int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
552                     pc = utext_next32(text);
553                     int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
554                     chars += pcSize;
555                     remaining -= pcSize;
556                     if (remaining <= 0) {
557                         break;
558                     }
559                     uc = utext_current32(text);
560                     if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
561                         // Maybe. See if it's in the dictionary.
562                         // TODO: this looks iffy; compare with old code.
563                         int32_t num_candidates = words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
564                         utext_setNativeIndex(text, current + cuWordLength + chars);
565                         if (num_candidates > 0) {
566                             break;
567                         }
568                     }
569                 }
570 
571                 // Bump the word count if there wasn't already one
572                 if (cuWordLength <= 0) {
573                     wordsFound += 1;
574                 }
575 
576                 // Update the length with the passed-over characters
577                 cuWordLength += chars;
578             }
579             else {
580                 // Back up to where we were for next iteration
581                 utext_setNativeIndex(text, current + cuWordLength);
582             }
583         }
584 
585         // Never stop before a combining mark.
586         int32_t currPos;
587         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
588             utext_next32(text);
589             cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
590         }
591 
592         // Look ahead for possible suffixes if a dictionary word does not follow.
593         // We do this in code rather than using a rule so that the heuristic
594         // resynch continues to function. For example, one of the suffix characters
595         // could be a typo in the middle of a word.
596         // NOT CURRENTLY APPLICABLE TO LAO
597 
598         // Did we find a word on this iteration? If so, push it on the break stack
599         if (cuWordLength > 0) {
600             foundBreaks.push((current+cuWordLength), status);
601         }
602     }
603 
604     // Don't return a break for the end of the dictionary range if there is one there.
605     if (foundBreaks.peeki() >= rangeEnd) {
606         (void) foundBreaks.popi();
607         wordsFound -= 1;
608     }
609 
610     return wordsFound;
611 }
612 
613 /*
614  ******************************************************************
615  * BurmeseBreakEngine
616  */
617 
618 // How many words in a row are "good enough"?
619 static const int32_t BURMESE_LOOKAHEAD = 3;
620 
621 // Will not combine a non-word with a preceding dictionary word longer than this
622 static const int32_t BURMESE_ROOT_COMBINE_THRESHOLD = 3;
623 
624 // Will not combine a non-word that shares at least this much prefix with a
625 // dictionary word, with a preceding word
626 static const int32_t BURMESE_PREFIX_COMBINE_THRESHOLD = 3;
627 
628 // Minimum word size
629 static const int32_t BURMESE_MIN_WORD = 2;
630 
631 // Minimum number of characters for two words
632 static const int32_t BURMESE_MIN_WORD_SPAN = BURMESE_MIN_WORD * 2;
633 
BurmeseBreakEngine(DictionaryMatcher * adoptDictionary,UErrorCode & status)634 BurmeseBreakEngine::BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
635     : DictionaryBreakEngine(),
636       fDictionary(adoptDictionary)
637 {
638     UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
639     UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Mymr");
640     fBurmeseWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]]"), status);
641     if (U_SUCCESS(status)) {
642         setCharacters(fBurmeseWordSet);
643     }
644     fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]&[:M:]]"), status);
645     fMarkSet.add(0x0020);
646     fEndWordSet = fBurmeseWordSet;
647     fBeginWordSet.add(0x1000, 0x102A);      // basic consonants and independent vowels
648 
649     // Compact for caching.
650     fMarkSet.compact();
651     fEndWordSet.compact();
652     fBeginWordSet.compact();
653     UTRACE_EXIT_STATUS(status);
654 }
655 
~BurmeseBreakEngine()656 BurmeseBreakEngine::~BurmeseBreakEngine() {
657     delete fDictionary;
658 }
659 
660 int32_t
divideUpDictionaryRange(UText * text,int32_t rangeStart,int32_t rangeEnd,UVector32 & foundBreaks,UErrorCode & status) const661 BurmeseBreakEngine::divideUpDictionaryRange( UText *text,
662                                                 int32_t rangeStart,
663                                                 int32_t rangeEnd,
664                                                 UVector32 &foundBreaks,
665                                                 UErrorCode& status ) const {
666     if (U_FAILURE(status)) return 0;
667     if ((rangeEnd - rangeStart) < BURMESE_MIN_WORD_SPAN) {
668         return 0;       // Not enough characters for two words
669     }
670 
671     uint32_t wordsFound = 0;
672     int32_t cpWordLength = 0;
673     int32_t cuWordLength = 0;
674     int32_t current;
675     PossibleWord words[BURMESE_LOOKAHEAD];
676 
677     utext_setNativeIndex(text, rangeStart);
678 
679     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
680         cuWordLength = 0;
681         cpWordLength = 0;
682 
683         // Look for candidate words at the current position
684         int32_t candidates = words[wordsFound%BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
685 
686         // If we found exactly one, use that
687         if (candidates == 1) {
688             cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);
689             cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();
690             wordsFound += 1;
691         }
692         // If there was more than one, see which one can take us forward the most words
693         else if (candidates > 1) {
694             // If we're already at the end of the range, we're done
695             if (utext_getNativeIndex(text) >= rangeEnd) {
696                 goto foundBest;
697             }
698             do {
699                 if (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
700                     // Followed by another dictionary word; mark first word as a good candidate
701                     words[wordsFound%BURMESE_LOOKAHEAD].markCurrent();
702 
703                     // If we're already at the end of the range, we're done
704                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
705                         goto foundBest;
706                     }
707 
708                     // See if any of the possible second words is followed by a third word
709                     do {
710                         // If we find a third word, stop right away
711                         if (words[(wordsFound + 2) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
712                             words[wordsFound % BURMESE_LOOKAHEAD].markCurrent();
713                             goto foundBest;
714                         }
715                     }
716                     while (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].backUp(text));
717                 }
718             }
719             while (words[wordsFound % BURMESE_LOOKAHEAD].backUp(text));
720 foundBest:
721             cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);
722             cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();
723             wordsFound += 1;
724         }
725 
726         // We come here after having either found a word or not. We look ahead to the
727         // next word. If it's not a dictionary word, we will combine it with the word we
728         // just found (if there is one), but only if the preceding word does not exceed
729         // the threshold.
730         // The text iterator should now be positioned at the end of the word we found.
731         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < BURMESE_ROOT_COMBINE_THRESHOLD) {
732             // if it is a dictionary word, do nothing. If it isn't, then if there is
733             // no preceding word, or the non-word shares less than the minimum threshold
734             // of characters with a dictionary word, then scan to resynchronize
735             if (words[wordsFound % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
736                   && (cuWordLength == 0
737                       || words[wordsFound%BURMESE_LOOKAHEAD].longestPrefix() < BURMESE_PREFIX_COMBINE_THRESHOLD)) {
738                 // Look for a plausible word boundary
739                 int32_t remaining = rangeEnd - (current + cuWordLength);
740                 UChar32 pc;
741                 UChar32 uc;
742                 int32_t chars = 0;
743                 for (;;) {
744                     int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
745                     pc = utext_next32(text);
746                     int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
747                     chars += pcSize;
748                     remaining -= pcSize;
749                     if (remaining <= 0) {
750                         break;
751                     }
752                     uc = utext_current32(text);
753                     if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
754                         // Maybe. See if it's in the dictionary.
755                         // TODO: this looks iffy; compare with old code.
756                         int32_t num_candidates = words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
757                         utext_setNativeIndex(text, current + cuWordLength + chars);
758                         if (num_candidates > 0) {
759                             break;
760                         }
761                     }
762                 }
763 
764                 // Bump the word count if there wasn't already one
765                 if (cuWordLength <= 0) {
766                     wordsFound += 1;
767                 }
768 
769                 // Update the length with the passed-over characters
770                 cuWordLength += chars;
771             }
772             else {
773                 // Back up to where we were for next iteration
774                 utext_setNativeIndex(text, current + cuWordLength);
775             }
776         }
777 
778         // Never stop before a combining mark.
779         int32_t currPos;
780         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
781             utext_next32(text);
782             cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
783         }
784 
785         // Look ahead for possible suffixes if a dictionary word does not follow.
786         // We do this in code rather than using a rule so that the heuristic
787         // resynch continues to function. For example, one of the suffix characters
788         // could be a typo in the middle of a word.
789         // NOT CURRENTLY APPLICABLE TO BURMESE
790 
791         // Did we find a word on this iteration? If so, push it on the break stack
792         if (cuWordLength > 0) {
793             foundBreaks.push((current+cuWordLength), status);
794         }
795     }
796 
797     // Don't return a break for the end of the dictionary range if there is one there.
798     if (foundBreaks.peeki() >= rangeEnd) {
799         (void) foundBreaks.popi();
800         wordsFound -= 1;
801     }
802 
803     return wordsFound;
804 }
805 
806 /*
807  ******************************************************************
808  * KhmerBreakEngine
809  */
810 
811 // How many words in a row are "good enough"?
812 static const int32_t KHMER_LOOKAHEAD = 3;
813 
814 // Will not combine a non-word with a preceding dictionary word longer than this
815 static const int32_t KHMER_ROOT_COMBINE_THRESHOLD = 3;
816 
817 // Will not combine a non-word that shares at least this much prefix with a
818 // dictionary word, with a preceding word
819 static const int32_t KHMER_PREFIX_COMBINE_THRESHOLD = 3;
820 
821 // Minimum word size
822 static const int32_t KHMER_MIN_WORD = 2;
823 
824 // Minimum number of characters for two words
825 static const int32_t KHMER_MIN_WORD_SPAN = KHMER_MIN_WORD * 2;
826 
KhmerBreakEngine(DictionaryMatcher * adoptDictionary,UErrorCode & status)827 KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
828     : DictionaryBreakEngine(),
829       fDictionary(adoptDictionary)
830 {
831     UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
832     UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Khmr");
833     fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status);
834     if (U_SUCCESS(status)) {
835         setCharacters(fKhmerWordSet);
836     }
837     fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status);
838     fMarkSet.add(0x0020);
839     fEndWordSet = fKhmerWordSet;
840     fBeginWordSet.add(0x1780, 0x17B3);
841     //fBeginWordSet.add(0x17A3, 0x17A4);      // deprecated vowels
842     //fEndWordSet.remove(0x17A5, 0x17A9);     // Khmer independent vowels that can't end a word
843     //fEndWordSet.remove(0x17B2);             // Khmer independent vowel that can't end a word
844     fEndWordSet.remove(0x17D2);             // KHMER SIGN COENG that combines some following characters
845     //fEndWordSet.remove(0x17B6, 0x17C5);     // Remove dependent vowels
846 //    fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
847 //    fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
848 //    fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
849 //    fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
850 //    fSuffixSet.add(THAI_PAIYANNOI);
851 //    fSuffixSet.add(THAI_MAIYAMOK);
852 
853     // Compact for caching.
854     fMarkSet.compact();
855     fEndWordSet.compact();
856     fBeginWordSet.compact();
857 //    fSuffixSet.compact();
858     UTRACE_EXIT_STATUS(status);
859 }
860 
~KhmerBreakEngine()861 KhmerBreakEngine::~KhmerBreakEngine() {
862     delete fDictionary;
863 }
864 
865 int32_t
divideUpDictionaryRange(UText * text,int32_t rangeStart,int32_t rangeEnd,UVector32 & foundBreaks,UErrorCode & status) const866 KhmerBreakEngine::divideUpDictionaryRange( UText *text,
867                                                 int32_t rangeStart,
868                                                 int32_t rangeEnd,
869                                                 UVector32 &foundBreaks,
870                                                 UErrorCode& status ) const {
871     if (U_FAILURE(status)) return 0;
872     if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
873         return 0;       // Not enough characters for two words
874     }
875 
876     uint32_t wordsFound = 0;
877     int32_t cpWordLength = 0;
878     int32_t cuWordLength = 0;
879     int32_t current;
880     PossibleWord words[KHMER_LOOKAHEAD];
881 
882     utext_setNativeIndex(text, rangeStart);
883 
884     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
885         cuWordLength = 0;
886         cpWordLength = 0;
887 
888         // Look for candidate words at the current position
889         int32_t candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
890 
891         // If we found exactly one, use that
892         if (candidates == 1) {
893             cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
894             cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
895             wordsFound += 1;
896         }
897 
898         // If there was more than one, see which one can take us forward the most words
899         else if (candidates > 1) {
900             // If we're already at the end of the range, we're done
901             if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
902                 goto foundBest;
903             }
904             do {
905                 if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
906                     // Followed by another dictionary word; mark first word as a good candidate
907                     words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
908 
909                     // If we're already at the end of the range, we're done
910                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
911                         goto foundBest;
912                     }
913 
914                     // See if any of the possible second words is followed by a third word
915                     do {
916                         // If we find a third word, stop right away
917                         if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
918                             words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
919                             goto foundBest;
920                         }
921                     }
922                     while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text));
923                 }
924             }
925             while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text));
926 foundBest:
927             cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
928             cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
929             wordsFound += 1;
930         }
931 
932         // We come here after having either found a word or not. We look ahead to the
933         // next word. If it's not a dictionary word, we will combine it with the word we
934         // just found (if there is one), but only if the preceding word does not exceed
935         // the threshold.
936         // The text iterator should now be positioned at the end of the word we found.
937         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < KHMER_ROOT_COMBINE_THRESHOLD) {
938             // if it is a dictionary word, do nothing. If it isn't, then if there is
939             // no preceding word, or the non-word shares less than the minimum threshold
940             // of characters with a dictionary word, then scan to resynchronize
941             if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
942                   && (cuWordLength == 0
943                       || words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) {
944                 // Look for a plausible word boundary
945                 int32_t remaining = rangeEnd - (current+cuWordLength);
946                 UChar32 pc;
947                 UChar32 uc;
948                 int32_t chars = 0;
949                 for (;;) {
950                     int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
951                     pc = utext_next32(text);
952                     int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
953                     chars += pcSize;
954                     remaining -= pcSize;
955                     if (remaining <= 0) {
956                         break;
957                     }
958                     uc = utext_current32(text);
959                     if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
960                         // Maybe. See if it's in the dictionary.
961                         int32_t num_candidates = words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
962                         utext_setNativeIndex(text, current+cuWordLength+chars);
963                         if (num_candidates > 0) {
964                             break;
965                         }
966                     }
967                 }
968 
969                 // Bump the word count if there wasn't already one
970                 if (cuWordLength <= 0) {
971                     wordsFound += 1;
972                 }
973 
974                 // Update the length with the passed-over characters
975                 cuWordLength += chars;
976             }
977             else {
978                 // Back up to where we were for next iteration
979                 utext_setNativeIndex(text, current+cuWordLength);
980             }
981         }
982 
983         // Never stop before a combining mark.
984         int32_t currPos;
985         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
986             utext_next32(text);
987             cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
988         }
989 
990         // Look ahead for possible suffixes if a dictionary word does not follow.
991         // We do this in code rather than using a rule so that the heuristic
992         // resynch continues to function. For example, one of the suffix characters
993         // could be a typo in the middle of a word.
994 //        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
995 //            if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
996 //                && fSuffixSet.contains(uc = utext_current32(text))) {
997 //                if (uc == KHMER_PAIYANNOI) {
998 //                    if (!fSuffixSet.contains(utext_previous32(text))) {
999 //                        // Skip over previous end and PAIYANNOI
1000 //                        utext_next32(text);
1001 //                        utext_next32(text);
1002 //                        wordLength += 1;            // Add PAIYANNOI to word
1003 //                        uc = utext_current32(text);     // Fetch next character
1004 //                    }
1005 //                    else {
1006 //                        // Restore prior position
1007 //                        utext_next32(text);
1008 //                    }
1009 //                }
1010 //                if (uc == KHMER_MAIYAMOK) {
1011 //                    if (utext_previous32(text) != KHMER_MAIYAMOK) {
1012 //                        // Skip over previous end and MAIYAMOK
1013 //                        utext_next32(text);
1014 //                        utext_next32(text);
1015 //                        wordLength += 1;            // Add MAIYAMOK to word
1016 //                    }
1017 //                    else {
1018 //                        // Restore prior position
1019 //                        utext_next32(text);
1020 //                    }
1021 //                }
1022 //            }
1023 //            else {
1024 //                utext_setNativeIndex(text, current+wordLength);
1025 //            }
1026 //        }
1027 
1028         // Did we find a word on this iteration? If so, push it on the break stack
1029         if (cuWordLength > 0) {
1030             foundBreaks.push((current+cuWordLength), status);
1031         }
1032     }
1033 
1034     // Don't return a break for the end of the dictionary range if there is one there.
1035     if (foundBreaks.peeki() >= rangeEnd) {
1036         (void) foundBreaks.popi();
1037         wordsFound -= 1;
1038     }
1039 
1040     return wordsFound;
1041 }
1042 
1043 #if !UCONFIG_NO_NORMALIZATION
1044 /*
1045  ******************************************************************
1046  * CjkBreakEngine
1047  */
1048 static const uint32_t kuint32max = 0xFFFFFFFF;
CjkBreakEngine(DictionaryMatcher * adoptDictionary,LanguageType type,UErrorCode & status)1049 CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status)
1050 : DictionaryBreakEngine(), fDictionary(adoptDictionary) {
1051     UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
1052     UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Hani");
1053     // Korean dictionary only includes Hangul syllables
1054     fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status);
1055     fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status);
1056     fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status);
1057     fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status);
1058     nfkcNorm2 = Normalizer2::getNFKCInstance(status);
1059 
1060     if (U_SUCCESS(status)) {
1061         // handle Korean and Japanese/Chinese using different dictionaries
1062         if (type == kKorean) {
1063             setCharacters(fHangulWordSet);
1064         } else { //Chinese and Japanese
1065             UnicodeSet cjSet;
1066             cjSet.addAll(fHanWordSet);
1067             cjSet.addAll(fKatakanaWordSet);
1068             cjSet.addAll(fHiraganaWordSet);
1069             cjSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
1070             cjSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK
1071             setCharacters(cjSet);
1072         }
1073     }
1074     UTRACE_EXIT_STATUS(status);
1075 }
1076 
~CjkBreakEngine()1077 CjkBreakEngine::~CjkBreakEngine(){
1078     delete fDictionary;
1079 }
1080 
1081 // The katakanaCost values below are based on the length frequencies of all
1082 // katakana phrases in the dictionary
1083 static const int32_t kMaxKatakanaLength = 8;
1084 static const int32_t kMaxKatakanaGroupLength = 20;
1085 static const uint32_t maxSnlp = 255;
1086 
getKatakanaCost(int32_t wordLength)1087 static inline uint32_t getKatakanaCost(int32_t wordLength){
1088     //TODO: fill array with actual values from dictionary!
1089     static const uint32_t katakanaCost[kMaxKatakanaLength + 1]
1090                                        = {8192, 984, 408, 240, 204, 252, 300, 372, 480};
1091     return (wordLength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordLength];
1092 }
1093 
isKatakana(UChar32 value)1094 static inline bool isKatakana(UChar32 value) {
1095     return (value >= 0x30A1 && value <= 0x30FE && value != 0x30FB) ||
1096             (value >= 0xFF66 && value <= 0xFF9f);
1097 }
1098 
1099 
1100 // Function for accessing internal utext flags.
1101 //   Replicates an internal UText function.
1102 
utext_i32_flag(int32_t bitIndex)1103 static inline int32_t utext_i32_flag(int32_t bitIndex) {
1104     return (int32_t)1 << bitIndex;
1105 }
1106 
1107 
1108 /*
1109  * @param text A UText representing the text
1110  * @param rangeStart The start of the range of dictionary characters
1111  * @param rangeEnd The end of the range of dictionary characters
1112  * @param foundBreaks vector<int32> to receive the break positions
1113  * @return The number of breaks found
1114  */
1115 int32_t
divideUpDictionaryRange(UText * inText,int32_t rangeStart,int32_t rangeEnd,UVector32 & foundBreaks,UErrorCode & status) const1116 CjkBreakEngine::divideUpDictionaryRange( UText *inText,
1117         int32_t rangeStart,
1118         int32_t rangeEnd,
1119         UVector32 &foundBreaks,
1120         UErrorCode& status) const {
1121     if (U_FAILURE(status)) return 0;
1122     if (rangeStart >= rangeEnd) {
1123         return 0;
1124     }
1125 
1126     // UnicodeString version of input UText, NFKC normalized if necessary.
1127     UnicodeString inString;
1128 
1129     // inputMap[inStringIndex] = corresponding native index from UText inText.
1130     // If NULL then mapping is 1:1
1131     LocalPointer<UVector32>     inputMap;
1132 
1133     // if UText has the input string as one contiguous UTF-16 chunk
1134     if ((inText->providerProperties & utext_i32_flag(UTEXT_PROVIDER_STABLE_CHUNKS)) &&
1135          inText->chunkNativeStart <= rangeStart &&
1136          inText->chunkNativeLimit >= rangeEnd   &&
1137          inText->nativeIndexingLimit >= rangeEnd - inText->chunkNativeStart) {
1138 
1139         // Input UText is in one contiguous UTF-16 chunk.
1140         // Use Read-only aliasing UnicodeString.
1141         inString.setTo(FALSE,
1142                        inText->chunkContents + rangeStart - inText->chunkNativeStart,
1143                        rangeEnd - rangeStart);
1144     } else {
1145         // Copy the text from the original inText (UText) to inString (UnicodeString).
1146         // Create a map from UnicodeString indices -> UText offsets.
1147         utext_setNativeIndex(inText, rangeStart);
1148         int32_t limit = rangeEnd;
1149         U_ASSERT(limit <= utext_nativeLength(inText));
1150         if (limit > utext_nativeLength(inText)) {
1151             limit = (int32_t)utext_nativeLength(inText);
1152         }
1153         inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status);
1154         if (U_FAILURE(status)) {
1155             return 0;
1156         }
1157         while (utext_getNativeIndex(inText) < limit) {
1158             int32_t nativePosition = (int32_t)utext_getNativeIndex(inText);
1159             UChar32 c = utext_next32(inText);
1160             U_ASSERT(c != U_SENTINEL);
1161             inString.append(c);
1162             while (inputMap->size() < inString.length()) {
1163                 inputMap->addElement(nativePosition, status);
1164             }
1165         }
1166         inputMap->addElement(limit, status);
1167     }
1168 
1169 
1170     if (!nfkcNorm2->isNormalized(inString, status)) {
1171         UnicodeString normalizedInput;
1172         //  normalizedMap[normalizedInput position] ==  original UText position.
1173         LocalPointer<UVector32> normalizedMap(new UVector32(status), status);
1174         if (U_FAILURE(status)) {
1175             return 0;
1176         }
1177 
1178         UnicodeString fragment;
1179         UnicodeString normalizedFragment;
1180         for (int32_t srcI = 0; srcI < inString.length();) {  // Once per normalization chunk
1181             fragment.remove();
1182             int32_t fragmentStartI = srcI;
1183             UChar32 c = inString.char32At(srcI);
1184             for (;;) {
1185                 fragment.append(c);
1186                 srcI = inString.moveIndex32(srcI, 1);
1187                 if (srcI == inString.length()) {
1188                     break;
1189                 }
1190                 c = inString.char32At(srcI);
1191                 if (nfkcNorm2->hasBoundaryBefore(c)) {
1192                     break;
1193                 }
1194             }
1195             nfkcNorm2->normalize(fragment, normalizedFragment, status);
1196             normalizedInput.append(normalizedFragment);
1197 
1198             // Map every position in the normalized chunk to the start of the chunk
1199             //   in the original input.
1200             int32_t fragmentOriginalStart = inputMap.isValid() ?
1201                     inputMap->elementAti(fragmentStartI) : fragmentStartI+rangeStart;
1202             while (normalizedMap->size() < normalizedInput.length()) {
1203                 normalizedMap->addElement(fragmentOriginalStart, status);
1204                 if (U_FAILURE(status)) {
1205                     break;
1206                 }
1207             }
1208         }
1209         U_ASSERT(normalizedMap->size() == normalizedInput.length());
1210         int32_t nativeEnd = inputMap.isValid() ?
1211                 inputMap->elementAti(inString.length()) : inString.length()+rangeStart;
1212         normalizedMap->addElement(nativeEnd, status);
1213 
1214         inputMap = std::move(normalizedMap);
1215         inString = std::move(normalizedInput);
1216     }
1217 
1218     int32_t numCodePts = inString.countChar32();
1219     if (numCodePts != inString.length()) {
1220         // There are supplementary characters in the input.
1221         // The dictionary will produce boundary positions in terms of code point indexes,
1222         //   not in terms of code unit string indexes.
1223         // Use the inputMap mechanism to take care of this in addition to indexing differences
1224         //    from normalization and/or UTF-8 input.
1225         UBool hadExistingMap = inputMap.isValid();
1226         if (!hadExistingMap) {
1227             inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status);
1228             if (U_FAILURE(status)) {
1229                 return 0;
1230             }
1231         }
1232         int32_t cpIdx = 0;
1233         for (int32_t cuIdx = 0; ; cuIdx = inString.moveIndex32(cuIdx, 1)) {
1234             U_ASSERT(cuIdx >= cpIdx);
1235             if (hadExistingMap) {
1236                 inputMap->setElementAt(inputMap->elementAti(cuIdx), cpIdx);
1237             } else {
1238                 inputMap->addElement(cuIdx+rangeStart, status);
1239             }
1240             cpIdx++;
1241             if (cuIdx == inString.length()) {
1242                break;
1243             }
1244         }
1245     }
1246 
1247     // bestSnlp[i] is the snlp of the best segmentation of the first i
1248     // code points in the range to be matched.
1249     UVector32 bestSnlp(numCodePts + 1, status);
1250     bestSnlp.addElement(0, status);
1251     for(int32_t i = 1; i <= numCodePts; i++) {
1252         bestSnlp.addElement(kuint32max, status);
1253     }
1254 
1255 
1256     // prev[i] is the index of the last CJK code point in the previous word in
1257     // the best segmentation of the first i characters.
1258     UVector32 prev(numCodePts + 1, status);
1259     for(int32_t i = 0; i <= numCodePts; i++){
1260         prev.addElement(-1, status);
1261     }
1262 
1263     const int32_t maxWordSize = 20;
1264     UVector32 values(numCodePts, status);
1265     values.setSize(numCodePts);
1266     UVector32 lengths(numCodePts, status);
1267     lengths.setSize(numCodePts);
1268 
1269     UText fu = UTEXT_INITIALIZER;
1270     utext_openUnicodeString(&fu, &inString, &status);
1271 
1272     // Dynamic programming to find the best segmentation.
1273 
1274     // In outer loop, i  is the code point index,
1275     //                ix is the corresponding string (code unit) index.
1276     //    They differ when the string contains supplementary characters.
1277     int32_t ix = 0;
1278     bool is_prev_katakana = false;
1279     for (int32_t i = 0;  i < numCodePts;  ++i, ix = inString.moveIndex32(ix, 1)) {
1280         if ((uint32_t)bestSnlp.elementAti(i) == kuint32max) {
1281             continue;
1282         }
1283 
1284         int32_t count;
1285         utext_setNativeIndex(&fu, ix);
1286         count = fDictionary->matches(&fu, maxWordSize, numCodePts,
1287                              NULL, lengths.getBuffer(), values.getBuffer(), NULL);
1288                              // Note: lengths is filled with code point lengths
1289                              //       The NULL parameter is the ignored code unit lengths.
1290 
1291         // if there are no single character matches found in the dictionary
1292         // starting with this character, treat character as a 1-character word
1293         // with the highest value possible, i.e. the least likely to occur.
1294         // Exclude Korean characters from this treatment, as they should be left
1295         // together by default.
1296         if ((count == 0 || lengths.elementAti(0) != 1) &&
1297                 !fHangulWordSet.contains(inString.char32At(ix))) {
1298             values.setElementAt(maxSnlp, count);   // 255
1299             lengths.setElementAt(1, count++);
1300         }
1301 
1302         for (int32_t j = 0; j < count; j++) {
1303             uint32_t newSnlp = (uint32_t)bestSnlp.elementAti(i) + (uint32_t)values.elementAti(j);
1304             int32_t ln_j_i = lengths.elementAti(j) + i;
1305             if (newSnlp < (uint32_t)bestSnlp.elementAti(ln_j_i)) {
1306                 bestSnlp.setElementAt(newSnlp, ln_j_i);
1307                 prev.setElementAt(i, ln_j_i);
1308             }
1309         }
1310 
1311         // In Japanese,
1312         // Katakana word in single character is pretty rare. So we apply
1313         // the following heuristic to Katakana: any continuous run of Katakana
1314         // characters is considered a candidate word with a default cost
1315         // specified in the katakanaCost table according to its length.
1316 
1317         bool is_katakana = isKatakana(inString.char32At(ix));
1318         int32_t katakanaRunLength = 1;
1319         if (!is_prev_katakana && is_katakana) {
1320             int32_t j = inString.moveIndex32(ix, 1);
1321             // Find the end of the continuous run of Katakana characters
1322             while (j < inString.length() && katakanaRunLength < kMaxKatakanaGroupLength &&
1323                     isKatakana(inString.char32At(j))) {
1324                 j = inString.moveIndex32(j, 1);
1325                 katakanaRunLength++;
1326             }
1327             if (katakanaRunLength < kMaxKatakanaGroupLength) {
1328                 uint32_t newSnlp = bestSnlp.elementAti(i) + getKatakanaCost(katakanaRunLength);
1329                 if (newSnlp < (uint32_t)bestSnlp.elementAti(i+katakanaRunLength)) {
1330                     bestSnlp.setElementAt(newSnlp, i+katakanaRunLength);
1331                     prev.setElementAt(i, i+katakanaRunLength);  // prev[j] = i;
1332                 }
1333             }
1334         }
1335         is_prev_katakana = is_katakana;
1336     }
1337     utext_close(&fu);
1338 
1339     // Start pushing the optimal offset index into t_boundary (t for tentative).
1340     // prev[numCodePts] is guaranteed to be meaningful.
1341     // We'll first push in the reverse order, i.e.,
1342     // t_boundary[0] = numCodePts, and afterwards do a swap.
1343     UVector32 t_boundary(numCodePts+1, status);
1344 
1345     int32_t numBreaks = 0;
1346     // No segmentation found, set boundary to end of range
1347     if ((uint32_t)bestSnlp.elementAti(numCodePts) == kuint32max) {
1348         t_boundary.addElement(numCodePts, status);
1349         numBreaks++;
1350     } else {
1351         for (int32_t i = numCodePts; i > 0; i = prev.elementAti(i)) {
1352             t_boundary.addElement(i, status);
1353             numBreaks++;
1354         }
1355         U_ASSERT(prev.elementAti(t_boundary.elementAti(numBreaks - 1)) == 0);
1356     }
1357 
1358     // Add a break for the start of the dictionary range if there is not one
1359     // there already.
1360     if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) {
1361         t_boundary.addElement(0, status);
1362         numBreaks++;
1363     }
1364 
1365     // Now that we're done, convert positions in t_boundary[] (indices in
1366     // the normalized input string) back to indices in the original input UText
1367     // while reversing t_boundary and pushing values to foundBreaks.
1368     int32_t prevCPPos = -1;
1369     int32_t prevUTextPos = -1;
1370     for (int32_t i = numBreaks-1; i >= 0; i--) {
1371         int32_t cpPos = t_boundary.elementAti(i);
1372         U_ASSERT(cpPos > prevCPPos);
1373         int32_t utextPos =  inputMap.isValid() ? inputMap->elementAti(cpPos) : cpPos + rangeStart;
1374         U_ASSERT(utextPos >= prevUTextPos);
1375         if (utextPos > prevUTextPos) {
1376             // Boundaries are added to foundBreaks output in ascending order.
1377             U_ASSERT(foundBreaks.size() == 0 || foundBreaks.peeki() < utextPos);
1378             foundBreaks.push(utextPos, status);
1379         } else {
1380             // Normalization expanded the input text, the dictionary found a boundary
1381             // within the expansion, giving two boundaries with the same index in the
1382             // original text. Ignore the second. See ticket #12918.
1383             --numBreaks;
1384         }
1385         prevCPPos = cpPos;
1386         prevUTextPos = utextPos;
1387     }
1388     (void)prevCPPos; // suppress compiler warnings about unused variable
1389 
1390     // inString goes out of scope
1391     // inputMap goes out of scope
1392     return numBreaks;
1393 }
1394 #endif
1395 
1396 U_NAMESPACE_END
1397 
1398 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
1399 
1400