1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /**
4  *******************************************************************************
5  * Copyright (C) 2006-2016, International Business Machines Corporation
6  * and others. All Rights Reserved.
7  *******************************************************************************
8  */
9 
10 #include <utility>
11 
12 #include "unicode/utypes.h"
13 
14 #if !UCONFIG_NO_BREAK_ITERATION
15 
16 #include "brkeng.h"
17 #include "dictbe.h"
18 #include "unicode/uniset.h"
19 #include "unicode/chariter.h"
20 #include "unicode/ubrk.h"
21 #include "uvectr32.h"
22 #include "uvector.h"
23 #include "uassert.h"
24 #include "unicode/normlzr.h"
25 #include "cmemory.h"
26 #include "dictionarydata.h"
27 
28 U_NAMESPACE_BEGIN
29 
30 /*
31  ******************************************************************
32  */
33 
DictionaryBreakEngine()34 DictionaryBreakEngine::DictionaryBreakEngine() {
35 }
36 
~DictionaryBreakEngine()37 DictionaryBreakEngine::~DictionaryBreakEngine() {
38 }
39 
40 UBool
handles(UChar32 c) const41 DictionaryBreakEngine::handles(UChar32 c) const {
42     return fSet.contains(c);
43 }
44 
45 int32_t
findBreaks(UText * text,int32_t startPos,int32_t endPos,UVector32 & foundBreaks) const46 DictionaryBreakEngine::findBreaks( UText *text,
47                                  int32_t startPos,
48                                  int32_t endPos,
49                                  UVector32 &foundBreaks ) const {
50     (void)startPos;            // TODO: remove this param?
51     int32_t result = 0;
52 
53     // Find the span of characters included in the set.
54     //   The span to break begins at the current position in the text, and
55     //   extends towards the start or end of the text, depending on 'reverse'.
56 
57     int32_t start = (int32_t)utext_getNativeIndex(text);
58     int32_t current;
59     int32_t rangeStart;
60     int32_t rangeEnd;
61     UChar32 c = utext_current32(text);
62     while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) {
63         utext_next32(text);         // TODO:  recast loop for postincrement
64         c = utext_current32(text);
65     }
66     rangeStart = start;
67     rangeEnd = current;
68     result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
69     utext_setNativeIndex(text, current);
70 
71     return result;
72 }
73 
74 void
setCharacters(const UnicodeSet & set)75 DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) {
76     fSet = set;
77     // Compact for caching
78     fSet.compact();
79 }
80 
81 /*
82  ******************************************************************
83  * PossibleWord
84  */
85 
86 // Helper class for improving readability of the Thai/Lao/Khmer word break
87 // algorithm. The implementation is completely inline.
88 
89 // List size, limited by the maximum number of words in the dictionary
90 // that form a nested sequence.
91 static const int32_t POSSIBLE_WORD_LIST_MAX = 20;
92 
93 class PossibleWord {
94 private:
95     // list of word candidate lengths, in increasing length order
96     // TODO: bytes would be sufficient for word lengths.
97     int32_t   count;      // Count of candidates
98     int32_t   prefix;     // The longest match with a dictionary word
99     int32_t   offset;     // Offset in the text of these candidates
100     int32_t   mark;       // The preferred candidate's offset
101     int32_t   current;    // The candidate we're currently looking at
102     int32_t   cuLengths[POSSIBLE_WORD_LIST_MAX];   // Word Lengths, in code units.
103     int32_t   cpLengths[POSSIBLE_WORD_LIST_MAX];   // Word Lengths, in code points.
104 
105 public:
PossibleWord()106     PossibleWord() : count(0), prefix(0), offset(-1), mark(0), current(0) {}
~PossibleWord()107     ~PossibleWord() {}
108 
109     // Fill the list of candidates if needed, select the longest, and return the number found
110     int32_t   candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd );
111 
112     // Select the currently marked candidate, point after it in the text, and invalidate self
113     int32_t   acceptMarked( UText *text );
114 
115     // Back up from the current candidate to the next shorter one; return TRUE if that exists
116     // and point the text after it
117     UBool     backUp( UText *text );
118 
119     // Return the longest prefix this candidate location shares with a dictionary word
120     // Return value is in code points.
longestPrefix()121     int32_t   longestPrefix() { return prefix; }
122 
123     // Mark the current candidate as the one we like
markCurrent()124     void      markCurrent() { mark = current; }
125 
126     // Get length in code points of the marked word.
markedCPLength()127     int32_t   markedCPLength() { return cpLengths[mark]; }
128 };
129 
130 
candidates(UText * text,DictionaryMatcher * dict,int32_t rangeEnd)131 int32_t PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ) {
132     // TODO: If getIndex is too slow, use offset < 0 and add discardAll()
133     int32_t start = (int32_t)utext_getNativeIndex(text);
134     if (start != offset) {
135         offset = start;
136         count = dict->matches(text, rangeEnd-start, UPRV_LENGTHOF(cuLengths), cuLengths, cpLengths, NULL, &prefix);
137         // Dictionary leaves text after longest prefix, not longest word. Back up.
138         if (count <= 0) {
139             utext_setNativeIndex(text, start);
140         }
141     }
142     if (count > 0) {
143         utext_setNativeIndex(text, start+cuLengths[count-1]);
144     }
145     current = count-1;
146     mark = current;
147     return count;
148 }
149 
150 int32_t
acceptMarked(UText * text)151 PossibleWord::acceptMarked( UText *text ) {
152     utext_setNativeIndex(text, offset + cuLengths[mark]);
153     return cuLengths[mark];
154 }
155 
156 
157 UBool
backUp(UText * text)158 PossibleWord::backUp( UText *text ) {
159     if (current > 0) {
160         utext_setNativeIndex(text, offset + cuLengths[--current]);
161         return TRUE;
162     }
163     return FALSE;
164 }
165 
166 /*
167  ******************************************************************
168  * ThaiBreakEngine
169  */
170 
171 // How many words in a row are "good enough"?
172 static const int32_t THAI_LOOKAHEAD = 3;
173 
174 // Will not combine a non-word with a preceding dictionary word longer than this
175 static const int32_t THAI_ROOT_COMBINE_THRESHOLD = 3;
176 
177 // Will not combine a non-word that shares at least this much prefix with a
178 // dictionary word, with a preceding word
179 static const int32_t THAI_PREFIX_COMBINE_THRESHOLD = 3;
180 
181 // Ellision character
182 static const int32_t THAI_PAIYANNOI = 0x0E2F;
183 
184 // Repeat character
185 static const int32_t THAI_MAIYAMOK = 0x0E46;
186 
187 // Minimum word size
188 static const int32_t THAI_MIN_WORD = 2;
189 
190 // Minimum number of characters for two words
191 static const int32_t THAI_MIN_WORD_SPAN = THAI_MIN_WORD * 2;
192 
ThaiBreakEngine(DictionaryMatcher * adoptDictionary,UErrorCode & status)193 ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
194     : DictionaryBreakEngine(),
195       fDictionary(adoptDictionary)
196 {
197     fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status);
198     if (U_SUCCESS(status)) {
199         setCharacters(fThaiWordSet);
200     }
201     fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status);
202     fMarkSet.add(0x0020);
203     fEndWordSet = fThaiWordSet;
204     fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
205     fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
206     fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
207     fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
208     fSuffixSet.add(THAI_PAIYANNOI);
209     fSuffixSet.add(THAI_MAIYAMOK);
210 
211     // Compact for caching.
212     fMarkSet.compact();
213     fEndWordSet.compact();
214     fBeginWordSet.compact();
215     fSuffixSet.compact();
216 }
217 
~ThaiBreakEngine()218 ThaiBreakEngine::~ThaiBreakEngine() {
219     delete fDictionary;
220 }
221 
222 int32_t
divideUpDictionaryRange(UText * text,int32_t rangeStart,int32_t rangeEnd,UVector32 & foundBreaks) const223 ThaiBreakEngine::divideUpDictionaryRange( UText *text,
224                                                 int32_t rangeStart,
225                                                 int32_t rangeEnd,
226                                                 UVector32 &foundBreaks ) const {
227     utext_setNativeIndex(text, rangeStart);
228     utext_moveIndex32(text, THAI_MIN_WORD_SPAN);
229     if (utext_getNativeIndex(text) >= rangeEnd) {
230         return 0;       // Not enough characters for two words
231     }
232     utext_setNativeIndex(text, rangeStart);
233 
234 
235     uint32_t wordsFound = 0;
236     int32_t cpWordLength = 0;    // Word Length in Code Points.
237     int32_t cuWordLength = 0;    // Word length in code units (UText native indexing)
238     int32_t current;
239     UErrorCode status = U_ZERO_ERROR;
240     PossibleWord words[THAI_LOOKAHEAD];
241 
242     utext_setNativeIndex(text, rangeStart);
243 
244     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
245         cpWordLength = 0;
246         cuWordLength = 0;
247 
248         // Look for candidate words at the current position
249         int32_t candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
250 
251         // If we found exactly one, use that
252         if (candidates == 1) {
253             cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
254             cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();
255             wordsFound += 1;
256         }
257         // If there was more than one, see which one can take us forward the most words
258         else if (candidates > 1) {
259             // If we're already at the end of the range, we're done
260             if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
261                 goto foundBest;
262             }
263             do {
264                 int32_t wordsMatched = 1;
265                 if (words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
266                     if (wordsMatched < 2) {
267                         // Followed by another dictionary word; mark first word as a good candidate
268                         words[wordsFound%THAI_LOOKAHEAD].markCurrent();
269                         wordsMatched = 2;
270                     }
271 
272                     // If we're already at the end of the range, we're done
273                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
274                         goto foundBest;
275                     }
276 
277                     // See if any of the possible second words is followed by a third word
278                     do {
279                         // If we find a third word, stop right away
280                         if (words[(wordsFound + 2) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
281                             words[wordsFound % THAI_LOOKAHEAD].markCurrent();
282                             goto foundBest;
283                         }
284                     }
285                     while (words[(wordsFound + 1) % THAI_LOOKAHEAD].backUp(text));
286                 }
287             }
288             while (words[wordsFound % THAI_LOOKAHEAD].backUp(text));
289 foundBest:
290             // Set UText position to after the accepted word.
291             cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
292             cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();
293             wordsFound += 1;
294         }
295 
296         // We come here after having either found a word or not. We look ahead to the
297         // next word. If it's not a dictionary word, we will combine it with the word we
298         // just found (if there is one), but only if the preceding word does not exceed
299         // the threshold.
300         // The text iterator should now be positioned at the end of the word we found.
301 
302         UChar32 uc = 0;
303         if ((int32_t)utext_getNativeIndex(text) < rangeEnd &&  cpWordLength < THAI_ROOT_COMBINE_THRESHOLD) {
304             // if it is a dictionary word, do nothing. If it isn't, then if there is
305             // no preceding word, or the non-word shares less than the minimum threshold
306             // of characters with a dictionary word, then scan to resynchronize
307             if (words[wordsFound % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
308                   && (cuWordLength == 0
309                       || words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) {
310                 // Look for a plausible word boundary
311                 int32_t remaining = rangeEnd - (current+cuWordLength);
312                 UChar32 pc;
313                 int32_t chars = 0;
314                 for (;;) {
315                     int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
316                     pc = utext_next32(text);
317                     int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
318                     chars += pcSize;
319                     remaining -= pcSize;
320                     if (remaining <= 0) {
321                         break;
322                     }
323                     uc = utext_current32(text);
324                     if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
325                         // Maybe. See if it's in the dictionary.
326                         // NOTE: In the original Apple code, checked that the next
327                         // two characters after uc were not 0x0E4C THANTHAKHAT before
328                         // checking the dictionary. That is just a performance filter,
329                         // but it's not clear it's faster than checking the trie.
330                         int32_t num_candidates = words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
331                         utext_setNativeIndex(text, current + cuWordLength + chars);
332                         if (num_candidates > 0) {
333                             break;
334                         }
335                     }
336                 }
337 
338                 // Bump the word count if there wasn't already one
339                 if (cuWordLength <= 0) {
340                     wordsFound += 1;
341                 }
342 
343                 // Update the length with the passed-over characters
344                 cuWordLength += chars;
345             }
346             else {
347                 // Back up to where we were for next iteration
348                 utext_setNativeIndex(text, current+cuWordLength);
349             }
350         }
351 
352         // Never stop before a combining mark.
353         int32_t currPos;
354         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
355             utext_next32(text);
356             cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
357         }
358 
359         // Look ahead for possible suffixes if a dictionary word does not follow.
360         // We do this in code rather than using a rule so that the heuristic
361         // resynch continues to function. For example, one of the suffix characters
362         // could be a typo in the middle of a word.
363         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cuWordLength > 0) {
364             if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
365                 && fSuffixSet.contains(uc = utext_current32(text))) {
366                 if (uc == THAI_PAIYANNOI) {
367                     if (!fSuffixSet.contains(utext_previous32(text))) {
368                         // Skip over previous end and PAIYANNOI
369                         utext_next32(text);
370                         int32_t paiyannoiIndex = (int32_t)utext_getNativeIndex(text);
371                         utext_next32(text);
372                         cuWordLength += (int32_t)utext_getNativeIndex(text) - paiyannoiIndex;    // Add PAIYANNOI to word
373                         uc = utext_current32(text);     // Fetch next character
374                     }
375                     else {
376                         // Restore prior position
377                         utext_next32(text);
378                     }
379                 }
380                 if (uc == THAI_MAIYAMOK) {
381                     if (utext_previous32(text) != THAI_MAIYAMOK) {
382                         // Skip over previous end and MAIYAMOK
383                         utext_next32(text);
384                         int32_t maiyamokIndex = (int32_t)utext_getNativeIndex(text);
385                         utext_next32(text);
386                         cuWordLength += (int32_t)utext_getNativeIndex(text) - maiyamokIndex;    // Add MAIYAMOK to word
387                     }
388                     else {
389                         // Restore prior position
390                         utext_next32(text);
391                     }
392                 }
393             }
394             else {
395                 utext_setNativeIndex(text, current+cuWordLength);
396             }
397         }
398 
399         // Did we find a word on this iteration? If so, push it on the break stack
400         if (cuWordLength > 0) {
401             foundBreaks.push((current+cuWordLength), status);
402         }
403     }
404 
405     // Don't return a break for the end of the dictionary range if there is one there.
406     if (foundBreaks.peeki() >= rangeEnd) {
407         (void) foundBreaks.popi();
408         wordsFound -= 1;
409     }
410 
411     return wordsFound;
412 }
413 
414 /*
415  ******************************************************************
416  * LaoBreakEngine
417  */
418 
419 // How many words in a row are "good enough"?
420 static const int32_t LAO_LOOKAHEAD = 3;
421 
422 // Will not combine a non-word with a preceding dictionary word longer than this
423 static const int32_t LAO_ROOT_COMBINE_THRESHOLD = 3;
424 
425 // Will not combine a non-word that shares at least this much prefix with a
426 // dictionary word, with a preceding word
427 static const int32_t LAO_PREFIX_COMBINE_THRESHOLD = 3;
428 
429 // Minimum word size
430 static const int32_t LAO_MIN_WORD = 2;
431 
432 // Minimum number of characters for two words
433 static const int32_t LAO_MIN_WORD_SPAN = LAO_MIN_WORD * 2;
434 
LaoBreakEngine(DictionaryMatcher * adoptDictionary,UErrorCode & status)435 LaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
436     : DictionaryBreakEngine(),
437       fDictionary(adoptDictionary)
438 {
439     fLaoWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]]"), status);
440     if (U_SUCCESS(status)) {
441         setCharacters(fLaoWordSet);
442     }
443     fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status);
444     fMarkSet.add(0x0020);
445     fEndWordSet = fLaoWordSet;
446     fEndWordSet.remove(0x0EC0, 0x0EC4);     // prefix vowels
447     fBeginWordSet.add(0x0E81, 0x0EAE);      // basic consonants (including holes for corresponding Thai characters)
448     fBeginWordSet.add(0x0EDC, 0x0EDD);      // digraph consonants (no Thai equivalent)
449     fBeginWordSet.add(0x0EC0, 0x0EC4);      // prefix vowels
450 
451     // Compact for caching.
452     fMarkSet.compact();
453     fEndWordSet.compact();
454     fBeginWordSet.compact();
455 }
456 
~LaoBreakEngine()457 LaoBreakEngine::~LaoBreakEngine() {
458     delete fDictionary;
459 }
460 
461 int32_t
divideUpDictionaryRange(UText * text,int32_t rangeStart,int32_t rangeEnd,UVector32 & foundBreaks) const462 LaoBreakEngine::divideUpDictionaryRange( UText *text,
463                                                 int32_t rangeStart,
464                                                 int32_t rangeEnd,
465                                                 UVector32 &foundBreaks ) const {
466     if ((rangeEnd - rangeStart) < LAO_MIN_WORD_SPAN) {
467         return 0;       // Not enough characters for two words
468     }
469 
470     uint32_t wordsFound = 0;
471     int32_t cpWordLength = 0;
472     int32_t cuWordLength = 0;
473     int32_t current;
474     UErrorCode status = U_ZERO_ERROR;
475     PossibleWord words[LAO_LOOKAHEAD];
476 
477     utext_setNativeIndex(text, rangeStart);
478 
479     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
480         cuWordLength = 0;
481         cpWordLength = 0;
482 
483         // Look for candidate words at the current position
484         int32_t candidates = words[wordsFound%LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
485 
486         // If we found exactly one, use that
487         if (candidates == 1) {
488             cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
489             cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();
490             wordsFound += 1;
491         }
492         // If there was more than one, see which one can take us forward the most words
493         else if (candidates > 1) {
494             // If we're already at the end of the range, we're done
495             if (utext_getNativeIndex(text) >= rangeEnd) {
496                 goto foundBest;
497             }
498             do {
499                 int32_t wordsMatched = 1;
500                 if (words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
501                     if (wordsMatched < 2) {
502                         // Followed by another dictionary word; mark first word as a good candidate
503                         words[wordsFound%LAO_LOOKAHEAD].markCurrent();
504                         wordsMatched = 2;
505                     }
506 
507                     // If we're already at the end of the range, we're done
508                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
509                         goto foundBest;
510                     }
511 
512                     // See if any of the possible second words is followed by a third word
513                     do {
514                         // If we find a third word, stop right away
515                         if (words[(wordsFound + 2) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
516                             words[wordsFound % LAO_LOOKAHEAD].markCurrent();
517                             goto foundBest;
518                         }
519                     }
520                     while (words[(wordsFound + 1) % LAO_LOOKAHEAD].backUp(text));
521                 }
522             }
523             while (words[wordsFound % LAO_LOOKAHEAD].backUp(text));
524 foundBest:
525             cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
526             cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();
527             wordsFound += 1;
528         }
529 
530         // We come here after having either found a word or not. We look ahead to the
531         // next word. If it's not a dictionary word, we will combine it withe the word we
532         // just found (if there is one), but only if the preceding word does not exceed
533         // the threshold.
534         // The text iterator should now be positioned at the end of the word we found.
535         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < LAO_ROOT_COMBINE_THRESHOLD) {
536             // if it is a dictionary word, do nothing. If it isn't, then if there is
537             // no preceding word, or the non-word shares less than the minimum threshold
538             // of characters with a dictionary word, then scan to resynchronize
539             if (words[wordsFound % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
540                   && (cuWordLength == 0
541                       || words[wordsFound%LAO_LOOKAHEAD].longestPrefix() < LAO_PREFIX_COMBINE_THRESHOLD)) {
542                 // Look for a plausible word boundary
543                 int32_t remaining = rangeEnd - (current + cuWordLength);
544                 UChar32 pc;
545                 UChar32 uc;
546                 int32_t chars = 0;
547                 for (;;) {
548                     int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
549                     pc = utext_next32(text);
550                     int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
551                     chars += pcSize;
552                     remaining -= pcSize;
553                     if (remaining <= 0) {
554                         break;
555                     }
556                     uc = utext_current32(text);
557                     if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
558                         // Maybe. See if it's in the dictionary.
559                         // TODO: this looks iffy; compare with old code.
560                         int32_t num_candidates = words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
561                         utext_setNativeIndex(text, current + cuWordLength + chars);
562                         if (num_candidates > 0) {
563                             break;
564                         }
565                     }
566                 }
567 
568                 // Bump the word count if there wasn't already one
569                 if (cuWordLength <= 0) {
570                     wordsFound += 1;
571                 }
572 
573                 // Update the length with the passed-over characters
574                 cuWordLength += chars;
575             }
576             else {
577                 // Back up to where we were for next iteration
578                 utext_setNativeIndex(text, current + cuWordLength);
579             }
580         }
581 
582         // Never stop before a combining mark.
583         int32_t currPos;
584         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
585             utext_next32(text);
586             cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
587         }
588 
589         // Look ahead for possible suffixes if a dictionary word does not follow.
590         // We do this in code rather than using a rule so that the heuristic
591         // resynch continues to function. For example, one of the suffix characters
592         // could be a typo in the middle of a word.
593         // NOT CURRENTLY APPLICABLE TO LAO
594 
595         // Did we find a word on this iteration? If so, push it on the break stack
596         if (cuWordLength > 0) {
597             foundBreaks.push((current+cuWordLength), status);
598         }
599     }
600 
601     // Don't return a break for the end of the dictionary range if there is one there.
602     if (foundBreaks.peeki() >= rangeEnd) {
603         (void) foundBreaks.popi();
604         wordsFound -= 1;
605     }
606 
607     return wordsFound;
608 }
609 
610 /*
611  ******************************************************************
612  * BurmeseBreakEngine
613  */
614 
615 // How many words in a row are "good enough"?
616 static const int32_t BURMESE_LOOKAHEAD = 3;
617 
618 // Will not combine a non-word with a preceding dictionary word longer than this
619 static const int32_t BURMESE_ROOT_COMBINE_THRESHOLD = 3;
620 
621 // Will not combine a non-word that shares at least this much prefix with a
622 // dictionary word, with a preceding word
623 static const int32_t BURMESE_PREFIX_COMBINE_THRESHOLD = 3;
624 
625 // Minimum word size
626 static const int32_t BURMESE_MIN_WORD = 2;
627 
628 // Minimum number of characters for two words
629 static const int32_t BURMESE_MIN_WORD_SPAN = BURMESE_MIN_WORD * 2;
630 
BurmeseBreakEngine(DictionaryMatcher * adoptDictionary,UErrorCode & status)631 BurmeseBreakEngine::BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
632     : DictionaryBreakEngine(),
633       fDictionary(adoptDictionary)
634 {
635     fBurmeseWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]]"), status);
636     if (U_SUCCESS(status)) {
637         setCharacters(fBurmeseWordSet);
638     }
639     fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]&[:M:]]"), status);
640     fMarkSet.add(0x0020);
641     fEndWordSet = fBurmeseWordSet;
642     fBeginWordSet.add(0x1000, 0x102A);      // basic consonants and independent vowels
643 
644     // Compact for caching.
645     fMarkSet.compact();
646     fEndWordSet.compact();
647     fBeginWordSet.compact();
648 }
649 
~BurmeseBreakEngine()650 BurmeseBreakEngine::~BurmeseBreakEngine() {
651     delete fDictionary;
652 }
653 
654 int32_t
divideUpDictionaryRange(UText * text,int32_t rangeStart,int32_t rangeEnd,UVector32 & foundBreaks) const655 BurmeseBreakEngine::divideUpDictionaryRange( UText *text,
656                                                 int32_t rangeStart,
657                                                 int32_t rangeEnd,
658                                                 UVector32 &foundBreaks ) const {
659     if ((rangeEnd - rangeStart) < BURMESE_MIN_WORD_SPAN) {
660         return 0;       // Not enough characters for two words
661     }
662 
663     uint32_t wordsFound = 0;
664     int32_t cpWordLength = 0;
665     int32_t cuWordLength = 0;
666     int32_t current;
667     UErrorCode status = U_ZERO_ERROR;
668     PossibleWord words[BURMESE_LOOKAHEAD];
669 
670     utext_setNativeIndex(text, rangeStart);
671 
672     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
673         cuWordLength = 0;
674         cpWordLength = 0;
675 
676         // Look for candidate words at the current position
677         int32_t candidates = words[wordsFound%BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
678 
679         // If we found exactly one, use that
680         if (candidates == 1) {
681             cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);
682             cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();
683             wordsFound += 1;
684         }
685         // If there was more than one, see which one can take us forward the most words
686         else if (candidates > 1) {
687             // If we're already at the end of the range, we're done
688             if (utext_getNativeIndex(text) >= rangeEnd) {
689                 goto foundBest;
690             }
691             do {
692                 int32_t wordsMatched = 1;
693                 if (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
694                     if (wordsMatched < 2) {
695                         // Followed by another dictionary word; mark first word as a good candidate
696                         words[wordsFound%BURMESE_LOOKAHEAD].markCurrent();
697                         wordsMatched = 2;
698                     }
699 
700                     // If we're already at the end of the range, we're done
701                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
702                         goto foundBest;
703                     }
704 
705                     // See if any of the possible second words is followed by a third word
706                     do {
707                         // If we find a third word, stop right away
708                         if (words[(wordsFound + 2) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
709                             words[wordsFound % BURMESE_LOOKAHEAD].markCurrent();
710                             goto foundBest;
711                         }
712                     }
713                     while (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].backUp(text));
714                 }
715             }
716             while (words[wordsFound % BURMESE_LOOKAHEAD].backUp(text));
717 foundBest:
718             cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);
719             cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();
720             wordsFound += 1;
721         }
722 
723         // We come here after having either found a word or not. We look ahead to the
724         // next word. If it's not a dictionary word, we will combine it withe the word we
725         // just found (if there is one), but only if the preceding word does not exceed
726         // the threshold.
727         // The text iterator should now be positioned at the end of the word we found.
728         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < BURMESE_ROOT_COMBINE_THRESHOLD) {
729             // if it is a dictionary word, do nothing. If it isn't, then if there is
730             // no preceding word, or the non-word shares less than the minimum threshold
731             // of characters with a dictionary word, then scan to resynchronize
732             if (words[wordsFound % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
733                   && (cuWordLength == 0
734                       || words[wordsFound%BURMESE_LOOKAHEAD].longestPrefix() < BURMESE_PREFIX_COMBINE_THRESHOLD)) {
735                 // Look for a plausible word boundary
736                 int32_t remaining = rangeEnd - (current + cuWordLength);
737                 UChar32 pc;
738                 UChar32 uc;
739                 int32_t chars = 0;
740                 for (;;) {
741                     int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
742                     pc = utext_next32(text);
743                     int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
744                     chars += pcSize;
745                     remaining -= pcSize;
746                     if (remaining <= 0) {
747                         break;
748                     }
749                     uc = utext_current32(text);
750                     if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
751                         // Maybe. See if it's in the dictionary.
752                         // TODO: this looks iffy; compare with old code.
753                         int32_t num_candidates = words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
754                         utext_setNativeIndex(text, current + cuWordLength + chars);
755                         if (num_candidates > 0) {
756                             break;
757                         }
758                     }
759                 }
760 
761                 // Bump the word count if there wasn't already one
762                 if (cuWordLength <= 0) {
763                     wordsFound += 1;
764                 }
765 
766                 // Update the length with the passed-over characters
767                 cuWordLength += chars;
768             }
769             else {
770                 // Back up to where we were for next iteration
771                 utext_setNativeIndex(text, current + cuWordLength);
772             }
773         }
774 
775         // Never stop before a combining mark.
776         int32_t currPos;
777         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
778             utext_next32(text);
779             cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
780         }
781 
782         // Look ahead for possible suffixes if a dictionary word does not follow.
783         // We do this in code rather than using a rule so that the heuristic
784         // resynch continues to function. For example, one of the suffix characters
785         // could be a typo in the middle of a word.
786         // NOT CURRENTLY APPLICABLE TO BURMESE
787 
788         // Did we find a word on this iteration? If so, push it on the break stack
789         if (cuWordLength > 0) {
790             foundBreaks.push((current+cuWordLength), status);
791         }
792     }
793 
794     // Don't return a break for the end of the dictionary range if there is one there.
795     if (foundBreaks.peeki() >= rangeEnd) {
796         (void) foundBreaks.popi();
797         wordsFound -= 1;
798     }
799 
800     return wordsFound;
801 }
802 
803 /*
804  ******************************************************************
805  * KhmerBreakEngine
806  */
807 
808 // How many words in a row are "good enough"?
809 static const int32_t KHMER_LOOKAHEAD = 3;
810 
811 // Will not combine a non-word with a preceding dictionary word longer than this
812 static const int32_t KHMER_ROOT_COMBINE_THRESHOLD = 3;
813 
814 // Will not combine a non-word that shares at least this much prefix with a
815 // dictionary word, with a preceding word
816 static const int32_t KHMER_PREFIX_COMBINE_THRESHOLD = 3;
817 
818 // Minimum word size
819 static const int32_t KHMER_MIN_WORD = 2;
820 
821 // Minimum number of characters for two words
822 static const int32_t KHMER_MIN_WORD_SPAN = KHMER_MIN_WORD * 2;
823 
KhmerBreakEngine(DictionaryMatcher * adoptDictionary,UErrorCode & status)824 KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
825     : DictionaryBreakEngine(),
826       fDictionary(adoptDictionary)
827 {
828     fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status);
829     if (U_SUCCESS(status)) {
830         setCharacters(fKhmerWordSet);
831     }
832     fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status);
833     fMarkSet.add(0x0020);
834     fEndWordSet = fKhmerWordSet;
835     fBeginWordSet.add(0x1780, 0x17B3);
836     //fBeginWordSet.add(0x17A3, 0x17A4);      // deprecated vowels
837     //fEndWordSet.remove(0x17A5, 0x17A9);     // Khmer independent vowels that can't end a word
838     //fEndWordSet.remove(0x17B2);             // Khmer independent vowel that can't end a word
839     fEndWordSet.remove(0x17D2);             // KHMER SIGN COENG that combines some following characters
840     //fEndWordSet.remove(0x17B6, 0x17C5);     // Remove dependent vowels
841 //    fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
842 //    fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
843 //    fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
844 //    fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
845 //    fSuffixSet.add(THAI_PAIYANNOI);
846 //    fSuffixSet.add(THAI_MAIYAMOK);
847 
848     // Compact for caching.
849     fMarkSet.compact();
850     fEndWordSet.compact();
851     fBeginWordSet.compact();
852 //    fSuffixSet.compact();
853 }
854 
~KhmerBreakEngine()855 KhmerBreakEngine::~KhmerBreakEngine() {
856     delete fDictionary;
857 }
858 
859 int32_t
divideUpDictionaryRange(UText * text,int32_t rangeStart,int32_t rangeEnd,UVector32 & foundBreaks) const860 KhmerBreakEngine::divideUpDictionaryRange( UText *text,
861                                                 int32_t rangeStart,
862                                                 int32_t rangeEnd,
863                                                 UVector32 &foundBreaks ) const {
864     if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
865         return 0;       // Not enough characters for two words
866     }
867 
868     uint32_t wordsFound = 0;
869     int32_t cpWordLength = 0;
870     int32_t cuWordLength = 0;
871     int32_t current;
872     UErrorCode status = U_ZERO_ERROR;
873     PossibleWord words[KHMER_LOOKAHEAD];
874 
875     utext_setNativeIndex(text, rangeStart);
876 
877     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
878         cuWordLength = 0;
879         cpWordLength = 0;
880 
881         // Look for candidate words at the current position
882         int32_t candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
883 
884         // If we found exactly one, use that
885         if (candidates == 1) {
886             cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
887             cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
888             wordsFound += 1;
889         }
890 
891         // If there was more than one, see which one can take us forward the most words
892         else if (candidates > 1) {
893             // If we're already at the end of the range, we're done
894             if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
895                 goto foundBest;
896             }
897             do {
898                 int32_t wordsMatched = 1;
899                 if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
900                     if (wordsMatched < 2) {
901                         // Followed by another dictionary word; mark first word as a good candidate
902                         words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
903                         wordsMatched = 2;
904                     }
905 
906                     // If we're already at the end of the range, we're done
907                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
908                         goto foundBest;
909                     }
910 
911                     // See if any of the possible second words is followed by a third word
912                     do {
913                         // If we find a third word, stop right away
914                         if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
915                             words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
916                             goto foundBest;
917                         }
918                     }
919                     while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text));
920                 }
921             }
922             while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text));
923 foundBest:
924             cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
925             cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
926             wordsFound += 1;
927         }
928 
929         // We come here after having either found a word or not. We look ahead to the
930         // next word. If it's not a dictionary word, we will combine it with the word we
931         // just found (if there is one), but only if the preceding word does not exceed
932         // the threshold.
933         // The text iterator should now be positioned at the end of the word we found.
934         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < KHMER_ROOT_COMBINE_THRESHOLD) {
935             // if it is a dictionary word, do nothing. If it isn't, then if there is
936             // no preceding word, or the non-word shares less than the minimum threshold
937             // of characters with a dictionary word, then scan to resynchronize
938             if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
939                   && (cuWordLength == 0
940                       || words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) {
941                 // Look for a plausible word boundary
942                 int32_t remaining = rangeEnd - (current+cuWordLength);
943                 UChar32 pc;
944                 UChar32 uc;
945                 int32_t chars = 0;
946                 for (;;) {
947                     int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
948                     pc = utext_next32(text);
949                     int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
950                     chars += pcSize;
951                     remaining -= pcSize;
952                     if (remaining <= 0) {
953                         break;
954                     }
955                     uc = utext_current32(text);
956                     if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
957                         // Maybe. See if it's in the dictionary.
958                         int32_t num_candidates = words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
959                         utext_setNativeIndex(text, current+cuWordLength+chars);
960                         if (num_candidates > 0) {
961                             break;
962                         }
963                     }
964                 }
965 
966                 // Bump the word count if there wasn't already one
967                 if (cuWordLength <= 0) {
968                     wordsFound += 1;
969                 }
970 
971                 // Update the length with the passed-over characters
972                 cuWordLength += chars;
973             }
974             else {
975                 // Back up to where we were for next iteration
976                 utext_setNativeIndex(text, current+cuWordLength);
977             }
978         }
979 
980         // Never stop before a combining mark.
981         int32_t currPos;
982         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
983             utext_next32(text);
984             cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
985         }
986 
987         // Look ahead for possible suffixes if a dictionary word does not follow.
988         // We do this in code rather than using a rule so that the heuristic
989         // resynch continues to function. For example, one of the suffix characters
990         // could be a typo in the middle of a word.
991 //        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
992 //            if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
993 //                && fSuffixSet.contains(uc = utext_current32(text))) {
994 //                if (uc == KHMER_PAIYANNOI) {
995 //                    if (!fSuffixSet.contains(utext_previous32(text))) {
996 //                        // Skip over previous end and PAIYANNOI
997 //                        utext_next32(text);
998 //                        utext_next32(text);
999 //                        wordLength += 1;            // Add PAIYANNOI to word
1000 //                        uc = utext_current32(text);     // Fetch next character
1001 //                    }
1002 //                    else {
1003 //                        // Restore prior position
1004 //                        utext_next32(text);
1005 //                    }
1006 //                }
1007 //                if (uc == KHMER_MAIYAMOK) {
1008 //                    if (utext_previous32(text) != KHMER_MAIYAMOK) {
1009 //                        // Skip over previous end and MAIYAMOK
1010 //                        utext_next32(text);
1011 //                        utext_next32(text);
1012 //                        wordLength += 1;            // Add MAIYAMOK to word
1013 //                    }
1014 //                    else {
1015 //                        // Restore prior position
1016 //                        utext_next32(text);
1017 //                    }
1018 //                }
1019 //            }
1020 //            else {
1021 //                utext_setNativeIndex(text, current+wordLength);
1022 //            }
1023 //        }
1024 
1025         // Did we find a word on this iteration? If so, push it on the break stack
1026         if (cuWordLength > 0) {
1027             foundBreaks.push((current+cuWordLength), status);
1028         }
1029     }
1030 
1031     // Don't return a break for the end of the dictionary range if there is one there.
1032     if (foundBreaks.peeki() >= rangeEnd) {
1033         (void) foundBreaks.popi();
1034         wordsFound -= 1;
1035     }
1036 
1037     return wordsFound;
1038 }
1039 
1040 #if !UCONFIG_NO_NORMALIZATION
1041 /*
1042  ******************************************************************
1043  * CjkBreakEngine
1044  */
1045 static const uint32_t kuint32max = 0xFFFFFFFF;
CjkBreakEngine(DictionaryMatcher * adoptDictionary,LanguageType type,UErrorCode & status)1046 CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status)
1047 : DictionaryBreakEngine(), fDictionary(adoptDictionary) {
1048     // Korean dictionary only includes Hangul syllables
1049     fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status);
1050     fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status);
1051     fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status);
1052     fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status);
1053     nfkcNorm2 = Normalizer2::getNFKCInstance(status);
1054 
1055     if (U_SUCCESS(status)) {
1056         // handle Korean and Japanese/Chinese using different dictionaries
1057         if (type == kKorean) {
1058             setCharacters(fHangulWordSet);
1059         } else { //Chinese and Japanese
1060             UnicodeSet cjSet;
1061             cjSet.addAll(fHanWordSet);
1062             cjSet.addAll(fKatakanaWordSet);
1063             cjSet.addAll(fHiraganaWordSet);
1064             cjSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
1065             cjSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK
1066             setCharacters(cjSet);
1067         }
1068     }
1069 }
1070 
~CjkBreakEngine()1071 CjkBreakEngine::~CjkBreakEngine(){
1072     delete fDictionary;
1073 }
1074 
1075 // The katakanaCost values below are based on the length frequencies of all
1076 // katakana phrases in the dictionary
1077 static const int32_t kMaxKatakanaLength = 8;
1078 static const int32_t kMaxKatakanaGroupLength = 20;
1079 static const uint32_t maxSnlp = 255;
1080 
getKatakanaCost(int32_t wordLength)1081 static inline uint32_t getKatakanaCost(int32_t wordLength){
1082     //TODO: fill array with actual values from dictionary!
1083     static const uint32_t katakanaCost[kMaxKatakanaLength + 1]
1084                                        = {8192, 984, 408, 240, 204, 252, 300, 372, 480};
1085     return (wordLength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordLength];
1086 }
1087 
isKatakana(UChar32 value)1088 static inline bool isKatakana(UChar32 value) {
1089     return (value >= 0x30A1 && value <= 0x30FE && value != 0x30FB) ||
1090             (value >= 0xFF66 && value <= 0xFF9f);
1091 }
1092 
1093 
1094 // Function for accessing internal utext flags.
1095 //   Replicates an internal UText function.
1096 
utext_i32_flag(int32_t bitIndex)1097 static inline int32_t utext_i32_flag(int32_t bitIndex) {
1098     return (int32_t)1 << bitIndex;
1099 }
1100 
1101 
1102 /*
1103  * @param text A UText representing the text
1104  * @param rangeStart The start of the range of dictionary characters
1105  * @param rangeEnd The end of the range of dictionary characters
1106  * @param foundBreaks vector<int32> to receive the break positions
1107  * @return The number of breaks found
1108  */
1109 int32_t
divideUpDictionaryRange(UText * inText,int32_t rangeStart,int32_t rangeEnd,UVector32 & foundBreaks) const1110 CjkBreakEngine::divideUpDictionaryRange( UText *inText,
1111         int32_t rangeStart,
1112         int32_t rangeEnd,
1113         UVector32 &foundBreaks ) const {
1114     if (rangeStart >= rangeEnd) {
1115         return 0;
1116     }
1117 
1118     // UnicodeString version of input UText, NFKC normalized if necessary.
1119     UnicodeString inString;
1120 
1121     // inputMap[inStringIndex] = corresponding native index from UText inText.
1122     // If NULL then mapping is 1:1
1123     LocalPointer<UVector32>     inputMap;
1124 
1125     UErrorCode     status      = U_ZERO_ERROR;
1126 
1127 
1128     // if UText has the input string as one contiguous UTF-16 chunk
1129     if ((inText->providerProperties & utext_i32_flag(UTEXT_PROVIDER_STABLE_CHUNKS)) &&
1130          inText->chunkNativeStart <= rangeStart &&
1131          inText->chunkNativeLimit >= rangeEnd   &&
1132          inText->nativeIndexingLimit >= rangeEnd - inText->chunkNativeStart) {
1133 
1134         // Input UText is in one contiguous UTF-16 chunk.
1135         // Use Read-only aliasing UnicodeString.
1136         inString.setTo(FALSE,
1137                        inText->chunkContents + rangeStart - inText->chunkNativeStart,
1138                        rangeEnd - rangeStart);
1139     } else {
1140         // Copy the text from the original inText (UText) to inString (UnicodeString).
1141         // Create a map from UnicodeString indices -> UText offsets.
1142         utext_setNativeIndex(inText, rangeStart);
1143         int32_t limit = rangeEnd;
1144         U_ASSERT(limit <= utext_nativeLength(inText));
1145         if (limit > utext_nativeLength(inText)) {
1146             limit = (int32_t)utext_nativeLength(inText);
1147         }
1148         inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status);
1149         if (U_FAILURE(status)) {
1150             return 0;
1151         }
1152         while (utext_getNativeIndex(inText) < limit) {
1153             int32_t nativePosition = (int32_t)utext_getNativeIndex(inText);
1154             UChar32 c = utext_next32(inText);
1155             U_ASSERT(c != U_SENTINEL);
1156             inString.append(c);
1157             while (inputMap->size() < inString.length()) {
1158                 inputMap->addElement(nativePosition, status);
1159             }
1160         }
1161         inputMap->addElement(limit, status);
1162     }
1163 
1164 
1165     if (!nfkcNorm2->isNormalized(inString, status)) {
1166         UnicodeString normalizedInput;
1167         //  normalizedMap[normalizedInput position] ==  original UText position.
1168         LocalPointer<UVector32> normalizedMap(new UVector32(status), status);
1169         if (U_FAILURE(status)) {
1170             return 0;
1171         }
1172 
1173         UnicodeString fragment;
1174         UnicodeString normalizedFragment;
1175         for (int32_t srcI = 0; srcI < inString.length();) {  // Once per normalization chunk
1176             fragment.remove();
1177             int32_t fragmentStartI = srcI;
1178             UChar32 c = inString.char32At(srcI);
1179             for (;;) {
1180                 fragment.append(c);
1181                 srcI = inString.moveIndex32(srcI, 1);
1182                 if (srcI == inString.length()) {
1183                     break;
1184                 }
1185                 c = inString.char32At(srcI);
1186                 if (nfkcNorm2->hasBoundaryBefore(c)) {
1187                     break;
1188                 }
1189             }
1190             nfkcNorm2->normalize(fragment, normalizedFragment, status);
1191             normalizedInput.append(normalizedFragment);
1192 
1193             // Map every position in the normalized chunk to the start of the chunk
1194             //   in the original input.
1195             int32_t fragmentOriginalStart = inputMap.isValid() ?
1196                     inputMap->elementAti(fragmentStartI) : fragmentStartI+rangeStart;
1197             while (normalizedMap->size() < normalizedInput.length()) {
1198                 normalizedMap->addElement(fragmentOriginalStart, status);
1199                 if (U_FAILURE(status)) {
1200                     break;
1201                 }
1202             }
1203         }
1204         U_ASSERT(normalizedMap->size() == normalizedInput.length());
1205         int32_t nativeEnd = inputMap.isValid() ?
1206                 inputMap->elementAti(inString.length()) : inString.length()+rangeStart;
1207         normalizedMap->addElement(nativeEnd, status);
1208 
1209         inputMap = std::move(normalizedMap);
1210         inString = std::move(normalizedInput);
1211     }
1212 
1213     int32_t numCodePts = inString.countChar32();
1214     if (numCodePts != inString.length()) {
1215         // There are supplementary characters in the input.
1216         // The dictionary will produce boundary positions in terms of code point indexes,
1217         //   not in terms of code unit string indexes.
1218         // Use the inputMap mechanism to take care of this in addition to indexing differences
1219         //    from normalization and/or UTF-8 input.
1220         UBool hadExistingMap = inputMap.isValid();
1221         if (!hadExistingMap) {
1222             inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status);
1223             if (U_FAILURE(status)) {
1224                 return 0;
1225             }
1226         }
1227         int32_t cpIdx = 0;
1228         for (int32_t cuIdx = 0; ; cuIdx = inString.moveIndex32(cuIdx, 1)) {
1229             U_ASSERT(cuIdx >= cpIdx);
1230             if (hadExistingMap) {
1231                 inputMap->setElementAt(inputMap->elementAti(cuIdx), cpIdx);
1232             } else {
1233                 inputMap->addElement(cuIdx+rangeStart, status);
1234             }
1235             cpIdx++;
1236             if (cuIdx == inString.length()) {
1237                break;
1238             }
1239         }
1240     }
1241 
1242     // bestSnlp[i] is the snlp of the best segmentation of the first i
1243     // code points in the range to be matched.
1244     UVector32 bestSnlp(numCodePts + 1, status);
1245     bestSnlp.addElement(0, status);
1246     for(int32_t i = 1; i <= numCodePts; i++) {
1247         bestSnlp.addElement(kuint32max, status);
1248     }
1249 
1250 
1251     // prev[i] is the index of the last CJK code point in the previous word in
1252     // the best segmentation of the first i characters.
1253     UVector32 prev(numCodePts + 1, status);
1254     for(int32_t i = 0; i <= numCodePts; i++){
1255         prev.addElement(-1, status);
1256     }
1257 
1258     const int32_t maxWordSize = 20;
1259     UVector32 values(numCodePts, status);
1260     values.setSize(numCodePts);
1261     UVector32 lengths(numCodePts, status);
1262     lengths.setSize(numCodePts);
1263 
1264     UText fu = UTEXT_INITIALIZER;
1265     utext_openUnicodeString(&fu, &inString, &status);
1266 
1267     // Dynamic programming to find the best segmentation.
1268 
1269     // In outer loop, i  is the code point index,
1270     //                ix is the corresponding string (code unit) index.
1271     //    They differ when the string contains supplementary characters.
1272     int32_t ix = 0;
1273     bool is_prev_katakana = false;
1274     for (int32_t i = 0;  i < numCodePts;  ++i, ix = inString.moveIndex32(ix, 1)) {
1275         if ((uint32_t)bestSnlp.elementAti(i) == kuint32max) {
1276             continue;
1277         }
1278 
1279         int32_t count;
1280         utext_setNativeIndex(&fu, ix);
1281         count = fDictionary->matches(&fu, maxWordSize, numCodePts,
1282                              NULL, lengths.getBuffer(), values.getBuffer(), NULL);
1283                              // Note: lengths is filled with code point lengths
1284                              //       The NULL parameter is the ignored code unit lengths.
1285 
1286         // if there are no single character matches found in the dictionary
1287         // starting with this character, treat character as a 1-character word
1288         // with the highest value possible, i.e. the least likely to occur.
1289         // Exclude Korean characters from this treatment, as they should be left
1290         // together by default.
1291         if ((count == 0 || lengths.elementAti(0) != 1) &&
1292                 !fHangulWordSet.contains(inString.char32At(ix))) {
1293             values.setElementAt(maxSnlp, count);   // 255
1294             lengths.setElementAt(1, count++);
1295         }
1296 
1297         for (int32_t j = 0; j < count; j++) {
1298             uint32_t newSnlp = (uint32_t)bestSnlp.elementAti(i) + (uint32_t)values.elementAti(j);
1299             int32_t ln_j_i = lengths.elementAti(j) + i;
1300             if (newSnlp < (uint32_t)bestSnlp.elementAti(ln_j_i)) {
1301                 bestSnlp.setElementAt(newSnlp, ln_j_i);
1302                 prev.setElementAt(i, ln_j_i);
1303             }
1304         }
1305 
1306         // In Japanese,
1307         // Katakana word in single character is pretty rare. So we apply
1308         // the following heuristic to Katakana: any continuous run of Katakana
1309         // characters is considered a candidate word with a default cost
1310         // specified in the katakanaCost table according to its length.
1311 
1312         bool is_katakana = isKatakana(inString.char32At(ix));
1313         int32_t katakanaRunLength = 1;
1314         if (!is_prev_katakana && is_katakana) {
1315             int32_t j = inString.moveIndex32(ix, 1);
1316             // Find the end of the continuous run of Katakana characters
1317             while (j < inString.length() && katakanaRunLength < kMaxKatakanaGroupLength &&
1318                     isKatakana(inString.char32At(j))) {
1319                 j = inString.moveIndex32(j, 1);
1320                 katakanaRunLength++;
1321             }
1322             if (katakanaRunLength < kMaxKatakanaGroupLength) {
1323                 uint32_t newSnlp = bestSnlp.elementAti(i) + getKatakanaCost(katakanaRunLength);
1324                 if (newSnlp < (uint32_t)bestSnlp.elementAti(i+katakanaRunLength)) {
1325                     bestSnlp.setElementAt(newSnlp, i+katakanaRunLength);
1326                     prev.setElementAt(i, i+katakanaRunLength);  // prev[j] = i;
1327                 }
1328             }
1329         }
1330         is_prev_katakana = is_katakana;
1331     }
1332     utext_close(&fu);
1333 
1334     // Start pushing the optimal offset index into t_boundary (t for tentative).
1335     // prev[numCodePts] is guaranteed to be meaningful.
1336     // We'll first push in the reverse order, i.e.,
1337     // t_boundary[0] = numCodePts, and afterwards do a swap.
1338     UVector32 t_boundary(numCodePts+1, status);
1339 
1340     int32_t numBreaks = 0;
1341     // No segmentation found, set boundary to end of range
1342     if ((uint32_t)bestSnlp.elementAti(numCodePts) == kuint32max) {
1343         t_boundary.addElement(numCodePts, status);
1344         numBreaks++;
1345     } else {
1346         for (int32_t i = numCodePts; i > 0; i = prev.elementAti(i)) {
1347             t_boundary.addElement(i, status);
1348             numBreaks++;
1349         }
1350         U_ASSERT(prev.elementAti(t_boundary.elementAti(numBreaks - 1)) == 0);
1351     }
1352 
1353     // Add a break for the start of the dictionary range if there is not one
1354     // there already.
1355     if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) {
1356         t_boundary.addElement(0, status);
1357         numBreaks++;
1358     }
1359 
1360     // Now that we're done, convert positions in t_boundary[] (indices in
1361     // the normalized input string) back to indices in the original input UText
1362     // while reversing t_boundary and pushing values to foundBreaks.
1363     int32_t prevCPPos = -1;
1364     int32_t prevUTextPos = -1;
1365     for (int32_t i = numBreaks-1; i >= 0; i--) {
1366         int32_t cpPos = t_boundary.elementAti(i);
1367         U_ASSERT(cpPos > prevCPPos);
1368         int32_t utextPos =  inputMap.isValid() ? inputMap->elementAti(cpPos) : cpPos + rangeStart;
1369         U_ASSERT(utextPos >= prevUTextPos);
1370         if (utextPos > prevUTextPos) {
1371             // Boundaries are added to foundBreaks output in ascending order.
1372             U_ASSERT(foundBreaks.size() == 0 || foundBreaks.peeki() < utextPos);
1373             foundBreaks.push(utextPos, status);
1374         } else {
1375             // Normalization expanded the input text, the dictionary found a boundary
1376             // within the expansion, giving two boundaries with the same index in the
1377             // original text. Ignore the second. See ticket #12918.
1378             --numBreaks;
1379         }
1380         prevCPPos = cpPos;
1381         prevUTextPos = utextPos;
1382     }
1383     (void)prevCPPos; // suppress compiler warnings about unused variable
1384 
1385     // inString goes out of scope
1386     // inputMap goes out of scope
1387     return numBreaks;
1388 }
1389 #endif
1390 
1391 U_NAMESPACE_END
1392 
1393 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
1394