1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /**
4 *******************************************************************************
5 * Copyright (C) 2006-2016, International Business Machines Corporation
6 * and others. All Rights Reserved.
7 *******************************************************************************
8 */
9
10 #include <utility>
11
12 #include "unicode/utypes.h"
13
14 #if !UCONFIG_NO_BREAK_ITERATION
15
16 #include "brkeng.h"
17 #include "dictbe.h"
18 #include "unicode/uniset.h"
19 #include "unicode/chariter.h"
20 #include "unicode/ubrk.h"
21 #include "utracimp.h"
22 #include "uvectr32.h"
23 #include "uvector.h"
24 #include "uassert.h"
25 #include "unicode/normlzr.h"
26 #include "cmemory.h"
27 #include "dictionarydata.h"
28
29 U_NAMESPACE_BEGIN
30
31 /*
32 ******************************************************************
33 */
34
DictionaryBreakEngine()35 DictionaryBreakEngine::DictionaryBreakEngine() {
36 }
37
~DictionaryBreakEngine()38 DictionaryBreakEngine::~DictionaryBreakEngine() {
39 }
40
41 UBool
handles(UChar32 c) const42 DictionaryBreakEngine::handles(UChar32 c) const {
43 return fSet.contains(c);
44 }
45
46 int32_t
findBreaks(UText * text,int32_t startPos,int32_t endPos,UVector32 & foundBreaks,UErrorCode & status) const47 DictionaryBreakEngine::findBreaks( UText *text,
48 int32_t startPos,
49 int32_t endPos,
50 UVector32 &foundBreaks,
51 UErrorCode& status) const {
52 if (U_FAILURE(status)) return 0;
53 (void)startPos; // TODO: remove this param?
54 int32_t result = 0;
55
56 // Find the span of characters included in the set.
57 // The span to break begins at the current position in the text, and
58 // extends towards the start or end of the text, depending on 'reverse'.
59
60 int32_t start = (int32_t)utext_getNativeIndex(text);
61 int32_t current;
62 int32_t rangeStart;
63 int32_t rangeEnd;
64 UChar32 c = utext_current32(text);
65 while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) {
66 utext_next32(text); // TODO: recast loop for postincrement
67 c = utext_current32(text);
68 }
69 rangeStart = start;
70 rangeEnd = current;
71 result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks, status);
72 utext_setNativeIndex(text, current);
73
74 return result;
75 }
76
77 void
setCharacters(const UnicodeSet & set)78 DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) {
79 fSet = set;
80 // Compact for caching
81 fSet.compact();
82 }
83
84 /*
85 ******************************************************************
86 * PossibleWord
87 */
88
89 // Helper class for improving readability of the Thai/Lao/Khmer word break
90 // algorithm. The implementation is completely inline.
91
92 // List size, limited by the maximum number of words in the dictionary
93 // that form a nested sequence.
94 static const int32_t POSSIBLE_WORD_LIST_MAX = 20;
95
96 class PossibleWord {
97 private:
98 // list of word candidate lengths, in increasing length order
99 // TODO: bytes would be sufficient for word lengths.
100 int32_t count; // Count of candidates
101 int32_t prefix; // The longest match with a dictionary word
102 int32_t offset; // Offset in the text of these candidates
103 int32_t mark; // The preferred candidate's offset
104 int32_t current; // The candidate we're currently looking at
105 int32_t cuLengths[POSSIBLE_WORD_LIST_MAX]; // Word Lengths, in code units.
106 int32_t cpLengths[POSSIBLE_WORD_LIST_MAX]; // Word Lengths, in code points.
107
108 public:
PossibleWord()109 PossibleWord() : count(0), prefix(0), offset(-1), mark(0), current(0) {}
~PossibleWord()110 ~PossibleWord() {}
111
112 // Fill the list of candidates if needed, select the longest, and return the number found
113 int32_t candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd );
114
115 // Select the currently marked candidate, point after it in the text, and invalidate self
116 int32_t acceptMarked( UText *text );
117
118 // Back up from the current candidate to the next shorter one; return TRUE if that exists
119 // and point the text after it
120 UBool backUp( UText *text );
121
122 // Return the longest prefix this candidate location shares with a dictionary word
123 // Return value is in code points.
longestPrefix()124 int32_t longestPrefix() { return prefix; }
125
126 // Mark the current candidate as the one we like
markCurrent()127 void markCurrent() { mark = current; }
128
129 // Get length in code points of the marked word.
markedCPLength()130 int32_t markedCPLength() { return cpLengths[mark]; }
131 };
132
133
candidates(UText * text,DictionaryMatcher * dict,int32_t rangeEnd)134 int32_t PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ) {
135 // TODO: If getIndex is too slow, use offset < 0 and add discardAll()
136 int32_t start = (int32_t)utext_getNativeIndex(text);
137 if (start != offset) {
138 offset = start;
139 count = dict->matches(text, rangeEnd-start, UPRV_LENGTHOF(cuLengths), cuLengths, cpLengths, NULL, &prefix);
140 // Dictionary leaves text after longest prefix, not longest word. Back up.
141 if (count <= 0) {
142 utext_setNativeIndex(text, start);
143 }
144 }
145 if (count > 0) {
146 utext_setNativeIndex(text, start+cuLengths[count-1]);
147 }
148 current = count-1;
149 mark = current;
150 return count;
151 }
152
153 int32_t
acceptMarked(UText * text)154 PossibleWord::acceptMarked( UText *text ) {
155 utext_setNativeIndex(text, offset + cuLengths[mark]);
156 return cuLengths[mark];
157 }
158
159
160 UBool
backUp(UText * text)161 PossibleWord::backUp( UText *text ) {
162 if (current > 0) {
163 utext_setNativeIndex(text, offset + cuLengths[--current]);
164 return TRUE;
165 }
166 return FALSE;
167 }
168
169 /*
170 ******************************************************************
171 * ThaiBreakEngine
172 */
173
174 // How many words in a row are "good enough"?
175 static const int32_t THAI_LOOKAHEAD = 3;
176
177 // Will not combine a non-word with a preceding dictionary word longer than this
178 static const int32_t THAI_ROOT_COMBINE_THRESHOLD = 3;
179
180 // Will not combine a non-word that shares at least this much prefix with a
181 // dictionary word, with a preceding word
182 static const int32_t THAI_PREFIX_COMBINE_THRESHOLD = 3;
183
184 // Elision character
185 static const int32_t THAI_PAIYANNOI = 0x0E2F;
186
187 // Repeat character
188 static const int32_t THAI_MAIYAMOK = 0x0E46;
189
190 // Minimum word size
191 static const int32_t THAI_MIN_WORD = 2;
192
193 // Minimum number of characters for two words
194 static const int32_t THAI_MIN_WORD_SPAN = THAI_MIN_WORD * 2;
195
ThaiBreakEngine(DictionaryMatcher * adoptDictionary,UErrorCode & status)196 ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
197 : DictionaryBreakEngine(),
198 fDictionary(adoptDictionary)
199 {
200 UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
201 UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Thai");
202 fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status);
203 if (U_SUCCESS(status)) {
204 setCharacters(fThaiWordSet);
205 }
206 fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status);
207 fMarkSet.add(0x0020);
208 fEndWordSet = fThaiWordSet;
209 fEndWordSet.remove(0x0E31); // MAI HAN-AKAT
210 fEndWordSet.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
211 fBeginWordSet.add(0x0E01, 0x0E2E); // KO KAI through HO NOKHUK
212 fBeginWordSet.add(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
213 fSuffixSet.add(THAI_PAIYANNOI);
214 fSuffixSet.add(THAI_MAIYAMOK);
215
216 // Compact for caching.
217 fMarkSet.compact();
218 fEndWordSet.compact();
219 fBeginWordSet.compact();
220 fSuffixSet.compact();
221 UTRACE_EXIT_STATUS(status);
222 }
223
~ThaiBreakEngine()224 ThaiBreakEngine::~ThaiBreakEngine() {
225 delete fDictionary;
226 }
227
228 int32_t
divideUpDictionaryRange(UText * text,int32_t rangeStart,int32_t rangeEnd,UVector32 & foundBreaks,UErrorCode & status) const229 ThaiBreakEngine::divideUpDictionaryRange( UText *text,
230 int32_t rangeStart,
231 int32_t rangeEnd,
232 UVector32 &foundBreaks,
233 UErrorCode& status) const {
234 if (U_FAILURE(status)) return 0;
235 utext_setNativeIndex(text, rangeStart);
236 utext_moveIndex32(text, THAI_MIN_WORD_SPAN);
237 if (utext_getNativeIndex(text) >= rangeEnd) {
238 return 0; // Not enough characters for two words
239 }
240 utext_setNativeIndex(text, rangeStart);
241
242
243 uint32_t wordsFound = 0;
244 int32_t cpWordLength = 0; // Word Length in Code Points.
245 int32_t cuWordLength = 0; // Word length in code units (UText native indexing)
246 int32_t current;
247 PossibleWord words[THAI_LOOKAHEAD];
248
249 utext_setNativeIndex(text, rangeStart);
250
251 while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
252 cpWordLength = 0;
253 cuWordLength = 0;
254
255 // Look for candidate words at the current position
256 int32_t candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
257
258 // If we found exactly one, use that
259 if (candidates == 1) {
260 cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
261 cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();
262 wordsFound += 1;
263 }
264 // If there was more than one, see which one can take us forward the most words
265 else if (candidates > 1) {
266 // If we're already at the end of the range, we're done
267 if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
268 goto foundBest;
269 }
270 do {
271 if (words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
272 // Followed by another dictionary word; mark first word as a good candidate
273 words[wordsFound%THAI_LOOKAHEAD].markCurrent();
274
275 // If we're already at the end of the range, we're done
276 if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
277 goto foundBest;
278 }
279
280 // See if any of the possible second words is followed by a third word
281 do {
282 // If we find a third word, stop right away
283 if (words[(wordsFound + 2) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
284 words[wordsFound % THAI_LOOKAHEAD].markCurrent();
285 goto foundBest;
286 }
287 }
288 while (words[(wordsFound + 1) % THAI_LOOKAHEAD].backUp(text));
289 }
290 }
291 while (words[wordsFound % THAI_LOOKAHEAD].backUp(text));
292 foundBest:
293 // Set UText position to after the accepted word.
294 cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
295 cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();
296 wordsFound += 1;
297 }
298
299 // We come here after having either found a word or not. We look ahead to the
300 // next word. If it's not a dictionary word, we will combine it with the word we
301 // just found (if there is one), but only if the preceding word does not exceed
302 // the threshold.
303 // The text iterator should now be positioned at the end of the word we found.
304
305 UChar32 uc = 0;
306 if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < THAI_ROOT_COMBINE_THRESHOLD) {
307 // if it is a dictionary word, do nothing. If it isn't, then if there is
308 // no preceding word, or the non-word shares less than the minimum threshold
309 // of characters with a dictionary word, then scan to resynchronize
310 if (words[wordsFound % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
311 && (cuWordLength == 0
312 || words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) {
313 // Look for a plausible word boundary
314 int32_t remaining = rangeEnd - (current+cuWordLength);
315 UChar32 pc;
316 int32_t chars = 0;
317 for (;;) {
318 int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
319 pc = utext_next32(text);
320 int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
321 chars += pcSize;
322 remaining -= pcSize;
323 if (remaining <= 0) {
324 break;
325 }
326 uc = utext_current32(text);
327 if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
328 // Maybe. See if it's in the dictionary.
329 // NOTE: In the original Apple code, checked that the next
330 // two characters after uc were not 0x0E4C THANTHAKHAT before
331 // checking the dictionary. That is just a performance filter,
332 // but it's not clear it's faster than checking the trie.
333 int32_t num_candidates = words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
334 utext_setNativeIndex(text, current + cuWordLength + chars);
335 if (num_candidates > 0) {
336 break;
337 }
338 }
339 }
340
341 // Bump the word count if there wasn't already one
342 if (cuWordLength <= 0) {
343 wordsFound += 1;
344 }
345
346 // Update the length with the passed-over characters
347 cuWordLength += chars;
348 }
349 else {
350 // Back up to where we were for next iteration
351 utext_setNativeIndex(text, current+cuWordLength);
352 }
353 }
354
355 // Never stop before a combining mark.
356 int32_t currPos;
357 while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
358 utext_next32(text);
359 cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
360 }
361
362 // Look ahead for possible suffixes if a dictionary word does not follow.
363 // We do this in code rather than using a rule so that the heuristic
364 // resynch continues to function. For example, one of the suffix characters
365 // could be a typo in the middle of a word.
366 if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cuWordLength > 0) {
367 if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
368 && fSuffixSet.contains(uc = utext_current32(text))) {
369 if (uc == THAI_PAIYANNOI) {
370 if (!fSuffixSet.contains(utext_previous32(text))) {
371 // Skip over previous end and PAIYANNOI
372 utext_next32(text);
373 int32_t paiyannoiIndex = (int32_t)utext_getNativeIndex(text);
374 utext_next32(text);
375 cuWordLength += (int32_t)utext_getNativeIndex(text) - paiyannoiIndex; // Add PAIYANNOI to word
376 uc = utext_current32(text); // Fetch next character
377 }
378 else {
379 // Restore prior position
380 utext_next32(text);
381 }
382 }
383 if (uc == THAI_MAIYAMOK) {
384 if (utext_previous32(text) != THAI_MAIYAMOK) {
385 // Skip over previous end and MAIYAMOK
386 utext_next32(text);
387 int32_t maiyamokIndex = (int32_t)utext_getNativeIndex(text);
388 utext_next32(text);
389 cuWordLength += (int32_t)utext_getNativeIndex(text) - maiyamokIndex; // Add MAIYAMOK to word
390 }
391 else {
392 // Restore prior position
393 utext_next32(text);
394 }
395 }
396 }
397 else {
398 utext_setNativeIndex(text, current+cuWordLength);
399 }
400 }
401
402 // Did we find a word on this iteration? If so, push it on the break stack
403 if (cuWordLength > 0) {
404 foundBreaks.push((current+cuWordLength), status);
405 }
406 }
407
408 // Don't return a break for the end of the dictionary range if there is one there.
409 if (foundBreaks.peeki() >= rangeEnd) {
410 (void) foundBreaks.popi();
411 wordsFound -= 1;
412 }
413
414 return wordsFound;
415 }
416
417 /*
418 ******************************************************************
419 * LaoBreakEngine
420 */
421
422 // How many words in a row are "good enough"?
423 static const int32_t LAO_LOOKAHEAD = 3;
424
425 // Will not combine a non-word with a preceding dictionary word longer than this
426 static const int32_t LAO_ROOT_COMBINE_THRESHOLD = 3;
427
428 // Will not combine a non-word that shares at least this much prefix with a
429 // dictionary word, with a preceding word
430 static const int32_t LAO_PREFIX_COMBINE_THRESHOLD = 3;
431
432 // Minimum word size
433 static const int32_t LAO_MIN_WORD = 2;
434
435 // Minimum number of characters for two words
436 static const int32_t LAO_MIN_WORD_SPAN = LAO_MIN_WORD * 2;
437
LaoBreakEngine(DictionaryMatcher * adoptDictionary,UErrorCode & status)438 LaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
439 : DictionaryBreakEngine(),
440 fDictionary(adoptDictionary)
441 {
442 UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
443 UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Laoo");
444 fLaoWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]]"), status);
445 if (U_SUCCESS(status)) {
446 setCharacters(fLaoWordSet);
447 }
448 fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status);
449 fMarkSet.add(0x0020);
450 fEndWordSet = fLaoWordSet;
451 fEndWordSet.remove(0x0EC0, 0x0EC4); // prefix vowels
452 fBeginWordSet.add(0x0E81, 0x0EAE); // basic consonants (including holes for corresponding Thai characters)
453 fBeginWordSet.add(0x0EDC, 0x0EDD); // digraph consonants (no Thai equivalent)
454 fBeginWordSet.add(0x0EC0, 0x0EC4); // prefix vowels
455
456 // Compact for caching.
457 fMarkSet.compact();
458 fEndWordSet.compact();
459 fBeginWordSet.compact();
460 UTRACE_EXIT_STATUS(status);
461 }
462
~LaoBreakEngine()463 LaoBreakEngine::~LaoBreakEngine() {
464 delete fDictionary;
465 }
466
467 int32_t
divideUpDictionaryRange(UText * text,int32_t rangeStart,int32_t rangeEnd,UVector32 & foundBreaks,UErrorCode & status) const468 LaoBreakEngine::divideUpDictionaryRange( UText *text,
469 int32_t rangeStart,
470 int32_t rangeEnd,
471 UVector32 &foundBreaks,
472 UErrorCode& status) const {
473 if (U_FAILURE(status)) return 0;
474 if ((rangeEnd - rangeStart) < LAO_MIN_WORD_SPAN) {
475 return 0; // Not enough characters for two words
476 }
477
478 uint32_t wordsFound = 0;
479 int32_t cpWordLength = 0;
480 int32_t cuWordLength = 0;
481 int32_t current;
482 PossibleWord words[LAO_LOOKAHEAD];
483
484 utext_setNativeIndex(text, rangeStart);
485
486 while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
487 cuWordLength = 0;
488 cpWordLength = 0;
489
490 // Look for candidate words at the current position
491 int32_t candidates = words[wordsFound%LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
492
493 // If we found exactly one, use that
494 if (candidates == 1) {
495 cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
496 cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();
497 wordsFound += 1;
498 }
499 // If there was more than one, see which one can take us forward the most words
500 else if (candidates > 1) {
501 // If we're already at the end of the range, we're done
502 if (utext_getNativeIndex(text) >= rangeEnd) {
503 goto foundBest;
504 }
505 do {
506 if (words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
507 // Followed by another dictionary word; mark first word as a good candidate
508 words[wordsFound%LAO_LOOKAHEAD].markCurrent();
509
510 // If we're already at the end of the range, we're done
511 if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
512 goto foundBest;
513 }
514
515 // See if any of the possible second words is followed by a third word
516 do {
517 // If we find a third word, stop right away
518 if (words[(wordsFound + 2) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
519 words[wordsFound % LAO_LOOKAHEAD].markCurrent();
520 goto foundBest;
521 }
522 }
523 while (words[(wordsFound + 1) % LAO_LOOKAHEAD].backUp(text));
524 }
525 }
526 while (words[wordsFound % LAO_LOOKAHEAD].backUp(text));
527 foundBest:
528 cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
529 cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();
530 wordsFound += 1;
531 }
532
533 // We come here after having either found a word or not. We look ahead to the
534 // next word. If it's not a dictionary word, we will combine it with the word we
535 // just found (if there is one), but only if the preceding word does not exceed
536 // the threshold.
537 // The text iterator should now be positioned at the end of the word we found.
538 if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < LAO_ROOT_COMBINE_THRESHOLD) {
539 // if it is a dictionary word, do nothing. If it isn't, then if there is
540 // no preceding word, or the non-word shares less than the minimum threshold
541 // of characters with a dictionary word, then scan to resynchronize
542 if (words[wordsFound % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
543 && (cuWordLength == 0
544 || words[wordsFound%LAO_LOOKAHEAD].longestPrefix() < LAO_PREFIX_COMBINE_THRESHOLD)) {
545 // Look for a plausible word boundary
546 int32_t remaining = rangeEnd - (current + cuWordLength);
547 UChar32 pc;
548 UChar32 uc;
549 int32_t chars = 0;
550 for (;;) {
551 int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
552 pc = utext_next32(text);
553 int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
554 chars += pcSize;
555 remaining -= pcSize;
556 if (remaining <= 0) {
557 break;
558 }
559 uc = utext_current32(text);
560 if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
561 // Maybe. See if it's in the dictionary.
562 // TODO: this looks iffy; compare with old code.
563 int32_t num_candidates = words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
564 utext_setNativeIndex(text, current + cuWordLength + chars);
565 if (num_candidates > 0) {
566 break;
567 }
568 }
569 }
570
571 // Bump the word count if there wasn't already one
572 if (cuWordLength <= 0) {
573 wordsFound += 1;
574 }
575
576 // Update the length with the passed-over characters
577 cuWordLength += chars;
578 }
579 else {
580 // Back up to where we were for next iteration
581 utext_setNativeIndex(text, current + cuWordLength);
582 }
583 }
584
585 // Never stop before a combining mark.
586 int32_t currPos;
587 while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
588 utext_next32(text);
589 cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
590 }
591
592 // Look ahead for possible suffixes if a dictionary word does not follow.
593 // We do this in code rather than using a rule so that the heuristic
594 // resynch continues to function. For example, one of the suffix characters
595 // could be a typo in the middle of a word.
596 // NOT CURRENTLY APPLICABLE TO LAO
597
598 // Did we find a word on this iteration? If so, push it on the break stack
599 if (cuWordLength > 0) {
600 foundBreaks.push((current+cuWordLength), status);
601 }
602 }
603
604 // Don't return a break for the end of the dictionary range if there is one there.
605 if (foundBreaks.peeki() >= rangeEnd) {
606 (void) foundBreaks.popi();
607 wordsFound -= 1;
608 }
609
610 return wordsFound;
611 }
612
613 /*
614 ******************************************************************
615 * BurmeseBreakEngine
616 */
617
618 // How many words in a row are "good enough"?
619 static const int32_t BURMESE_LOOKAHEAD = 3;
620
621 // Will not combine a non-word with a preceding dictionary word longer than this
622 static const int32_t BURMESE_ROOT_COMBINE_THRESHOLD = 3;
623
624 // Will not combine a non-word that shares at least this much prefix with a
625 // dictionary word, with a preceding word
626 static const int32_t BURMESE_PREFIX_COMBINE_THRESHOLD = 3;
627
628 // Minimum word size
629 static const int32_t BURMESE_MIN_WORD = 2;
630
631 // Minimum number of characters for two words
632 static const int32_t BURMESE_MIN_WORD_SPAN = BURMESE_MIN_WORD * 2;
633
BurmeseBreakEngine(DictionaryMatcher * adoptDictionary,UErrorCode & status)634 BurmeseBreakEngine::BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
635 : DictionaryBreakEngine(),
636 fDictionary(adoptDictionary)
637 {
638 UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
639 UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Mymr");
640 fBurmeseWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]]"), status);
641 if (U_SUCCESS(status)) {
642 setCharacters(fBurmeseWordSet);
643 }
644 fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]&[:M:]]"), status);
645 fMarkSet.add(0x0020);
646 fEndWordSet = fBurmeseWordSet;
647 fBeginWordSet.add(0x1000, 0x102A); // basic consonants and independent vowels
648
649 // Compact for caching.
650 fMarkSet.compact();
651 fEndWordSet.compact();
652 fBeginWordSet.compact();
653 UTRACE_EXIT_STATUS(status);
654 }
655
~BurmeseBreakEngine()656 BurmeseBreakEngine::~BurmeseBreakEngine() {
657 delete fDictionary;
658 }
659
660 int32_t
divideUpDictionaryRange(UText * text,int32_t rangeStart,int32_t rangeEnd,UVector32 & foundBreaks,UErrorCode & status) const661 BurmeseBreakEngine::divideUpDictionaryRange( UText *text,
662 int32_t rangeStart,
663 int32_t rangeEnd,
664 UVector32 &foundBreaks,
665 UErrorCode& status ) const {
666 if (U_FAILURE(status)) return 0;
667 if ((rangeEnd - rangeStart) < BURMESE_MIN_WORD_SPAN) {
668 return 0; // Not enough characters for two words
669 }
670
671 uint32_t wordsFound = 0;
672 int32_t cpWordLength = 0;
673 int32_t cuWordLength = 0;
674 int32_t current;
675 PossibleWord words[BURMESE_LOOKAHEAD];
676
677 utext_setNativeIndex(text, rangeStart);
678
679 while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
680 cuWordLength = 0;
681 cpWordLength = 0;
682
683 // Look for candidate words at the current position
684 int32_t candidates = words[wordsFound%BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
685
686 // If we found exactly one, use that
687 if (candidates == 1) {
688 cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);
689 cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();
690 wordsFound += 1;
691 }
692 // If there was more than one, see which one can take us forward the most words
693 else if (candidates > 1) {
694 // If we're already at the end of the range, we're done
695 if (utext_getNativeIndex(text) >= rangeEnd) {
696 goto foundBest;
697 }
698 do {
699 if (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
700 // Followed by another dictionary word; mark first word as a good candidate
701 words[wordsFound%BURMESE_LOOKAHEAD].markCurrent();
702
703 // If we're already at the end of the range, we're done
704 if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
705 goto foundBest;
706 }
707
708 // See if any of the possible second words is followed by a third word
709 do {
710 // If we find a third word, stop right away
711 if (words[(wordsFound + 2) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
712 words[wordsFound % BURMESE_LOOKAHEAD].markCurrent();
713 goto foundBest;
714 }
715 }
716 while (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].backUp(text));
717 }
718 }
719 while (words[wordsFound % BURMESE_LOOKAHEAD].backUp(text));
720 foundBest:
721 cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);
722 cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();
723 wordsFound += 1;
724 }
725
726 // We come here after having either found a word or not. We look ahead to the
727 // next word. If it's not a dictionary word, we will combine it with the word we
728 // just found (if there is one), but only if the preceding word does not exceed
729 // the threshold.
730 // The text iterator should now be positioned at the end of the word we found.
731 if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < BURMESE_ROOT_COMBINE_THRESHOLD) {
732 // if it is a dictionary word, do nothing. If it isn't, then if there is
733 // no preceding word, or the non-word shares less than the minimum threshold
734 // of characters with a dictionary word, then scan to resynchronize
735 if (words[wordsFound % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
736 && (cuWordLength == 0
737 || words[wordsFound%BURMESE_LOOKAHEAD].longestPrefix() < BURMESE_PREFIX_COMBINE_THRESHOLD)) {
738 // Look for a plausible word boundary
739 int32_t remaining = rangeEnd - (current + cuWordLength);
740 UChar32 pc;
741 UChar32 uc;
742 int32_t chars = 0;
743 for (;;) {
744 int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
745 pc = utext_next32(text);
746 int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
747 chars += pcSize;
748 remaining -= pcSize;
749 if (remaining <= 0) {
750 break;
751 }
752 uc = utext_current32(text);
753 if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
754 // Maybe. See if it's in the dictionary.
755 // TODO: this looks iffy; compare with old code.
756 int32_t num_candidates = words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
757 utext_setNativeIndex(text, current + cuWordLength + chars);
758 if (num_candidates > 0) {
759 break;
760 }
761 }
762 }
763
764 // Bump the word count if there wasn't already one
765 if (cuWordLength <= 0) {
766 wordsFound += 1;
767 }
768
769 // Update the length with the passed-over characters
770 cuWordLength += chars;
771 }
772 else {
773 // Back up to where we were for next iteration
774 utext_setNativeIndex(text, current + cuWordLength);
775 }
776 }
777
778 // Never stop before a combining mark.
779 int32_t currPos;
780 while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
781 utext_next32(text);
782 cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
783 }
784
785 // Look ahead for possible suffixes if a dictionary word does not follow.
786 // We do this in code rather than using a rule so that the heuristic
787 // resynch continues to function. For example, one of the suffix characters
788 // could be a typo in the middle of a word.
789 // NOT CURRENTLY APPLICABLE TO BURMESE
790
791 // Did we find a word on this iteration? If so, push it on the break stack
792 if (cuWordLength > 0) {
793 foundBreaks.push((current+cuWordLength), status);
794 }
795 }
796
797 // Don't return a break for the end of the dictionary range if there is one there.
798 if (foundBreaks.peeki() >= rangeEnd) {
799 (void) foundBreaks.popi();
800 wordsFound -= 1;
801 }
802
803 return wordsFound;
804 }
805
806 /*
807 ******************************************************************
808 * KhmerBreakEngine
809 */
810
811 // How many words in a row are "good enough"?
812 static const int32_t KHMER_LOOKAHEAD = 3;
813
814 // Will not combine a non-word with a preceding dictionary word longer than this
815 static const int32_t KHMER_ROOT_COMBINE_THRESHOLD = 3;
816
817 // Will not combine a non-word that shares at least this much prefix with a
818 // dictionary word, with a preceding word
819 static const int32_t KHMER_PREFIX_COMBINE_THRESHOLD = 3;
820
821 // Minimum word size
822 static const int32_t KHMER_MIN_WORD = 2;
823
824 // Minimum number of characters for two words
825 static const int32_t KHMER_MIN_WORD_SPAN = KHMER_MIN_WORD * 2;
826
KhmerBreakEngine(DictionaryMatcher * adoptDictionary,UErrorCode & status)827 KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
828 : DictionaryBreakEngine(),
829 fDictionary(adoptDictionary)
830 {
831 UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
832 UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Khmr");
833 fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status);
834 if (U_SUCCESS(status)) {
835 setCharacters(fKhmerWordSet);
836 }
837 fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status);
838 fMarkSet.add(0x0020);
839 fEndWordSet = fKhmerWordSet;
840 fBeginWordSet.add(0x1780, 0x17B3);
841 //fBeginWordSet.add(0x17A3, 0x17A4); // deprecated vowels
842 //fEndWordSet.remove(0x17A5, 0x17A9); // Khmer independent vowels that can't end a word
843 //fEndWordSet.remove(0x17B2); // Khmer independent vowel that can't end a word
844 fEndWordSet.remove(0x17D2); // KHMER SIGN COENG that combines some following characters
845 //fEndWordSet.remove(0x17B6, 0x17C5); // Remove dependent vowels
846 // fEndWordSet.remove(0x0E31); // MAI HAN-AKAT
847 // fEndWordSet.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
848 // fBeginWordSet.add(0x0E01, 0x0E2E); // KO KAI through HO NOKHUK
849 // fBeginWordSet.add(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
850 // fSuffixSet.add(THAI_PAIYANNOI);
851 // fSuffixSet.add(THAI_MAIYAMOK);
852
853 // Compact for caching.
854 fMarkSet.compact();
855 fEndWordSet.compact();
856 fBeginWordSet.compact();
857 // fSuffixSet.compact();
858 UTRACE_EXIT_STATUS(status);
859 }
860
~KhmerBreakEngine()861 KhmerBreakEngine::~KhmerBreakEngine() {
862 delete fDictionary;
863 }
864
865 int32_t
divideUpDictionaryRange(UText * text,int32_t rangeStart,int32_t rangeEnd,UVector32 & foundBreaks,UErrorCode & status) const866 KhmerBreakEngine::divideUpDictionaryRange( UText *text,
867 int32_t rangeStart,
868 int32_t rangeEnd,
869 UVector32 &foundBreaks,
870 UErrorCode& status ) const {
871 if (U_FAILURE(status)) return 0;
872 if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
873 return 0; // Not enough characters for two words
874 }
875
876 uint32_t wordsFound = 0;
877 int32_t cpWordLength = 0;
878 int32_t cuWordLength = 0;
879 int32_t current;
880 PossibleWord words[KHMER_LOOKAHEAD];
881
882 utext_setNativeIndex(text, rangeStart);
883
884 while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
885 cuWordLength = 0;
886 cpWordLength = 0;
887
888 // Look for candidate words at the current position
889 int32_t candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
890
891 // If we found exactly one, use that
892 if (candidates == 1) {
893 cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
894 cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
895 wordsFound += 1;
896 }
897
898 // If there was more than one, see which one can take us forward the most words
899 else if (candidates > 1) {
900 // If we're already at the end of the range, we're done
901 if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
902 goto foundBest;
903 }
904 do {
905 if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
906 // Followed by another dictionary word; mark first word as a good candidate
907 words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
908
909 // If we're already at the end of the range, we're done
910 if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
911 goto foundBest;
912 }
913
914 // See if any of the possible second words is followed by a third word
915 do {
916 // If we find a third word, stop right away
917 if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
918 words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
919 goto foundBest;
920 }
921 }
922 while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text));
923 }
924 }
925 while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text));
926 foundBest:
927 cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
928 cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
929 wordsFound += 1;
930 }
931
932 // We come here after having either found a word or not. We look ahead to the
933 // next word. If it's not a dictionary word, we will combine it with the word we
934 // just found (if there is one), but only if the preceding word does not exceed
935 // the threshold.
936 // The text iterator should now be positioned at the end of the word we found.
937 if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < KHMER_ROOT_COMBINE_THRESHOLD) {
938 // if it is a dictionary word, do nothing. If it isn't, then if there is
939 // no preceding word, or the non-word shares less than the minimum threshold
940 // of characters with a dictionary word, then scan to resynchronize
941 if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
942 && (cuWordLength == 0
943 || words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) {
944 // Look for a plausible word boundary
945 int32_t remaining = rangeEnd - (current+cuWordLength);
946 UChar32 pc;
947 UChar32 uc;
948 int32_t chars = 0;
949 for (;;) {
950 int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
951 pc = utext_next32(text);
952 int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
953 chars += pcSize;
954 remaining -= pcSize;
955 if (remaining <= 0) {
956 break;
957 }
958 uc = utext_current32(text);
959 if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
960 // Maybe. See if it's in the dictionary.
961 int32_t num_candidates = words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
962 utext_setNativeIndex(text, current+cuWordLength+chars);
963 if (num_candidates > 0) {
964 break;
965 }
966 }
967 }
968
969 // Bump the word count if there wasn't already one
970 if (cuWordLength <= 0) {
971 wordsFound += 1;
972 }
973
974 // Update the length with the passed-over characters
975 cuWordLength += chars;
976 }
977 else {
978 // Back up to where we were for next iteration
979 utext_setNativeIndex(text, current+cuWordLength);
980 }
981 }
982
983 // Never stop before a combining mark.
984 int32_t currPos;
985 while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
986 utext_next32(text);
987 cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
988 }
989
990 // Look ahead for possible suffixes if a dictionary word does not follow.
991 // We do this in code rather than using a rule so that the heuristic
992 // resynch continues to function. For example, one of the suffix characters
993 // could be a typo in the middle of a word.
994 // if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
995 // if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
996 // && fSuffixSet.contains(uc = utext_current32(text))) {
997 // if (uc == KHMER_PAIYANNOI) {
998 // if (!fSuffixSet.contains(utext_previous32(text))) {
999 // // Skip over previous end and PAIYANNOI
1000 // utext_next32(text);
1001 // utext_next32(text);
1002 // wordLength += 1; // Add PAIYANNOI to word
1003 // uc = utext_current32(text); // Fetch next character
1004 // }
1005 // else {
1006 // // Restore prior position
1007 // utext_next32(text);
1008 // }
1009 // }
1010 // if (uc == KHMER_MAIYAMOK) {
1011 // if (utext_previous32(text) != KHMER_MAIYAMOK) {
1012 // // Skip over previous end and MAIYAMOK
1013 // utext_next32(text);
1014 // utext_next32(text);
1015 // wordLength += 1; // Add MAIYAMOK to word
1016 // }
1017 // else {
1018 // // Restore prior position
1019 // utext_next32(text);
1020 // }
1021 // }
1022 // }
1023 // else {
1024 // utext_setNativeIndex(text, current+wordLength);
1025 // }
1026 // }
1027
1028 // Did we find a word on this iteration? If so, push it on the break stack
1029 if (cuWordLength > 0) {
1030 foundBreaks.push((current+cuWordLength), status);
1031 }
1032 }
1033
1034 // Don't return a break for the end of the dictionary range if there is one there.
1035 if (foundBreaks.peeki() >= rangeEnd) {
1036 (void) foundBreaks.popi();
1037 wordsFound -= 1;
1038 }
1039
1040 return wordsFound;
1041 }
1042
1043 #if !UCONFIG_NO_NORMALIZATION
1044 /*
1045 ******************************************************************
1046 * CjkBreakEngine
1047 */
1048 static const uint32_t kuint32max = 0xFFFFFFFF;
CjkBreakEngine(DictionaryMatcher * adoptDictionary,LanguageType type,UErrorCode & status)1049 CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status)
1050 : DictionaryBreakEngine(), fDictionary(adoptDictionary) {
1051 UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
1052 UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Hani");
1053 // Korean dictionary only includes Hangul syllables
1054 fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status);
1055 fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status);
1056 fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status);
1057 fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status);
1058 nfkcNorm2 = Normalizer2::getNFKCInstance(status);
1059
1060 if (U_SUCCESS(status)) {
1061 // handle Korean and Japanese/Chinese using different dictionaries
1062 if (type == kKorean) {
1063 setCharacters(fHangulWordSet);
1064 } else { //Chinese and Japanese
1065 UnicodeSet cjSet;
1066 cjSet.addAll(fHanWordSet);
1067 cjSet.addAll(fKatakanaWordSet);
1068 cjSet.addAll(fHiraganaWordSet);
1069 cjSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
1070 cjSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK
1071 setCharacters(cjSet);
1072 }
1073 }
1074 UTRACE_EXIT_STATUS(status);
1075 }
1076
~CjkBreakEngine()1077 CjkBreakEngine::~CjkBreakEngine(){
1078 delete fDictionary;
1079 }
1080
1081 // The katakanaCost values below are based on the length frequencies of all
1082 // katakana phrases in the dictionary
1083 static const int32_t kMaxKatakanaLength = 8;
1084 static const int32_t kMaxKatakanaGroupLength = 20;
1085 static const uint32_t maxSnlp = 255;
1086
getKatakanaCost(int32_t wordLength)1087 static inline uint32_t getKatakanaCost(int32_t wordLength){
1088 //TODO: fill array with actual values from dictionary!
1089 static const uint32_t katakanaCost[kMaxKatakanaLength + 1]
1090 = {8192, 984, 408, 240, 204, 252, 300, 372, 480};
1091 return (wordLength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordLength];
1092 }
1093
isKatakana(UChar32 value)1094 static inline bool isKatakana(UChar32 value) {
1095 return (value >= 0x30A1 && value <= 0x30FE && value != 0x30FB) ||
1096 (value >= 0xFF66 && value <= 0xFF9f);
1097 }
1098
1099
1100 // Function for accessing internal utext flags.
1101 // Replicates an internal UText function.
1102
utext_i32_flag(int32_t bitIndex)1103 static inline int32_t utext_i32_flag(int32_t bitIndex) {
1104 return (int32_t)1 << bitIndex;
1105 }
1106
1107
1108 /*
1109 * @param text A UText representing the text
1110 * @param rangeStart The start of the range of dictionary characters
1111 * @param rangeEnd The end of the range of dictionary characters
1112 * @param foundBreaks vector<int32> to receive the break positions
1113 * @return The number of breaks found
1114 */
1115 int32_t
divideUpDictionaryRange(UText * inText,int32_t rangeStart,int32_t rangeEnd,UVector32 & foundBreaks,UErrorCode & status) const1116 CjkBreakEngine::divideUpDictionaryRange( UText *inText,
1117 int32_t rangeStart,
1118 int32_t rangeEnd,
1119 UVector32 &foundBreaks,
1120 UErrorCode& status) const {
1121 if (U_FAILURE(status)) return 0;
1122 if (rangeStart >= rangeEnd) {
1123 return 0;
1124 }
1125
1126 // UnicodeString version of input UText, NFKC normalized if necessary.
1127 UnicodeString inString;
1128
1129 // inputMap[inStringIndex] = corresponding native index from UText inText.
1130 // If NULL then mapping is 1:1
1131 LocalPointer<UVector32> inputMap;
1132
1133 // if UText has the input string as one contiguous UTF-16 chunk
1134 if ((inText->providerProperties & utext_i32_flag(UTEXT_PROVIDER_STABLE_CHUNKS)) &&
1135 inText->chunkNativeStart <= rangeStart &&
1136 inText->chunkNativeLimit >= rangeEnd &&
1137 inText->nativeIndexingLimit >= rangeEnd - inText->chunkNativeStart) {
1138
1139 // Input UText is in one contiguous UTF-16 chunk.
1140 // Use Read-only aliasing UnicodeString.
1141 inString.setTo(FALSE,
1142 inText->chunkContents + rangeStart - inText->chunkNativeStart,
1143 rangeEnd - rangeStart);
1144 } else {
1145 // Copy the text from the original inText (UText) to inString (UnicodeString).
1146 // Create a map from UnicodeString indices -> UText offsets.
1147 utext_setNativeIndex(inText, rangeStart);
1148 int32_t limit = rangeEnd;
1149 U_ASSERT(limit <= utext_nativeLength(inText));
1150 if (limit > utext_nativeLength(inText)) {
1151 limit = (int32_t)utext_nativeLength(inText);
1152 }
1153 inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status);
1154 if (U_FAILURE(status)) {
1155 return 0;
1156 }
1157 while (utext_getNativeIndex(inText) < limit) {
1158 int32_t nativePosition = (int32_t)utext_getNativeIndex(inText);
1159 UChar32 c = utext_next32(inText);
1160 U_ASSERT(c != U_SENTINEL);
1161 inString.append(c);
1162 while (inputMap->size() < inString.length()) {
1163 inputMap->addElement(nativePosition, status);
1164 }
1165 }
1166 inputMap->addElement(limit, status);
1167 }
1168
1169
1170 if (!nfkcNorm2->isNormalized(inString, status)) {
1171 UnicodeString normalizedInput;
1172 // normalizedMap[normalizedInput position] == original UText position.
1173 LocalPointer<UVector32> normalizedMap(new UVector32(status), status);
1174 if (U_FAILURE(status)) {
1175 return 0;
1176 }
1177
1178 UnicodeString fragment;
1179 UnicodeString normalizedFragment;
1180 for (int32_t srcI = 0; srcI < inString.length();) { // Once per normalization chunk
1181 fragment.remove();
1182 int32_t fragmentStartI = srcI;
1183 UChar32 c = inString.char32At(srcI);
1184 for (;;) {
1185 fragment.append(c);
1186 srcI = inString.moveIndex32(srcI, 1);
1187 if (srcI == inString.length()) {
1188 break;
1189 }
1190 c = inString.char32At(srcI);
1191 if (nfkcNorm2->hasBoundaryBefore(c)) {
1192 break;
1193 }
1194 }
1195 nfkcNorm2->normalize(fragment, normalizedFragment, status);
1196 normalizedInput.append(normalizedFragment);
1197
1198 // Map every position in the normalized chunk to the start of the chunk
1199 // in the original input.
1200 int32_t fragmentOriginalStart = inputMap.isValid() ?
1201 inputMap->elementAti(fragmentStartI) : fragmentStartI+rangeStart;
1202 while (normalizedMap->size() < normalizedInput.length()) {
1203 normalizedMap->addElement(fragmentOriginalStart, status);
1204 if (U_FAILURE(status)) {
1205 break;
1206 }
1207 }
1208 }
1209 U_ASSERT(normalizedMap->size() == normalizedInput.length());
1210 int32_t nativeEnd = inputMap.isValid() ?
1211 inputMap->elementAti(inString.length()) : inString.length()+rangeStart;
1212 normalizedMap->addElement(nativeEnd, status);
1213
1214 inputMap = std::move(normalizedMap);
1215 inString = std::move(normalizedInput);
1216 }
1217
1218 int32_t numCodePts = inString.countChar32();
1219 if (numCodePts != inString.length()) {
1220 // There are supplementary characters in the input.
1221 // The dictionary will produce boundary positions in terms of code point indexes,
1222 // not in terms of code unit string indexes.
1223 // Use the inputMap mechanism to take care of this in addition to indexing differences
1224 // from normalization and/or UTF-8 input.
1225 UBool hadExistingMap = inputMap.isValid();
1226 if (!hadExistingMap) {
1227 inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status);
1228 if (U_FAILURE(status)) {
1229 return 0;
1230 }
1231 }
1232 int32_t cpIdx = 0;
1233 for (int32_t cuIdx = 0; ; cuIdx = inString.moveIndex32(cuIdx, 1)) {
1234 U_ASSERT(cuIdx >= cpIdx);
1235 if (hadExistingMap) {
1236 inputMap->setElementAt(inputMap->elementAti(cuIdx), cpIdx);
1237 } else {
1238 inputMap->addElement(cuIdx+rangeStart, status);
1239 }
1240 cpIdx++;
1241 if (cuIdx == inString.length()) {
1242 break;
1243 }
1244 }
1245 }
1246
1247 // bestSnlp[i] is the snlp of the best segmentation of the first i
1248 // code points in the range to be matched.
1249 UVector32 bestSnlp(numCodePts + 1, status);
1250 bestSnlp.addElement(0, status);
1251 for(int32_t i = 1; i <= numCodePts; i++) {
1252 bestSnlp.addElement(kuint32max, status);
1253 }
1254
1255
1256 // prev[i] is the index of the last CJK code point in the previous word in
1257 // the best segmentation of the first i characters.
1258 UVector32 prev(numCodePts + 1, status);
1259 for(int32_t i = 0; i <= numCodePts; i++){
1260 prev.addElement(-1, status);
1261 }
1262
1263 const int32_t maxWordSize = 20;
1264 UVector32 values(numCodePts, status);
1265 values.setSize(numCodePts);
1266 UVector32 lengths(numCodePts, status);
1267 lengths.setSize(numCodePts);
1268
1269 UText fu = UTEXT_INITIALIZER;
1270 utext_openUnicodeString(&fu, &inString, &status);
1271
1272 // Dynamic programming to find the best segmentation.
1273
1274 // In outer loop, i is the code point index,
1275 // ix is the corresponding string (code unit) index.
1276 // They differ when the string contains supplementary characters.
1277 int32_t ix = 0;
1278 bool is_prev_katakana = false;
1279 for (int32_t i = 0; i < numCodePts; ++i, ix = inString.moveIndex32(ix, 1)) {
1280 if ((uint32_t)bestSnlp.elementAti(i) == kuint32max) {
1281 continue;
1282 }
1283
1284 int32_t count;
1285 utext_setNativeIndex(&fu, ix);
1286 count = fDictionary->matches(&fu, maxWordSize, numCodePts,
1287 NULL, lengths.getBuffer(), values.getBuffer(), NULL);
1288 // Note: lengths is filled with code point lengths
1289 // The NULL parameter is the ignored code unit lengths.
1290
1291 // if there are no single character matches found in the dictionary
1292 // starting with this character, treat character as a 1-character word
1293 // with the highest value possible, i.e. the least likely to occur.
1294 // Exclude Korean characters from this treatment, as they should be left
1295 // together by default.
1296 if ((count == 0 || lengths.elementAti(0) != 1) &&
1297 !fHangulWordSet.contains(inString.char32At(ix))) {
1298 values.setElementAt(maxSnlp, count); // 255
1299 lengths.setElementAt(1, count++);
1300 }
1301
1302 for (int32_t j = 0; j < count; j++) {
1303 uint32_t newSnlp = (uint32_t)bestSnlp.elementAti(i) + (uint32_t)values.elementAti(j);
1304 int32_t ln_j_i = lengths.elementAti(j) + i;
1305 if (newSnlp < (uint32_t)bestSnlp.elementAti(ln_j_i)) {
1306 bestSnlp.setElementAt(newSnlp, ln_j_i);
1307 prev.setElementAt(i, ln_j_i);
1308 }
1309 }
1310
1311 // In Japanese,
1312 // Katakana word in single character is pretty rare. So we apply
1313 // the following heuristic to Katakana: any continuous run of Katakana
1314 // characters is considered a candidate word with a default cost
1315 // specified in the katakanaCost table according to its length.
1316
1317 bool is_katakana = isKatakana(inString.char32At(ix));
1318 int32_t katakanaRunLength = 1;
1319 if (!is_prev_katakana && is_katakana) {
1320 int32_t j = inString.moveIndex32(ix, 1);
1321 // Find the end of the continuous run of Katakana characters
1322 while (j < inString.length() && katakanaRunLength < kMaxKatakanaGroupLength &&
1323 isKatakana(inString.char32At(j))) {
1324 j = inString.moveIndex32(j, 1);
1325 katakanaRunLength++;
1326 }
1327 if (katakanaRunLength < kMaxKatakanaGroupLength) {
1328 uint32_t newSnlp = bestSnlp.elementAti(i) + getKatakanaCost(katakanaRunLength);
1329 if (newSnlp < (uint32_t)bestSnlp.elementAti(i+katakanaRunLength)) {
1330 bestSnlp.setElementAt(newSnlp, i+katakanaRunLength);
1331 prev.setElementAt(i, i+katakanaRunLength); // prev[j] = i;
1332 }
1333 }
1334 }
1335 is_prev_katakana = is_katakana;
1336 }
1337 utext_close(&fu);
1338
1339 // Start pushing the optimal offset index into t_boundary (t for tentative).
1340 // prev[numCodePts] is guaranteed to be meaningful.
1341 // We'll first push in the reverse order, i.e.,
1342 // t_boundary[0] = numCodePts, and afterwards do a swap.
1343 UVector32 t_boundary(numCodePts+1, status);
1344
1345 int32_t numBreaks = 0;
1346 // No segmentation found, set boundary to end of range
1347 if ((uint32_t)bestSnlp.elementAti(numCodePts) == kuint32max) {
1348 t_boundary.addElement(numCodePts, status);
1349 numBreaks++;
1350 } else {
1351 for (int32_t i = numCodePts; i > 0; i = prev.elementAti(i)) {
1352 t_boundary.addElement(i, status);
1353 numBreaks++;
1354 }
1355 U_ASSERT(prev.elementAti(t_boundary.elementAti(numBreaks - 1)) == 0);
1356 }
1357
1358 // Add a break for the start of the dictionary range if there is not one
1359 // there already.
1360 if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) {
1361 t_boundary.addElement(0, status);
1362 numBreaks++;
1363 }
1364
1365 // Now that we're done, convert positions in t_boundary[] (indices in
1366 // the normalized input string) back to indices in the original input UText
1367 // while reversing t_boundary and pushing values to foundBreaks.
1368 int32_t prevCPPos = -1;
1369 int32_t prevUTextPos = -1;
1370 for (int32_t i = numBreaks-1; i >= 0; i--) {
1371 int32_t cpPos = t_boundary.elementAti(i);
1372 U_ASSERT(cpPos > prevCPPos);
1373 int32_t utextPos = inputMap.isValid() ? inputMap->elementAti(cpPos) : cpPos + rangeStart;
1374 U_ASSERT(utextPos >= prevUTextPos);
1375 if (utextPos > prevUTextPos) {
1376 // Boundaries are added to foundBreaks output in ascending order.
1377 U_ASSERT(foundBreaks.size() == 0 || foundBreaks.peeki() < utextPos);
1378 foundBreaks.push(utextPos, status);
1379 } else {
1380 // Normalization expanded the input text, the dictionary found a boundary
1381 // within the expansion, giving two boundaries with the same index in the
1382 // original text. Ignore the second. See ticket #12918.
1383 --numBreaks;
1384 }
1385 prevCPPos = cpPos;
1386 prevUTextPos = utextPos;
1387 }
1388 (void)prevCPPos; // suppress compiler warnings about unused variable
1389
1390 // inString goes out of scope
1391 // inputMap goes out of scope
1392 return numBreaks;
1393 }
1394 #endif
1395
1396 U_NAMESPACE_END
1397
1398 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
1399
1400