1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /**
4 *******************************************************************************
5 * Copyright (C) 2006-2016, International Business Machines Corporation
6 * and others. All Rights Reserved.
7 *******************************************************************************
8 */
9
10 #include "unicode/utypes.h"
11
12 #if !UCONFIG_NO_BREAK_ITERATION
13
14 #include "brkeng.h"
15 #include "dictbe.h"
16 #include "unicode/uniset.h"
17 #include "unicode/chariter.h"
18 #include "unicode/ubrk.h"
19 #include "uvectr32.h"
20 #include "uvector.h"
21 #include "uassert.h"
22 #include "unicode/normlzr.h"
23 #include "cmemory.h"
24 #include "dictionarydata.h"
25
26 U_NAMESPACE_BEGIN
27
28 /*
29 ******************************************************************
30 */
31
DictionaryBreakEngine(uint32_t breakTypes)32 DictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes) {
33 fTypes = breakTypes;
34 }
35
// Empty virtual destructor; concrete engines release their own resources
// (e.g. the adopted DictionaryMatcher) in their own destructors.
DictionaryBreakEngine::~DictionaryBreakEngine() {
}
38
39 UBool
handles(UChar32 c,int32_t breakType) const40 DictionaryBreakEngine::handles(UChar32 c, int32_t breakType) const {
41 return (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)
42 && fSet.contains(c));
43 }
44
45 int32_t
findBreaks(UText * text,int32_t startPos,int32_t endPos,int32_t breakType,UVector32 & foundBreaks) const46 DictionaryBreakEngine::findBreaks( UText *text,
47 int32_t startPos,
48 int32_t endPos,
49 int32_t breakType,
50 UVector32 &foundBreaks ) const {
51 (void)startPos; // TODO: remove this param?
52 int32_t result = 0;
53
54 // Find the span of characters included in the set.
55 // The span to break begins at the current position in the text, and
56 // extends towards the start or end of the text, depending on 'reverse'.
57
58 int32_t start = (int32_t)utext_getNativeIndex(text);
59 int32_t current;
60 int32_t rangeStart;
61 int32_t rangeEnd;
62 UChar32 c = utext_current32(text);
63 while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) {
64 utext_next32(text); // TODO: recast loop for postincrement
65 c = utext_current32(text);
66 }
67 rangeStart = start;
68 rangeEnd = current;
69 if (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)) {
70 result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
71 utext_setNativeIndex(text, current);
72 }
73
74 return result;
75 }
76
77 void
setCharacters(const UnicodeSet & set)78 DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) {
79 fSet = set;
80 // Compact for caching
81 fSet.compact();
82 }
83
84 /*
85 ******************************************************************
86 * PossibleWord
87 */
88
89 // Helper class for improving readability of the Thai/Lao/Khmer word break
90 // algorithm. The implementation is completely inline.
91
// Maximum number of candidate words tracked at one text position, limited by
// the maximum number of words in the dictionary that form a nested sequence.
static const int32_t POSSIBLE_WORD_LIST_MAX = 20;
95
96 class PossibleWord {
97 private:
98 // list of word candidate lengths, in increasing length order
99 // TODO: bytes would be sufficient for word lengths.
100 int32_t count; // Count of candidates
101 int32_t prefix; // The longest match with a dictionary word
102 int32_t offset; // Offset in the text of these candidates
103 int32_t mark; // The preferred candidate's offset
104 int32_t current; // The candidate we're currently looking at
105 int32_t cuLengths[POSSIBLE_WORD_LIST_MAX]; // Word Lengths, in code units.
106 int32_t cpLengths[POSSIBLE_WORD_LIST_MAX]; // Word Lengths, in code points.
107
108 public:
PossibleWord()109 PossibleWord() : count(0), prefix(0), offset(-1), mark(0), current(0) {};
~PossibleWord()110 ~PossibleWord() {};
111
112 // Fill the list of candidates if needed, select the longest, and return the number found
113 int32_t candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd );
114
115 // Select the currently marked candidate, point after it in the text, and invalidate self
116 int32_t acceptMarked( UText *text );
117
118 // Back up from the current candidate to the next shorter one; return TRUE if that exists
119 // and point the text after it
120 UBool backUp( UText *text );
121
122 // Return the longest prefix this candidate location shares with a dictionary word
123 // Return value is in code points.
longestPrefix()124 int32_t longestPrefix() { return prefix; };
125
126 // Mark the current candidate as the one we like
markCurrent()127 void markCurrent() { mark = current; };
128
129 // Get length in code points of the marked word.
markedCPLength()130 int32_t markedCPLength() { return cpLengths[mark]; };
131 };
132
133
candidates(UText * text,DictionaryMatcher * dict,int32_t rangeEnd)134 int32_t PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ) {
135 // TODO: If getIndex is too slow, use offset < 0 and add discardAll()
136 int32_t start = (int32_t)utext_getNativeIndex(text);
137 if (start != offset) {
138 offset = start;
139 count = dict->matches(text, rangeEnd-start, UPRV_LENGTHOF(cuLengths), cuLengths, cpLengths, NULL, &prefix);
140 // Dictionary leaves text after longest prefix, not longest word. Back up.
141 if (count <= 0) {
142 utext_setNativeIndex(text, start);
143 }
144 }
145 if (count > 0) {
146 utext_setNativeIndex(text, start+cuLengths[count-1]);
147 }
148 current = count-1;
149 mark = current;
150 return count;
151 }
152
153 int32_t
acceptMarked(UText * text)154 PossibleWord::acceptMarked( UText *text ) {
155 utext_setNativeIndex(text, offset + cuLengths[mark]);
156 return cuLengths[mark];
157 }
158
159
160 UBool
backUp(UText * text)161 PossibleWord::backUp( UText *text ) {
162 if (current > 0) {
163 utext_setNativeIndex(text, offset + cuLengths[--current]);
164 return TRUE;
165 }
166 return FALSE;
167 }
168
169 /*
170 ******************************************************************
171 * ThaiBreakEngine
172 */
173
// How many words in a row are "good enough" to stop the lookahead?
static const int32_t THAI_LOOKAHEAD = 3;

// Will not combine a non-word with a preceding dictionary word longer than this
// (length measured in code points).
static const int32_t THAI_ROOT_COMBINE_THRESHOLD = 3;

// Will not combine a non-word that shares at least this much prefix with a
// dictionary word, with a preceding word
static const int32_t THAI_PREFIX_COMBINE_THRESHOLD = 3;

// Elision character (U+0E2F THAI CHARACTER PAIYANNOI)
static const int32_t THAI_PAIYANNOI = 0x0E2F;

// Repeat character (U+0E46 THAI CHARACTER MAIYAMOK)
static const int32_t THAI_MAIYAMOK = 0x0E46;

// Minimum word size, in code points
static const int32_t THAI_MIN_WORD = 2;

// Minimum number of characters that could hold two words
static const int32_t THAI_MIN_WORD_SPAN = THAI_MIN_WORD * 2;
195
/**
 * Construct a ThaiBreakEngine for word and line breaks.
 * Adopts (takes ownership of) adoptDictionary; it is deleted by the destructor.
 * If `status` already indicates failure, setCharacters() is skipped and the
 * engine's sets are left as built up to that point.
 */
ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
    : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
      fDictionary(adoptDictionary)
{
    // Characters this engine claims: Thai script with dictionary line-break class.
    fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status);
    if (U_SUCCESS(status)) {
        setCharacters(fThaiWordSet);
    }
    // Combining marks (plus SPACE) that a break position must never precede.
    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status);
    fMarkSet.add(0x0020);
    // Characters that may end a word: everything except pre-positioned vowels.
    fEndWordSet = fThaiWordSet;
    fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
    fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
    // Characters that may begin a word: consonants and pre-positioned vowels.
    fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
    fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
    // Suffix characters handled specially after a found word.
    fSuffixSet.add(THAI_PAIYANNOI);
    fSuffixSet.add(THAI_MAIYAMOK);

    // Compact for caching.
    fMarkSet.compact();
    fEndWordSet.compact();
    fBeginWordSet.compact();
    fSuffixSet.compact();
}
220
// Releases the adopted dictionary.
ThaiBreakEngine::~ThaiBreakEngine() {
    delete fDictionary;
}
224
/**
 * Divide a run of Thai text [rangeStart, rangeEnd) into words using a greedy
 * algorithm with up-to-three-word lookahead over dictionary matches, plus a
 * heuristic resynchronization pass for text not found in the dictionary.
 * Appends each break position (native index) to foundBreaks and returns the
 * number of words found.
 */
int32_t
ThaiBreakEngine::divideUpDictionaryRange( UText *text,
                                          int32_t rangeStart,
                                          int32_t rangeEnd,
                                          UVector32 &foundBreaks ) const {
    // Check the span length in code points (utext_moveIndex32 counts code
    // points even though indexes are native units).
    utext_setNativeIndex(text, rangeStart);
    utext_moveIndex32(text, THAI_MIN_WORD_SPAN);
    if (utext_getNativeIndex(text) >= rangeEnd) {
        return 0;       // Not enough characters for two words
    }
    utext_setNativeIndex(text, rangeStart);


    uint32_t wordsFound = 0;
    int32_t cpWordLength = 0;    // Word Length in Code Points.
    int32_t cuWordLength = 0;    // Word length in code units (UText native indexing)
    int32_t current;
    UErrorCode status = U_ZERO_ERROR;
    // Ring buffer of candidate words at the last few positions, indexed
    // modulo THAI_LOOKAHEAD.
    PossibleWord words[THAI_LOOKAHEAD];

    utext_setNativeIndex(text, rangeStart);

    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
        cpWordLength = 0;
        cuWordLength = 0;

        // Look for candidate words at the current position
        int32_t candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);

        // If we found exactly one, use that
        if (candidates == 1) {
            cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
            cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();
            wordsFound += 1;
        }
        // If there was more than one, see which one can take us forward the most words
        else if (candidates > 1) {
            // If we're already at the end of the range, we're done
            if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                goto foundBest;
            }
            do {
                // NOTE(review): wordsMatched is re-initialized to 1 on every
                // pass of this loop, so the `wordsMatched < 2` test below is
                // always true; verify against upstream whether the
                // declaration was intended to sit outside the loop.
                int32_t wordsMatched = 1;
                if (words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
                    if (wordsMatched < 2) {
                        // Followed by another dictionary word; mark first word as a good candidate
                        words[wordsFound%THAI_LOOKAHEAD].markCurrent();
                        wordsMatched = 2;
                    }

                    // If we're already at the end of the range, we're done
                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                        goto foundBest;
                    }

                    // See if any of the possible second words is followed by a third word
                    do {
                        // If we find a third word, stop right away
                        if (words[(wordsFound + 2) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
                            words[wordsFound % THAI_LOOKAHEAD].markCurrent();
                            goto foundBest;
                        }
                    }
                    while (words[(wordsFound + 1) % THAI_LOOKAHEAD].backUp(text));
                }
            }
            while (words[wordsFound % THAI_LOOKAHEAD].backUp(text));
foundBest:
            // Set UText position to after the accepted word.
            cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
            cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();
            wordsFound += 1;
        }

        // We come here after having either found a word or not. We look ahead to the
        // next word. If it's not a dictionary word, we will combine it with the word we
        // just found (if there is one), but only if the preceding word does not exceed
        // the threshold.
        // The text iterator should now be positioned at the end of the word we found.

        UChar32 uc = 0;
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < THAI_ROOT_COMBINE_THRESHOLD) {
            // if it is a dictionary word, do nothing. If it isn't, then if there is
            // no preceding word, or the non-word shares less than the minimum threshold
            // of characters with a dictionary word, then scan to resynchronize
            if (words[wordsFound % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
                  && (cuWordLength == 0
                      || words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) {
                // Look for a plausible word boundary
                int32_t remaining = rangeEnd - (current+cuWordLength);
                UChar32 pc;
                int32_t chars = 0;
                for (;;) {
                    // Advance one code point; pcSize is its width in native units.
                    int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
                    pc = utext_next32(text);
                    int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
                    chars += pcSize;
                    remaining -= pcSize;
                    if (remaining <= 0) {
                        break;
                    }
                    uc = utext_current32(text);
                    // A boundary is plausible where a word-final character is
                    // followed by a word-initial one.
                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
                        // Maybe. See if it's in the dictionary.
                        // NOTE: In the original Apple code, checked that the next
                        // two characters after uc were not 0x0E4C THANTHAKHAT before
                        // checking the dictionary. That is just a performance filter,
                        // but it's not clear it's faster than checking the trie.
                        int32_t candidates = words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
                        utext_setNativeIndex(text, current + cuWordLength + chars);
                        if (candidates > 0) {
                            break;
                        }
                    }
                }

                // Bump the word count if there wasn't already one
                if (cuWordLength <= 0) {
                    wordsFound += 1;
                }

                // Update the length with the passed-over characters
                cuWordLength += chars;
            }
            else {
                // Back up to where we were for next iteration
                utext_setNativeIndex(text, current+cuWordLength);
            }
        }

        // Never stop before a combining mark.
        int32_t currPos;
        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
            utext_next32(text);
            cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
        }

        // Look ahead for possible suffixes if a dictionary word does not follow.
        // We do this in code rather than using a rule so that the heuristic
        // resynch continues to function. For example, one of the suffix characters
        // could be a typo in the middle of a word.
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cuWordLength > 0) {
            if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
                && fSuffixSet.contains(uc = utext_current32(text))) {
                if (uc == THAI_PAIYANNOI) {
                    // PAIYANNOI (elision) absorbs into the word unless it
                    // follows another suffix character.
                    if (!fSuffixSet.contains(utext_previous32(text))) {
                        // Skip over previous end and PAIYANNOI
                        utext_next32(text);
                        int32_t paiyannoiIndex = (int32_t)utext_getNativeIndex(text);
                        utext_next32(text);
                        cuWordLength += (int32_t)utext_getNativeIndex(text) - paiyannoiIndex;    // Add PAIYANNOI to word
                        uc = utext_current32(text);     // Fetch next character
                    }
                    else {
                        // Restore prior position
                        utext_next32(text);
                    }
                }
                if (uc == THAI_MAIYAMOK) {
                    // MAIYAMOK (repetition) absorbs into the word unless doubled.
                    if (utext_previous32(text) != THAI_MAIYAMOK) {
                        // Skip over previous end and MAIYAMOK
                        utext_next32(text);
                        int32_t maiyamokIndex = (int32_t)utext_getNativeIndex(text);
                        utext_next32(text);
                        cuWordLength += (int32_t)utext_getNativeIndex(text) - maiyamokIndex;    // Add MAIYAMOK to word
                    }
                    else {
                        // Restore prior position
                        utext_next32(text);
                    }
                }
            }
            else {
                utext_setNativeIndex(text, current+cuWordLength);
            }
        }

        // Did we find a word on this iteration? If so, push it on the break stack
        if (cuWordLength > 0) {
            foundBreaks.push((current+cuWordLength), status);
        }
    }

    // Don't return a break for the end of the dictionary range if there is one there.
    // NOTE(review): peeki() is called without an isEmpty() guard — confirm
    // UVector32::peeki is safe (or foundBreaks provably non-empty) here.
    if (foundBreaks.peeki() >= rangeEnd) {
        (void) foundBreaks.popi();
        wordsFound -= 1;
    }

    return wordsFound;
}
416
417 /*
418 ******************************************************************
419 * LaoBreakEngine
420 */
421
// How many words in a row are "good enough" to stop the lookahead?
static const int32_t LAO_LOOKAHEAD = 3;

// Will not combine a non-word with a preceding dictionary word longer than this
// (length measured in code points).
static const int32_t LAO_ROOT_COMBINE_THRESHOLD = 3;

// Will not combine a non-word that shares at least this much prefix with a
// dictionary word, with a preceding word
static const int32_t LAO_PREFIX_COMBINE_THRESHOLD = 3;

// Minimum word size, in code points
static const int32_t LAO_MIN_WORD = 2;

// Minimum number of characters that could hold two words
static const int32_t LAO_MIN_WORD_SPAN = LAO_MIN_WORD * 2;
437
/**
 * Construct a LaoBreakEngine for word and line breaks.
 * Adopts (takes ownership of) adoptDictionary; it is deleted by the destructor.
 * If `status` already indicates failure, setCharacters() is skipped.
 */
LaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
    : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
      fDictionary(adoptDictionary)
{
    // Characters this engine claims: Lao script with dictionary line-break class.
    fLaoWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]]"), status);
    if (U_SUCCESS(status)) {
        setCharacters(fLaoWordSet);
    }
    // Combining marks (plus SPACE) that a break position must never precede.
    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status);
    fMarkSet.add(0x0020);
    // Characters that may end a word: everything except prefix vowels.
    fEndWordSet = fLaoWordSet;
    fEndWordSet.remove(0x0EC0, 0x0EC4);     // prefix vowels
    fBeginWordSet.add(0x0E81, 0x0EAE);      // basic consonants (including holes for corresponding Thai characters)
    fBeginWordSet.add(0x0EDC, 0x0EDD);      // digraph consonants (no Thai equivalent)
    fBeginWordSet.add(0x0EC0, 0x0EC4);      // prefix vowels

    // Compact for caching.
    fMarkSet.compact();
    fEndWordSet.compact();
    fBeginWordSet.compact();
}
459
// Releases the adopted dictionary.
LaoBreakEngine::~LaoBreakEngine() {
    delete fDictionary;
}
463
/**
 * Divide a run of Lao text [rangeStart, rangeEnd) into words. Same greedy
 * three-word-lookahead algorithm as the Thai engine, without the Thai-specific
 * suffix (PAIYANNOI/MAIYAMOK) handling. Appends each break position (native
 * index) to foundBreaks and returns the number of words found.
 */
int32_t
LaoBreakEngine::divideUpDictionaryRange( UText *text,
                                         int32_t rangeStart,
                                         int32_t rangeEnd,
                                         UVector32 &foundBreaks ) const {
    // Note: unlike the Thai engine, the minimum-span check here is on native
    // units, not code points.
    if ((rangeEnd - rangeStart) < LAO_MIN_WORD_SPAN) {
        return 0;       // Not enough characters for two words
    }

    uint32_t wordsFound = 0;
    int32_t cpWordLength = 0;    // Word length in code points.
    int32_t cuWordLength = 0;    // Word length in code units (UText native indexing).
    int32_t current;
    UErrorCode status = U_ZERO_ERROR;
    // Ring buffer of candidate words, indexed modulo LAO_LOOKAHEAD.
    PossibleWord words[LAO_LOOKAHEAD];

    utext_setNativeIndex(text, rangeStart);

    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
        cuWordLength = 0;
        cpWordLength = 0;

        // Look for candidate words at the current position
        int32_t candidates = words[wordsFound%LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);

        // If we found exactly one, use that
        if (candidates == 1) {
            cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
            cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();
            wordsFound += 1;
        }
        // If there was more than one, see which one can take us forward the most words
        else if (candidates > 1) {
            // If we're already at the end of the range, we're done
            if (utext_getNativeIndex(text) >= rangeEnd) {
                goto foundBest;
            }
            do {
                // NOTE(review): wordsMatched is re-initialized each pass, so
                // the `wordsMatched < 2` test is always true — same pattern as
                // the Thai engine; verify against upstream.
                int32_t wordsMatched = 1;
                if (words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
                    if (wordsMatched < 2) {
                        // Followed by another dictionary word; mark first word as a good candidate
                        words[wordsFound%LAO_LOOKAHEAD].markCurrent();
                        wordsMatched = 2;
                    }

                    // If we're already at the end of the range, we're done
                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                        goto foundBest;
                    }

                    // See if any of the possible second words is followed by a third word
                    do {
                        // If we find a third word, stop right away
                        if (words[(wordsFound + 2) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
                            words[wordsFound % LAO_LOOKAHEAD].markCurrent();
                            goto foundBest;
                        }
                    }
                    while (words[(wordsFound + 1) % LAO_LOOKAHEAD].backUp(text));
                }
            }
            while (words[wordsFound % LAO_LOOKAHEAD].backUp(text));
foundBest:
            // Set UText position to after the accepted word.
            cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
            cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();
            wordsFound += 1;
        }

        // We come here after having either found a word or not. We look ahead to the
        // next word. If it's not a dictionary word, we will combine it with the word we
        // just found (if there is one), but only if the preceding word does not exceed
        // the threshold.
        // The text iterator should now be positioned at the end of the word we found.
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < LAO_ROOT_COMBINE_THRESHOLD) {
            // if it is a dictionary word, do nothing. If it isn't, then if there is
            // no preceding word, or the non-word shares less than the minimum threshold
            // of characters with a dictionary word, then scan to resynchronize
            if (words[wordsFound % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
                  && (cuWordLength == 0
                      || words[wordsFound%LAO_LOOKAHEAD].longestPrefix() < LAO_PREFIX_COMBINE_THRESHOLD)) {
                // Look for a plausible word boundary
                int32_t remaining = rangeEnd - (current + cuWordLength);
                UChar32 pc;
                UChar32 uc;
                int32_t chars = 0;
                for (;;) {
                    // Advance one code point; pcSize is its width in native units.
                    int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
                    pc = utext_next32(text);
                    int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
                    chars += pcSize;
                    remaining -= pcSize;
                    if (remaining <= 0) {
                        break;
                    }
                    uc = utext_current32(text);
                    // A boundary is plausible where a word-final character is
                    // followed by a word-initial one.
                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
                        // Maybe. See if it's in the dictionary.
                        // TODO: this looks iffy; compare with old code.
                        int32_t candidates = words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
                        utext_setNativeIndex(text, current + cuWordLength + chars);
                        if (candidates > 0) {
                            break;
                        }
                    }
                }

                // Bump the word count if there wasn't already one
                if (cuWordLength <= 0) {
                    wordsFound += 1;
                }

                // Update the length with the passed-over characters
                cuWordLength += chars;
            }
            else {
                // Back up to where we were for next iteration
                utext_setNativeIndex(text, current + cuWordLength);
            }
        }

        // Never stop before a combining mark.
        int32_t currPos;
        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
            utext_next32(text);
            cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
        }

        // Look ahead for possible suffixes if a dictionary word does not follow.
        // We do this in code rather than using a rule so that the heuristic
        // resynch continues to function. For example, one of the suffix characters
        // could be a typo in the middle of a word.
        // NOT CURRENTLY APPLICABLE TO LAO

        // Did we find a word on this iteration? If so, push it on the break stack
        if (cuWordLength > 0) {
            foundBreaks.push((current+cuWordLength), status);
        }
    }

    // Don't return a break for the end of the dictionary range if there is one there.
    // NOTE(review): peeki() is called without an isEmpty() guard — confirm safety.
    if (foundBreaks.peeki() >= rangeEnd) {
        (void) foundBreaks.popi();
        wordsFound -= 1;
    }

    return wordsFound;
}
612
613 /*
614 ******************************************************************
615 * BurmeseBreakEngine
616 */
617
// How many words in a row are "good enough" to stop the lookahead?
static const int32_t BURMESE_LOOKAHEAD = 3;

// Will not combine a non-word with a preceding dictionary word longer than this
// (length measured in code points).
static const int32_t BURMESE_ROOT_COMBINE_THRESHOLD = 3;

// Will not combine a non-word that shares at least this much prefix with a
// dictionary word, with a preceding word
static const int32_t BURMESE_PREFIX_COMBINE_THRESHOLD = 3;

// Minimum word size, in code points
static const int32_t BURMESE_MIN_WORD = 2;

// Minimum number of characters that could hold two words
static const int32_t BURMESE_MIN_WORD_SPAN = BURMESE_MIN_WORD * 2;
633
/**
 * Construct a BurmeseBreakEngine for word and line breaks.
 * Adopts (takes ownership of) adoptDictionary; it is deleted by the destructor.
 * If `status` already indicates failure, setCharacters() is skipped.
 */
BurmeseBreakEngine::BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
    : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
      fDictionary(adoptDictionary)
{
    // Characters this engine claims: Myanmar script with dictionary line-break class.
    fBurmeseWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]]"), status);
    if (U_SUCCESS(status)) {
        setCharacters(fBurmeseWordSet);
    }
    // Combining marks (plus SPACE) that a break position must never precede.
    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]&[:M:]]"), status);
    fMarkSet.add(0x0020);
    // Any word character may end a word; words begin with consonants or
    // independent vowels.
    fEndWordSet = fBurmeseWordSet;
    fBeginWordSet.add(0x1000, 0x102A);      // basic consonants and independent vowels

    // Compact for caching.
    fMarkSet.compact();
    fEndWordSet.compact();
    fBeginWordSet.compact();
}
652
// Releases the adopted dictionary.
BurmeseBreakEngine::~BurmeseBreakEngine() {
    delete fDictionary;
}
656
/**
 * Divide a run of Burmese text [rangeStart, rangeEnd) into words. Same greedy
 * three-word-lookahead algorithm as the Thai/Lao engines, with no
 * suffix-character handling. Appends each break position (native index) to
 * foundBreaks and returns the number of words found.
 */
int32_t
BurmeseBreakEngine::divideUpDictionaryRange( UText *text,
                                             int32_t rangeStart,
                                             int32_t rangeEnd,
                                             UVector32 &foundBreaks ) const {
    // Minimum-span check on native units (cf. Thai, which counts code points).
    if ((rangeEnd - rangeStart) < BURMESE_MIN_WORD_SPAN) {
        return 0;       // Not enough characters for two words
    }

    uint32_t wordsFound = 0;
    int32_t cpWordLength = 0;    // Word length in code points.
    int32_t cuWordLength = 0;    // Word length in code units (UText native indexing).
    int32_t current;
    UErrorCode status = U_ZERO_ERROR;
    // Ring buffer of candidate words, indexed modulo BURMESE_LOOKAHEAD.
    PossibleWord words[BURMESE_LOOKAHEAD];

    utext_setNativeIndex(text, rangeStart);

    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
        cuWordLength = 0;
        cpWordLength = 0;

        // Look for candidate words at the current position
        int32_t candidates = words[wordsFound%BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);

        // If we found exactly one, use that
        if (candidates == 1) {
            cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);
            cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();
            wordsFound += 1;
        }
        // If there was more than one, see which one can take us forward the most words
        else if (candidates > 1) {
            // If we're already at the end of the range, we're done
            if (utext_getNativeIndex(text) >= rangeEnd) {
                goto foundBest;
            }
            do {
                // NOTE(review): wordsMatched is re-initialized each pass, so
                // the `wordsMatched < 2` test is always true — same pattern as
                // the Thai engine; verify against upstream.
                int32_t wordsMatched = 1;
                if (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
                    if (wordsMatched < 2) {
                        // Followed by another dictionary word; mark first word as a good candidate
                        words[wordsFound%BURMESE_LOOKAHEAD].markCurrent();
                        wordsMatched = 2;
                    }

                    // If we're already at the end of the range, we're done
                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                        goto foundBest;
                    }

                    // See if any of the possible second words is followed by a third word
                    do {
                        // If we find a third word, stop right away
                        if (words[(wordsFound + 2) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
                            words[wordsFound % BURMESE_LOOKAHEAD].markCurrent();
                            goto foundBest;
                        }
                    }
                    while (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].backUp(text));
                }
            }
            while (words[wordsFound % BURMESE_LOOKAHEAD].backUp(text));
foundBest:
            // Set UText position to after the accepted word.
            cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);
            cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();
            wordsFound += 1;
        }

        // We come here after having either found a word or not. We look ahead to the
        // next word. If it's not a dictionary word, we will combine it with the word we
        // just found (if there is one), but only if the preceding word does not exceed
        // the threshold.
        // The text iterator should now be positioned at the end of the word we found.
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < BURMESE_ROOT_COMBINE_THRESHOLD) {
            // if it is a dictionary word, do nothing. If it isn't, then if there is
            // no preceding word, or the non-word shares less than the minimum threshold
            // of characters with a dictionary word, then scan to resynchronize
            if (words[wordsFound % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
                  && (cuWordLength == 0
                      || words[wordsFound%BURMESE_LOOKAHEAD].longestPrefix() < BURMESE_PREFIX_COMBINE_THRESHOLD)) {
                // Look for a plausible word boundary
                int32_t remaining = rangeEnd - (current + cuWordLength);
                UChar32 pc;
                UChar32 uc;
                int32_t chars = 0;
                for (;;) {
                    // Advance one code point; pcSize is its width in native units.
                    int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
                    pc = utext_next32(text);
                    int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
                    chars += pcSize;
                    remaining -= pcSize;
                    if (remaining <= 0) {
                        break;
                    }
                    uc = utext_current32(text);
                    // A boundary is plausible where a word-final character is
                    // followed by a word-initial one.
                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
                        // Maybe. See if it's in the dictionary.
                        // TODO: this looks iffy; compare with old code.
                        int32_t candidates = words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
                        utext_setNativeIndex(text, current + cuWordLength + chars);
                        if (candidates > 0) {
                            break;
                        }
                    }
                }

                // Bump the word count if there wasn't already one
                if (cuWordLength <= 0) {
                    wordsFound += 1;
                }

                // Update the length with the passed-over characters
                cuWordLength += chars;
            }
            else {
                // Back up to where we were for next iteration
                utext_setNativeIndex(text, current + cuWordLength);
            }
        }

        // Never stop before a combining mark.
        int32_t currPos;
        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
            utext_next32(text);
            cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
        }

        // Look ahead for possible suffixes if a dictionary word does not follow.
        // We do this in code rather than using a rule so that the heuristic
        // resynch continues to function. For example, one of the suffix characters
        // could be a typo in the middle of a word.
        // NOT CURRENTLY APPLICABLE TO BURMESE

        // Did we find a word on this iteration? If so, push it on the break stack
        if (cuWordLength > 0) {
            foundBreaks.push((current+cuWordLength), status);
        }
    }

    // Don't return a break for the end of the dictionary range if there is one there.
    // NOTE(review): peeki() is called without an isEmpty() guard — confirm safety.
    if (foundBreaks.peeki() >= rangeEnd) {
        (void) foundBreaks.popi();
        wordsFound -= 1;
    }

    return wordsFound;
}
805
806 /*
807 ******************************************************************
808 * KhmerBreakEngine
809 */
810
// How many words in a row are "good enough" to stop the lookahead?
static const int32_t KHMER_LOOKAHEAD = 3;

// Will not combine a non-word with a preceding dictionary word longer than this
// (length measured in code points).
static const int32_t KHMER_ROOT_COMBINE_THRESHOLD = 3;

// Will not combine a non-word that shares at least this much prefix with a
// dictionary word, with a preceding word
static const int32_t KHMER_PREFIX_COMBINE_THRESHOLD = 3;

// Minimum word size, in code points
static const int32_t KHMER_MIN_WORD = 2;

// Minimum number of characters that could hold two words
static const int32_t KHMER_MIN_WORD_SPAN = KHMER_MIN_WORD * 2;
826
/**
 * Construct a KhmerBreakEngine for word and line breaks.
 * Adopts (takes ownership of) adoptDictionary; it is deleted by the destructor.
 * If `status` already indicates failure, setCharacters() is skipped.
 * The commented-out lines are kept as a record of earlier set tuning
 * (including Thai-derived lines from the engine this was adapted from).
 */
KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
    : DictionaryBreakEngine((1 << UBRK_WORD) | (1 << UBRK_LINE)),
      fDictionary(adoptDictionary)
{
    // Characters this engine claims: Khmer script with dictionary line-break class.
    fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status);
    if (U_SUCCESS(status)) {
        setCharacters(fKhmerWordSet);
    }
    // Combining marks (plus SPACE) that a break position must never precede.
    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status);
    fMarkSet.add(0x0020);
    fEndWordSet = fKhmerWordSet;
    fBeginWordSet.add(0x1780, 0x17B3);      // consonants and independent vowels
    //fBeginWordSet.add(0x17A3, 0x17A4);      // deprecated vowels
    //fEndWordSet.remove(0x17A5, 0x17A9);     // Khmer independent vowels that can't end a word
    //fEndWordSet.remove(0x17B2);             // Khmer independent vowel that can't end a word
    fEndWordSet.remove(0x17D2);             // KHMER SIGN COENG that combines some following characters
    //fEndWordSet.remove(0x17B6, 0x17C5);     // Remove dependent vowels
//    fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
//    fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
//    fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
//    fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
//    fSuffixSet.add(THAI_PAIYANNOI);
//    fSuffixSet.add(THAI_MAIYAMOK);

    // Compact for caching.
    fMarkSet.compact();
    fEndWordSet.compact();
    fBeginWordSet.compact();
//    fSuffixSet.compact();
}
857
~KhmerBreakEngine()858 KhmerBreakEngine::~KhmerBreakEngine() {
859 delete fDictionary;
860 }
861
/*
 * Divide up a run of Khmer dictionary characters into words.
 *
 * Greedy best-fit with a KHMER_LOOKAHEAD-deep lookahead: at each position,
 * collect all dictionary prefixes, prefer the candidate that lets the most
 * following words match, then heuristically resynchronize across non-words.
 *
 * @param text        A UText positioned within the range
 * @param rangeStart  The start of the range of dictionary characters
 * @param rangeEnd    The end (exclusive) of the range of dictionary characters
 * @param foundBreaks Receives the native-index break positions found
 * @return            The number of words (breaks) found
 */
int32_t
KhmerBreakEngine::divideUpDictionaryRange( UText *text,
                                                int32_t rangeStart,
                                                int32_t rangeEnd,
                                                UVector32 &foundBreaks ) const {
    if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
        return 0;       // Not enough characters for two words
    }

    uint32_t wordsFound = 0;
    int32_t cpWordLength = 0;   // word length in code points
    int32_t cuWordLength = 0;   // word length in code units (native indices)
    int32_t current;
    UErrorCode status = U_ZERO_ERROR;
    // Ring buffer of candidate words, indexed by wordsFound % KHMER_LOOKAHEAD.
    PossibleWord words[KHMER_LOOKAHEAD];

    utext_setNativeIndex(text, rangeStart);

    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
        cuWordLength = 0;
        cpWordLength = 0;

        // Look for candidate words at the current position
        int32_t candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);

        // If we found exactly one, use that
        if (candidates == 1) {
            cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
            cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
            wordsFound += 1;
        }

        // If there was more than one, see which one can take us forward the most words
        else if (candidates > 1) {
            // If we're already at the end of the range, we're done
            if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                goto foundBest;
            }
            do {
                // NOTE(review): wordsMatched is re-initialized on every
                // iteration of this do-loop, so the 'wordsMatched < 2' test
                // below is always true and markCurrent() runs for every
                // candidate that is followed by a word — confirm this is the
                // intended marking behavior (the declaration may have been
                // meant to sit outside the loop).
                int32_t wordsMatched = 1;
                if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
                    if (wordsMatched < 2) {
                        // Followed by another dictionary word; mark first word as a good candidate
                        words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
                        wordsMatched = 2;
                    }

                    // If we're already at the end of the range, we're done
                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                        goto foundBest;
                    }

                    // See if any of the possible second words is followed by a third word
                    do {
                        // If we find a third word, stop right away
                        if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
                            words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
                            goto foundBest;
                        }
                    }
                    while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text));
                }
            }
            while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text));
foundBest:
            // Commit the marked candidate; the UText cursor is left at its end.
            cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
            cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
            wordsFound += 1;
        }

        // We come here after having either found a word or not. We look ahead to the
        // next word. If it's not a dictionary word, we will combine it with the word we
        // just found (if there is one), but only if the preceding word does not exceed
        // the threshold.
        // The text iterator should now be positioned at the end of the word we found.
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < KHMER_ROOT_COMBINE_THRESHOLD) {
            // if it is a dictionary word, do nothing. If it isn't, then if there is
            // no preceding word, or the non-word shares less than the minimum threshold
            // of characters with a dictionary word, then scan to resynchronize
            if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
                  && (cuWordLength == 0
                      || words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) {
                // Look for a plausible word boundary
                int32_t remaining = rangeEnd - (current+cuWordLength);
                UChar32 pc;
                UChar32 uc;
                int32_t chars = 0;  // code units passed over during resync
                for (;;) {
                    int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
                    pc = utext_next32(text);
                    int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
                    chars += pcSize;
                    remaining -= pcSize;
                    if (remaining <= 0) {
                        break;
                    }
                    uc = utext_current32(text);
                    // A plausible boundary: a word-ending character followed by
                    // a word-beginning character.
                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
                        // Maybe. See if it's in the dictionary.
                        int32_t candidates = words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
                        utext_setNativeIndex(text, current+cuWordLength+chars);
                        if (candidates > 0) {
                            break;
                        }
                    }
                }

                // Bump the word count if there wasn't already one
                if (cuWordLength <= 0) {
                    wordsFound += 1;
                }

                // Update the length with the passed-over characters
                cuWordLength += chars;
            }
            else {
                // Back up to where we were for next iteration
                utext_setNativeIndex(text, current+cuWordLength);
            }
        }

        // Never stop before a combining mark.
        int32_t currPos;
        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
            utext_next32(text);
            cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
        }

        // Look ahead for possible suffixes if a dictionary word does not follow.
        // We do this in code rather than using a rule so that the heuristic
        // resynch continues to function. For example, one of the suffix characters
        // could be a typo in the middle of a word.
//        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
//            if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
//                && fSuffixSet.contains(uc = utext_current32(text))) {
//                if (uc == KHMER_PAIYANNOI) {
//                    if (!fSuffixSet.contains(utext_previous32(text))) {
//                        // Skip over previous end and PAIYANNOI
//                        utext_next32(text);
//                        utext_next32(text);
//                        wordLength += 1;            // Add PAIYANNOI to word
//                        uc = utext_current32(text);     // Fetch next character
//                    }
//                    else {
//                        // Restore prior position
//                        utext_next32(text);
//                    }
//                }
//                if (uc == KHMER_MAIYAMOK) {
//                    if (utext_previous32(text) != KHMER_MAIYAMOK) {
//                        // Skip over previous end and MAIYAMOK
//                        utext_next32(text);
//                        utext_next32(text);
//                        wordLength += 1;            // Add MAIYAMOK to word
//                    }
//                    else {
//                        // Restore prior position
//                        utext_next32(text);
//                    }
//                }
//            }
//            else {
//                utext_setNativeIndex(text, current+wordLength);
//            }
//        }

        // Did we find a word on this iteration? If so, push it on the break stack
        if (cuWordLength > 0) {
            foundBreaks.push((current+cuWordLength), status);
        }
    }

    // Don't return a break for the end of the dictionary range if there is one there.
    // NOTE(review): relies on UVector32::peeki() being benign when foundBreaks
    // is empty (out-of-range access returns 0, which is < rangeEnd) — confirm.
    if (foundBreaks.peeki() >= rangeEnd) {
        (void) foundBreaks.popi();
        wordsFound -= 1;
    }

    return wordsFound;
}
1042
1043 #if !UCONFIG_NO_NORMALIZATION
1044 /*
1045 ******************************************************************
1046 * CjkBreakEngine
1047 */
1048 static const uint32_t kuint32max = 0xFFFFFFFF;
CjkBreakEngine(DictionaryMatcher * adoptDictionary,LanguageType type,UErrorCode & status)1049 CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status)
1050 : DictionaryBreakEngine(1 << UBRK_WORD), fDictionary(adoptDictionary) {
1051 // Korean dictionary only includes Hangul syllables
1052 fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status);
1053 fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status);
1054 fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status);
1055 fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status);
1056 nfkcNorm2 = Normalizer2::getNFKCInstance(status);
1057
1058 if (U_SUCCESS(status)) {
1059 // handle Korean and Japanese/Chinese using different dictionaries
1060 if (type == kKorean) {
1061 setCharacters(fHangulWordSet);
1062 } else { //Chinese and Japanese
1063 UnicodeSet cjSet;
1064 cjSet.addAll(fHanWordSet);
1065 cjSet.addAll(fKatakanaWordSet);
1066 cjSet.addAll(fHiraganaWordSet);
1067 cjSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
1068 cjSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK
1069 setCharacters(cjSet);
1070 }
1071 }
1072 }
1073
~CjkBreakEngine()1074 CjkBreakEngine::~CjkBreakEngine(){
1075 delete fDictionary;
1076 }
1077
// The katakanaCost values below are based on the length frequencies of all
// katakana phrases in the dictionary
static const int32_t kMaxKatakanaLength = 8;
static const int32_t kMaxKatakanaGroupLength = 20;
static const uint32_t maxSnlp = 255;

// Default cost assigned to a run of Katakana of the given code-point length.
// Lengths beyond kMaxKatakanaLength get the maximum (least likely) cost.
static inline uint32_t getKatakanaCost(int32_t wordLength){
    //TODO: fill array with actual values from dictionary!
    static const uint32_t katakanaCost[kMaxKatakanaLength + 1] =
        {8192, 984, 408, 240, 204, 252, 300, 372, 480};
    if (wordLength > kMaxKatakanaLength) {
        return 8192;
    }
    return katakanaCost[wordLength];
}
1090
isKatakana(UChar32 value)1091 static inline bool isKatakana(UChar32 value) {
1092 return (value >= 0x30A1 && value <= 0x30FE && value != 0x30FB) ||
1093 (value >= 0xFF66 && value <= 0xFF9f);
1094 }
1095
1096
1097 // Function for accessing internal utext flags.
1098 // Replicates an internal UText function.
1099
utext_i32_flag(int32_t bitIndex)1100 static inline int32_t utext_i32_flag(int32_t bitIndex) {
1101 return (int32_t)1 << bitIndex;
1102 }
1103
1104
1105 /*
1106 * @param text A UText representing the text
1107 * @param rangeStart The start of the range of dictionary characters
1108 * @param rangeEnd The end of the range of dictionary characters
1109 * @param foundBreaks vector<int32> to receive the break positions
1110 * @return The number of breaks found
1111 */
1112 int32_t
divideUpDictionaryRange(UText * inText,int32_t rangeStart,int32_t rangeEnd,UVector32 & foundBreaks) const1113 CjkBreakEngine::divideUpDictionaryRange( UText *inText,
1114 int32_t rangeStart,
1115 int32_t rangeEnd,
1116 UVector32 &foundBreaks ) const {
1117 if (rangeStart >= rangeEnd) {
1118 return 0;
1119 }
1120
1121 // UnicodeString version of input UText, NFKC normalized if necessary.
1122 UnicodeString inString;
1123
1124 // inputMap[inStringIndex] = corresponding native index from UText inText.
1125 // If NULL then mapping is 1:1
1126 LocalPointer<UVector32> inputMap;
1127
1128 UErrorCode status = U_ZERO_ERROR;
1129
1130
1131 // if UText has the input string as one contiguous UTF-16 chunk
1132 if ((inText->providerProperties & utext_i32_flag(UTEXT_PROVIDER_STABLE_CHUNKS)) &&
1133 inText->chunkNativeStart <= rangeStart &&
1134 inText->chunkNativeLimit >= rangeEnd &&
1135 inText->nativeIndexingLimit >= rangeEnd - inText->chunkNativeStart) {
1136
1137 // Input UText is in one contiguous UTF-16 chunk.
1138 // Use Read-only aliasing UnicodeString.
1139 inString.setTo(FALSE,
1140 inText->chunkContents + rangeStart - inText->chunkNativeStart,
1141 rangeEnd - rangeStart);
1142 } else {
1143 // Copy the text from the original inText (UText) to inString (UnicodeString).
1144 // Create a map from UnicodeString indices -> UText offsets.
1145 utext_setNativeIndex(inText, rangeStart);
1146 int32_t limit = rangeEnd;
1147 U_ASSERT(limit <= utext_nativeLength(inText));
1148 if (limit > utext_nativeLength(inText)) {
1149 limit = (int32_t)utext_nativeLength(inText);
1150 }
1151 inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status);
1152 if (U_FAILURE(status)) {
1153 return 0;
1154 }
1155 while (utext_getNativeIndex(inText) < limit) {
1156 int32_t nativePosition = (int32_t)utext_getNativeIndex(inText);
1157 UChar32 c = utext_next32(inText);
1158 U_ASSERT(c != U_SENTINEL);
1159 inString.append(c);
1160 while (inputMap->size() < inString.length()) {
1161 inputMap->addElement(nativePosition, status);
1162 }
1163 }
1164 inputMap->addElement(limit, status);
1165 }
1166
1167
1168 if (!nfkcNorm2->isNormalized(inString, status)) {
1169 UnicodeString normalizedInput;
1170 // normalizedMap[normalizedInput position] == original UText position.
1171 LocalPointer<UVector32> normalizedMap(new UVector32(status), status);
1172 if (U_FAILURE(status)) {
1173 return 0;
1174 }
1175
1176 UnicodeString fragment;
1177 UnicodeString normalizedFragment;
1178 for (int32_t srcI = 0; srcI < inString.length();) { // Once per normalization chunk
1179 fragment.remove();
1180 int32_t fragmentStartI = srcI;
1181 UChar32 c = inString.char32At(srcI);
1182 for (;;) {
1183 fragment.append(c);
1184 srcI = inString.moveIndex32(srcI, 1);
1185 if (srcI == inString.length()) {
1186 break;
1187 }
1188 c = inString.char32At(srcI);
1189 if (nfkcNorm2->hasBoundaryBefore(c)) {
1190 break;
1191 }
1192 }
1193 nfkcNorm2->normalize(fragment, normalizedFragment, status);
1194 normalizedInput.append(normalizedFragment);
1195
1196 // Map every position in the normalized chunk to the start of the chunk
1197 // in the original input.
1198 int32_t fragmentOriginalStart = inputMap.isValid() ?
1199 inputMap->elementAti(fragmentStartI) : fragmentStartI+rangeStart;
1200 while (normalizedMap->size() < normalizedInput.length()) {
1201 normalizedMap->addElement(fragmentOriginalStart, status);
1202 if (U_FAILURE(status)) {
1203 break;
1204 }
1205 }
1206 }
1207 U_ASSERT(normalizedMap->size() == normalizedInput.length());
1208 int32_t nativeEnd = inputMap.isValid() ?
1209 inputMap->elementAti(inString.length()) : inString.length()+rangeStart;
1210 normalizedMap->addElement(nativeEnd, status);
1211
1212 inputMap.moveFrom(normalizedMap);
1213 inString.moveFrom(normalizedInput);
1214 }
1215
1216 int32_t numCodePts = inString.countChar32();
1217 if (numCodePts != inString.length()) {
1218 // There are supplementary characters in the input.
1219 // The dictionary will produce boundary positions in terms of code point indexes,
1220 // not in terms of code unit string indexes.
1221 // Use the inputMap mechanism to take care of this in addition to indexing differences
1222 // from normalization and/or UTF-8 input.
1223 UBool hadExistingMap = inputMap.isValid();
1224 if (!hadExistingMap) {
1225 inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status);
1226 if (U_FAILURE(status)) {
1227 return 0;
1228 }
1229 }
1230 int32_t cpIdx = 0;
1231 for (int32_t cuIdx = 0; ; cuIdx = inString.moveIndex32(cuIdx, 1)) {
1232 U_ASSERT(cuIdx >= cpIdx);
1233 if (hadExistingMap) {
1234 inputMap->setElementAt(inputMap->elementAti(cuIdx), cpIdx);
1235 } else {
1236 inputMap->addElement(cuIdx+rangeStart, status);
1237 }
1238 cpIdx++;
1239 if (cuIdx == inString.length()) {
1240 break;
1241 }
1242 }
1243 }
1244
1245 // bestSnlp[i] is the snlp of the best segmentation of the first i
1246 // code points in the range to be matched.
1247 UVector32 bestSnlp(numCodePts + 1, status);
1248 bestSnlp.addElement(0, status);
1249 for(int32_t i = 1; i <= numCodePts; i++) {
1250 bestSnlp.addElement(kuint32max, status);
1251 }
1252
1253
1254 // prev[i] is the index of the last CJK code point in the previous word in
1255 // the best segmentation of the first i characters.
1256 UVector32 prev(numCodePts + 1, status);
1257 for(int32_t i = 0; i <= numCodePts; i++){
1258 prev.addElement(-1, status);
1259 }
1260
1261 const int32_t maxWordSize = 20;
1262 UVector32 values(numCodePts, status);
1263 values.setSize(numCodePts);
1264 UVector32 lengths(numCodePts, status);
1265 lengths.setSize(numCodePts);
1266
1267 UText fu = UTEXT_INITIALIZER;
1268 utext_openUnicodeString(&fu, &inString, &status);
1269
1270 // Dynamic programming to find the best segmentation.
1271
1272 // In outer loop, i is the code point index,
1273 // ix is the corresponding string (code unit) index.
1274 // They differ when the string contains supplementary characters.
1275 int32_t ix = 0;
1276 bool is_prev_katakana = false;
1277 for (int32_t i = 0; i < numCodePts; ++i, ix = inString.moveIndex32(ix, 1)) {
1278 if ((uint32_t)bestSnlp.elementAti(i) == kuint32max) {
1279 continue;
1280 }
1281
1282 int32_t count;
1283 utext_setNativeIndex(&fu, ix);
1284 count = fDictionary->matches(&fu, maxWordSize, numCodePts,
1285 NULL, lengths.getBuffer(), values.getBuffer(), NULL);
1286 // Note: lengths is filled with code point lengths
1287 // The NULL parameter is the ignored code unit lengths.
1288
1289 // if there are no single character matches found in the dictionary
1290 // starting with this character, treat character as a 1-character word
1291 // with the highest value possible, i.e. the least likely to occur.
1292 // Exclude Korean characters from this treatment, as they should be left
1293 // together by default.
1294 if ((count == 0 || lengths.elementAti(0) != 1) &&
1295 !fHangulWordSet.contains(inString.char32At(ix))) {
1296 values.setElementAt(maxSnlp, count); // 255
1297 lengths.setElementAt(1, count++);
1298 }
1299
1300 for (int32_t j = 0; j < count; j++) {
1301 uint32_t newSnlp = (uint32_t)bestSnlp.elementAti(i) + (uint32_t)values.elementAti(j);
1302 int32_t ln_j_i = lengths.elementAti(j) + i;
1303 if (newSnlp < (uint32_t)bestSnlp.elementAti(ln_j_i)) {
1304 bestSnlp.setElementAt(newSnlp, ln_j_i);
1305 prev.setElementAt(i, ln_j_i);
1306 }
1307 }
1308
1309 // In Japanese,
1310 // Katakana word in single character is pretty rare. So we apply
1311 // the following heuristic to Katakana: any continuous run of Katakana
1312 // characters is considered a candidate word with a default cost
1313 // specified in the katakanaCost table according to its length.
1314
1315 bool is_katakana = isKatakana(inString.char32At(ix));
1316 int32_t katakanaRunLength = 1;
1317 if (!is_prev_katakana && is_katakana) {
1318 int32_t j = inString.moveIndex32(ix, 1);
1319 // Find the end of the continuous run of Katakana characters
1320 while (j < inString.length() && katakanaRunLength < kMaxKatakanaGroupLength &&
1321 isKatakana(inString.char32At(j))) {
1322 j = inString.moveIndex32(j, 1);
1323 katakanaRunLength++;
1324 }
1325 if (katakanaRunLength < kMaxKatakanaGroupLength) {
1326 uint32_t newSnlp = bestSnlp.elementAti(i) + getKatakanaCost(katakanaRunLength);
1327 if (newSnlp < (uint32_t)bestSnlp.elementAti(j)) {
1328 bestSnlp.setElementAt(newSnlp, j);
1329 prev.setElementAt(i, i+katakanaRunLength); // prev[j] = i;
1330 }
1331 }
1332 }
1333 is_prev_katakana = is_katakana;
1334 }
1335 utext_close(&fu);
1336
1337 // Start pushing the optimal offset index into t_boundary (t for tentative).
1338 // prev[numCodePts] is guaranteed to be meaningful.
1339 // We'll first push in the reverse order, i.e.,
1340 // t_boundary[0] = numCodePts, and afterwards do a swap.
1341 UVector32 t_boundary(numCodePts+1, status);
1342
1343 int32_t numBreaks = 0;
1344 // No segmentation found, set boundary to end of range
1345 if ((uint32_t)bestSnlp.elementAti(numCodePts) == kuint32max) {
1346 t_boundary.addElement(numCodePts, status);
1347 numBreaks++;
1348 } else {
1349 for (int32_t i = numCodePts; i > 0; i = prev.elementAti(i)) {
1350 t_boundary.addElement(i, status);
1351 numBreaks++;
1352 }
1353 U_ASSERT(prev.elementAti(t_boundary.elementAti(numBreaks - 1)) == 0);
1354 }
1355
1356 // Add a break for the start of the dictionary range if there is not one
1357 // there already.
1358 if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) {
1359 t_boundary.addElement(0, status);
1360 numBreaks++;
1361 }
1362
1363 // Now that we're done, convert positions in t_boundary[] (indices in
1364 // the normalized input string) back to indices in the original input UText
1365 // while reversing t_boundary and pushing values to foundBreaks.
1366 int32_t prevCPPos = -1;
1367 int32_t prevUTextPos = -1;
1368 for (int32_t i = numBreaks-1; i >= 0; i--) {
1369 int32_t cpPos = t_boundary.elementAti(i);
1370 U_ASSERT(cpPos > prevCPPos);
1371 int32_t utextPos = inputMap.isValid() ? inputMap->elementAti(cpPos) : cpPos + rangeStart;
1372 U_ASSERT(utextPos >= prevUTextPos);
1373 if (utextPos > prevUTextPos) {
1374 // Boundaries are added to foundBreaks output in ascending order.
1375 U_ASSERT(foundBreaks.size() == 0 || foundBreaks.peeki() < utextPos);
1376 foundBreaks.push(utextPos, status);
1377 } else {
1378 // Normalization expanded the input text, the dictionary found a boundary
1379 // within the expansion, giving two boundaries with the same index in the
1380 // original text. Ignore the second. See ticket #12918.
1381 --numBreaks;
1382 }
1383 prevCPPos = cpPos;
1384 prevUTextPos = utextPos;
1385 }
1386 (void)prevCPPos; // suppress compiler warnings about unused variable
1387
1388 // inString goes out of scope
1389 // inputMap goes out of scope
1390 return numBreaks;
1391 }
1392 #endif
1393
1394 U_NAMESPACE_END
1395
1396 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
1397
1398