1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 * Copyright (C) 2014-2015, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 *******************************************************************************
8 */
9
10 #include "unicode/utypes.h"
11 #if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_FILTERED_BREAK_ITERATION
12
13 #include "cmemory.h"
14
15 #include "unicode/filteredbrk.h"
16 #include "unicode/ucharstriebuilder.h"
17 #include "unicode/ures.h"
18
19 #include "uresimp.h" // ures_getByKeyWithFallback
20 #include "ubrkimpl.h" // U_ICUDATA_BRKITR
21 #include "uvector.h"
22 #include "cmemory.h"
23 #include "umutex.h"
24
25 U_NAMESPACE_BEGIN
26
27 #ifndef FB_DEBUG
28 #define FB_DEBUG 0
29 #endif
30
31 #if FB_DEBUG
32 #include <stdio.h>
_fb_trace(const char * m,const UnicodeString * s,UBool b,int32_t d,const char * f,int l)33 static void _fb_trace(const char *m, const UnicodeString *s, UBool b, int32_t d, const char *f, int l) {
34 char buf[2048];
35 if(s) {
36 s->extract(0,s->length(),buf,2048);
37 } else {
38 strcpy(buf,"NULL");
39 }
40 fprintf(stderr,"%s:%d: %s. s='%s'(%p), b=%c, d=%d\n",
41 f, l, m, buf, (const void*)s, b?'T':'F',(int)d);
42 }
43
44 #define FB_TRACE(m,s,b,d) _fb_trace(m,s,b,d,__FILE__,__LINE__)
45 #else
46 #define FB_TRACE(m,s,b,d)
47 #endif
48
49 /**
50 * Used with sortedInsert()
51 */
compareUnicodeString(UElement t1,UElement t2)52 static int8_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) {
53 const UnicodeString &a = *(const UnicodeString*)t1.pointer;
54 const UnicodeString &b = *(const UnicodeString*)t2.pointer;
55 return a.compare(b);
56 }
57
58 /**
59 * A UVector which implements a set of strings.
60 */
61 class U_COMMON_API UStringSet : public UVector {
62 public:
UStringSet(UErrorCode & status)63 UStringSet(UErrorCode &status) : UVector(uprv_deleteUObject,
64 uhash_compareUnicodeString,
65 1,
66 status) {}
67 virtual ~UStringSet();
68 /**
69 * Is this UnicodeSet contained?
70 */
contains(const UnicodeString & s)71 inline UBool contains(const UnicodeString& s) {
72 return contains((void*) &s);
73 }
74 using UVector::contains;
75 /**
76 * Return the ith UnicodeString alias
77 */
getStringAt(int32_t i) const78 inline const UnicodeString* getStringAt(int32_t i) const {
79 return (const UnicodeString*)elementAt(i);
80 }
81 /**
82 * Adopt the UnicodeString if not already contained.
83 * Caller no longer owns the pointer in any case.
84 * @return true if adopted successfully, false otherwise (error, or else duplicate)
85 */
adopt(UnicodeString * str,UErrorCode & status)86 inline UBool adopt(UnicodeString *str, UErrorCode &status) {
87 if(U_FAILURE(status) || contains(*str)) {
88 delete str;
89 return false;
90 } else {
91 sortedInsert(str, compareUnicodeString, status);
92 if(U_FAILURE(status)) {
93 delete str;
94 return false;
95 }
96 return true;
97 }
98 }
99 /**
100 * Add by value.
101 * @return true if successfully adopted.
102 */
add(const UnicodeString & str,UErrorCode & status)103 inline UBool add(const UnicodeString& str, UErrorCode &status) {
104 if(U_FAILURE(status)) return false;
105 UnicodeString *t = new UnicodeString(str);
106 if(t==NULL) {
107 status = U_MEMORY_ALLOCATION_ERROR; return false;
108 }
109 return adopt(t, status);
110 }
111 /**
112 * Remove this string.
113 * @return true if successfully removed, false otherwise (error, or else it wasn't there)
114 */
remove(const UnicodeString & s,UErrorCode & status)115 inline UBool remove(const UnicodeString &s, UErrorCode &status) {
116 if(U_FAILURE(status)) return false;
117 return removeElement((void*) &s);
118 }
119 };
120
121 /**
122 * Virtual, won't be inlined
123 */
~UStringSet()124 UStringSet::~UStringSet() {}
125
126 /* ----------------------------------------------------------- */
127
128
/* Filtered Break constants */
// kPARTIAL / kMATCH: values stored with strings in the tries and compared
// against the trie value in breakExceptionAt().
static const int32_t kPARTIAL = (1<<0); ///< partial - need to run through forward trie
static const int32_t kMATCH = (1<<1); ///< exact match - skip this one.
// kSuppressInReverse / kAddToForward: per-string flags kept in the partials[]
// array while build() sorts strings into the two tries.
static const int32_t kSuppressInReverse = (1<<0);
static const int32_t kAddToForward = (1<<1);
// Character searched for by build() when looking for partial abbreviations.
static const UChar kFULLSTOP = 0x002E; // '.'
135
136 /**
137 * Shared data for SimpleFilteredSentenceBreakIterator
138 */
139 class SimpleFilteredSentenceBreakData : public UMemory {
140 public:
SimpleFilteredSentenceBreakData(UCharsTrie * forwards,UCharsTrie * backwards)141 SimpleFilteredSentenceBreakData(UCharsTrie *forwards, UCharsTrie *backwards )
142 : fForwardsPartialTrie(forwards), fBackwardsTrie(backwards), refcount(1) { }
incr()143 SimpleFilteredSentenceBreakData *incr() {
144 umtx_atomic_inc(&refcount);
145 return this;
146 }
decr()147 SimpleFilteredSentenceBreakData *decr() {
148 if(umtx_atomic_dec(&refcount) <= 0) {
149 delete this;
150 }
151 return 0;
152 }
153 virtual ~SimpleFilteredSentenceBreakData();
154
hasForwardsPartialTrie() const155 bool hasForwardsPartialTrie() const { return fForwardsPartialTrie.isValid(); }
hasBackwardsTrie() const156 bool hasBackwardsTrie() const { return fBackwardsTrie.isValid(); }
157
getForwardsPartialTrie() const158 const UCharsTrie &getForwardsPartialTrie() const { return *fForwardsPartialTrie; }
getBackwardsTrie() const159 const UCharsTrie &getBackwardsTrie() const { return *fBackwardsTrie; }
160
161 private:
162 // These tries own their data arrays.
163 // They are shared and must therefore not be modified.
164 LocalPointer<UCharsTrie> fForwardsPartialTrie; // Has ".a" for "a.M."
165 LocalPointer<UCharsTrie> fBackwardsTrie; // i.e. ".srM" for Mrs.
166 u_atomic_int32_t refcount;
167 };
168
~SimpleFilteredSentenceBreakData()169 SimpleFilteredSentenceBreakData::~SimpleFilteredSentenceBreakData() {}
170
171 /**
172 * Concrete implementation
173 */
174 class SimpleFilteredSentenceBreakIterator : public BreakIterator {
175 public:
176 SimpleFilteredSentenceBreakIterator(BreakIterator *adopt, UCharsTrie *forwards, UCharsTrie *backwards, UErrorCode &status);
177 SimpleFilteredSentenceBreakIterator(const SimpleFilteredSentenceBreakIterator& other);
178 virtual ~SimpleFilteredSentenceBreakIterator();
179 private:
180 SimpleFilteredSentenceBreakData *fData;
181 LocalPointer<BreakIterator> fDelegate;
182 LocalUTextPointer fText;
183
184 /* -- subclass interface -- */
185 public:
186 /* -- cloning and other subclass stuff -- */
createBufferClone(void *,int32_t &,UErrorCode & status)187 virtual BreakIterator * createBufferClone(void * /*stackBuffer*/,
188 int32_t &/*BufferSize*/,
189 UErrorCode &status) {
190 // for now - always deep clone
191 status = U_SAFECLONE_ALLOCATED_WARNING;
192 return clone();
193 }
clone() const194 virtual SimpleFilteredSentenceBreakIterator* clone() const { return new SimpleFilteredSentenceBreakIterator(*this); }
getDynamicClassID(void) const195 virtual UClassID getDynamicClassID(void) const { return NULL; }
operator ==(const BreakIterator & o) const196 virtual UBool operator==(const BreakIterator& o) const { if(this==&o) return true; return false; }
197
198 /* -- text modifying -- */
setText(UText * text,UErrorCode & status)199 virtual void setText(UText *text, UErrorCode &status) { fDelegate->setText(text,status); }
refreshInputText(UText * input,UErrorCode & status)200 virtual BreakIterator &refreshInputText(UText *input, UErrorCode &status) { fDelegate->refreshInputText(input,status); return *this; }
adoptText(CharacterIterator * it)201 virtual void adoptText(CharacterIterator* it) { fDelegate->adoptText(it); }
setText(const UnicodeString & text)202 virtual void setText(const UnicodeString &text) { fDelegate->setText(text); }
203
204 /* -- other functions that are just delegated -- */
getUText(UText * fillIn,UErrorCode & status) const205 virtual UText *getUText(UText *fillIn, UErrorCode &status) const { return fDelegate->getUText(fillIn,status); }
getText(void) const206 virtual CharacterIterator& getText(void) const { return fDelegate->getText(); }
207
208 /* -- ITERATION -- */
209 virtual int32_t first(void);
210 virtual int32_t preceding(int32_t offset);
211 virtual int32_t previous(void);
212 virtual UBool isBoundary(int32_t offset);
current(void) const213 virtual int32_t current(void) const { return fDelegate->current(); } // we keep the delegate current, so this should be correct.
214
215 virtual int32_t next(void);
216
217 virtual int32_t next(int32_t n);
218 virtual int32_t following(int32_t offset);
219 virtual int32_t last(void);
220
221 private:
222 /**
223 * Given that the fDelegate has already given its "initial" answer,
224 * find the NEXT actual (non-excepted) break.
225 * @param n initial position from delegate
226 * @return new break position or UBRK_DONE
227 */
228 int32_t internalNext(int32_t n);
229 /**
230 * Given that the fDelegate has already given its "initial" answer,
231 * find the PREV actual (non-excepted) break.
232 * @param n initial position from delegate
233 * @return new break position or UBRK_DONE
234 */
235 int32_t internalPrev(int32_t n);
236 /**
237 * set up the UText with the value of the fDelegate.
238 * Call this before calling breakExceptionAt.
239 * May be able to avoid excess calls
240 */
241 void resetState(UErrorCode &status);
242 /**
243 * Is there a match (exception) at this spot?
244 */
245 enum EFBMatchResult { kNoExceptionHere, kExceptionHere };
246 /**
247 * Determine if there is an exception at this spot
248 * @param n spot to check
249 * @return kNoExceptionHere or kExceptionHere
250 **/
251 enum EFBMatchResult breakExceptionAt(int32_t n);
252 };
253
/**
 * Copy constructor: shares the reference-counted trie data with `other`
 * and clones its delegate. fText is left empty; resetState() fills it
 * before it is needed.
 *
 * The primary constructor leaves fData null when its allocation fails, so
 * both pointers are checked before being dereferenced.
 */
SimpleFilteredSentenceBreakIterator::SimpleFilteredSentenceBreakIterator(const SimpleFilteredSentenceBreakIterator& other)
  : BreakIterator(other),
    fData(other.fData != nullptr ? other.fData->incr() : nullptr),
    fDelegate(other.fDelegate.isValid() ? other.fDelegate->clone() : nullptr)
{
}
258
259
SimpleFilteredSentenceBreakIterator(BreakIterator * adopt,UCharsTrie * forwards,UCharsTrie * backwards,UErrorCode & status)260 SimpleFilteredSentenceBreakIterator::SimpleFilteredSentenceBreakIterator(BreakIterator *adopt, UCharsTrie *forwards, UCharsTrie *backwards, UErrorCode &status) :
261 BreakIterator(adopt->getLocale(ULOC_VALID_LOCALE,status),adopt->getLocale(ULOC_ACTUAL_LOCALE,status)),
262 fData(new SimpleFilteredSentenceBreakData(forwards, backwards)),
263 fDelegate(adopt)
264 {
265 if (fData == nullptr) {
266 delete forwards;
267 delete backwards;
268 if (U_SUCCESS(status)) {
269 status = U_MEMORY_ALLOCATION_ERROR;
270 }
271 }
272 }
273
~SimpleFilteredSentenceBreakIterator()274 SimpleFilteredSentenceBreakIterator::~SimpleFilteredSentenceBreakIterator() {
275 fData = fData->decr();
276 }
277
resetState(UErrorCode & status)278 void SimpleFilteredSentenceBreakIterator::resetState(UErrorCode &status) {
279 fText.adoptInstead(fDelegate->getUText(fText.orphan(), status));
280 }
281
282 SimpleFilteredSentenceBreakIterator::EFBMatchResult
breakExceptionAt(int32_t n)283 SimpleFilteredSentenceBreakIterator::breakExceptionAt(int32_t n) {
284 int64_t bestPosn = -1;
285 int32_t bestValue = -1;
286 // loops while 'n' points to an exception.
287 utext_setNativeIndex(fText.getAlias(), n); // from n..
288
289 //if(debug2) u_printf(" n@ %d\n", n);
290 // Assume a space is following the '.' (so we handle the case: "Mr. /Brown")
291 if(utext_previous32(fText.getAlias())==u' ') { // TODO: skip a class of chars here??
292 // TODO only do this the 1st time?
293 //if(debug2) u_printf("skipping prev: |%C| \n", (UChar)uch);
294 } else {
295 //if(debug2) u_printf("not skipping prev: |%C| \n", (UChar)uch);
296 utext_next32(fText.getAlias());
297 //if(debug2) u_printf(" -> : |%C| \n", (UChar)uch);
298 }
299
300 {
301 // Do not modify the shared trie!
302 UCharsTrie iter(fData->getBackwardsTrie());
303 UChar32 uch;
304 while((uch=utext_previous32(fText.getAlias()))!=U_SENTINEL) { // more to consume backwards
305 UStringTrieResult r = iter.nextForCodePoint(uch);
306 if(USTRINGTRIE_HAS_VALUE(r)) { // remember the best match so far
307 bestPosn = utext_getNativeIndex(fText.getAlias());
308 bestValue = iter.getValue();
309 }
310 if(!USTRINGTRIE_HAS_NEXT(r)) {
311 break;
312 }
313 //if(debug2) u_printf("rev< /%C/ cont?%d @%d\n", (UChar)uch, r, utext_getNativeIndex(fText.getAlias()));
314 }
315 }
316
317 //if(bestValue >= 0) {
318 //if(debug2) u_printf("rev<+/%C/+end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue);
319 //}
320
321 if(bestPosn>=0) {
322 //if(debug2) u_printf("rev< /%C/ end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue);
323
324 //if(USTRINGTRIE_MATCHES(r)) { // matched - so, now what?
325 //int32_t bestValue = iter.getValue();
326 ////if(debug2) u_printf("rev< /%C/ matched, skip..%d bestValue=%d\n", (UChar)uch, r, bestValue);
327
328 if(bestValue == kMATCH) { // exact match!
329 //if(debug2) u_printf(" exact backward match\n");
330 return kExceptionHere; // See if the next is another exception.
331 } else if(bestValue == kPARTIAL
332 && fData->hasForwardsPartialTrie()) { // make sure there's a forward trie
333 //if(debug2) u_printf(" partial backward match\n");
334 // We matched the "Ph." in "Ph.D." - now we need to run everything through the forwards trie
335 // to see if it matches something going forward.
336 UStringTrieResult rfwd = USTRINGTRIE_INTERMEDIATE_VALUE;
337 utext_setNativeIndex(fText.getAlias(), bestPosn); // hope that's close ..
338 //if(debug2) u_printf("Retrying at %d\n", bestPosn);
339 // Do not modify the shared trie!
340 UCharsTrie iter(fData->getForwardsPartialTrie());
341 UChar32 uch;
342 while((uch=utext_next32(fText.getAlias()))!=U_SENTINEL &&
343 USTRINGTRIE_HAS_NEXT(rfwd=iter.nextForCodePoint(uch))) {
344 //if(debug2) u_printf("fwd> /%C/ cont?%d @%d\n", (UChar)uch, rfwd, utext_getNativeIndex(fText.getAlias()));
345 }
346 if(USTRINGTRIE_MATCHES(rfwd)) {
347 //if(debug2) u_printf("fwd> /%C/ == forward match!\n", (UChar)uch);
348 // only full matches here, nothing to check
349 // skip the next:
350 return kExceptionHere;
351 } else {
352 //if(debug2) u_printf("fwd> /%C/ no match.\n", (UChar)uch);
353 // no match (no exception) -return the 'underlying' break
354 return kNoExceptionHere;
355 }
356 } else {
357 return kNoExceptionHere; // internal error and/or no forwards trie
358 }
359 } else {
360 //if(debug2) u_printf("rev< /%C/ .. no match..%d\n", (UChar)uch, r); // no best match
361 return kNoExceptionHere; // No match - so exit. Not an exception.
362 }
363 }
364
365 // the workhorse single next.
366 int32_t
internalNext(int32_t n)367 SimpleFilteredSentenceBreakIterator::internalNext(int32_t n) {
368 if(n == UBRK_DONE || // at end or
369 !fData->hasBackwardsTrie()) { // .. no backwards table loaded == no exceptions
370 return n;
371 }
372 // OK, do we need to break here?
373 UErrorCode status = U_ZERO_ERROR;
374 // refresh text
375 resetState(status);
376 if(U_FAILURE(status)) return UBRK_DONE; // bail out
377 int64_t utextLen = utext_nativeLength(fText.getAlias());
378
379 //if(debug2) u_printf("str, native len=%d\n", utext_nativeLength(fText.getAlias()));
380 while (n != UBRK_DONE && n != utextLen) { // outer loop runs once per underlying break (from fDelegate).
381 SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(n);
382
383 switch(m) {
384 case kExceptionHere:
385 n = fDelegate->next(); // skip this one. Find the next lowerlevel break.
386 continue;
387
388 default:
389 case kNoExceptionHere:
390 return n;
391 }
392 }
393 return n;
394 }
395
396 int32_t
internalPrev(int32_t n)397 SimpleFilteredSentenceBreakIterator::internalPrev(int32_t n) {
398 if(n == 0 || n == UBRK_DONE || // at end or
399 !fData->hasBackwardsTrie()) { // .. no backwards table loaded == no exceptions
400 return n;
401 }
402 // OK, do we need to break here?
403 UErrorCode status = U_ZERO_ERROR;
404 // refresh text
405 resetState(status);
406 if(U_FAILURE(status)) return UBRK_DONE; // bail out
407
408 //if(debug2) u_printf("str, native len=%d\n", utext_nativeLength(fText.getAlias()));
409 while (n != UBRK_DONE && n != 0) { // outer loop runs once per underlying break (from fDelegate).
410 SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(n);
411
412 switch(m) {
413 case kExceptionHere:
414 n = fDelegate->previous(); // skip this one. Find the next lowerlevel break.
415 continue;
416
417 default:
418 case kNoExceptionHere:
419 return n;
420 }
421 }
422 return n;
423 }
424
425
426 int32_t
next()427 SimpleFilteredSentenceBreakIterator::next() {
428 return internalNext(fDelegate->next());
429 }
430
431 int32_t
first(void)432 SimpleFilteredSentenceBreakIterator::first(void) {
433 // Don't suppress a break opportunity at the beginning of text.
434 return fDelegate->first();
435 }
436
437 int32_t
preceding(int32_t offset)438 SimpleFilteredSentenceBreakIterator::preceding(int32_t offset) {
439 return internalPrev(fDelegate->preceding(offset));
440 }
441
442 int32_t
previous(void)443 SimpleFilteredSentenceBreakIterator::previous(void) {
444 return internalPrev(fDelegate->previous());
445 }
446
isBoundary(int32_t offset)447 UBool SimpleFilteredSentenceBreakIterator::isBoundary(int32_t offset) {
448 if (!fDelegate->isBoundary(offset)) return false; // no break to suppress
449
450 if (!fData->hasBackwardsTrie()) return true; // no data = no suppressions
451
452 UErrorCode status = U_ZERO_ERROR;
453 resetState(status);
454
455 SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(offset);
456
457 switch(m) {
458 case kExceptionHere:
459 return false;
460 default:
461 case kNoExceptionHere:
462 return true;
463 }
464 }
465
466 int32_t
next(int32_t offset)467 SimpleFilteredSentenceBreakIterator::next(int32_t offset) {
468 return internalNext(fDelegate->next(offset));
469 }
470
471 int32_t
following(int32_t offset)472 SimpleFilteredSentenceBreakIterator::following(int32_t offset) {
473 return internalNext(fDelegate->following(offset));
474 }
475
476 int32_t
last(void)477 SimpleFilteredSentenceBreakIterator::last(void) {
478 // Don't suppress a break opportunity at the end of text.
479 return fDelegate->last();
480 }
481
482
483 /**
484 * Concrete implementation of builder class.
485 */
486 class U_COMMON_API SimpleFilteredBreakIteratorBuilder : public FilteredBreakIteratorBuilder {
487 public:
488 virtual ~SimpleFilteredBreakIteratorBuilder();
489 SimpleFilteredBreakIteratorBuilder(const Locale &fromLocale, UErrorCode &status);
490 SimpleFilteredBreakIteratorBuilder(UErrorCode &status);
491 virtual UBool suppressBreakAfter(const UnicodeString& exception, UErrorCode& status);
492 virtual UBool unsuppressBreakAfter(const UnicodeString& exception, UErrorCode& status);
493 virtual BreakIterator *build(BreakIterator* adoptBreakIterator, UErrorCode& status);
494 private:
495 UStringSet fSet;
496 };
497
~SimpleFilteredBreakIteratorBuilder()498 SimpleFilteredBreakIteratorBuilder::~SimpleFilteredBreakIteratorBuilder()
499 {
500 }
501
SimpleFilteredBreakIteratorBuilder(UErrorCode & status)502 SimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder(UErrorCode &status)
503 : fSet(status)
504 {
505 }
506
SimpleFilteredBreakIteratorBuilder(const Locale & fromLocale,UErrorCode & status)507 SimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder(const Locale &fromLocale, UErrorCode &status)
508 : fSet(status)
509 {
510 if(U_SUCCESS(status)) {
511 UErrorCode subStatus = U_ZERO_ERROR;
512 LocalUResourceBundlePointer b(ures_open(U_ICUDATA_BRKITR, fromLocale.getBaseName(), &subStatus));
513 if (U_FAILURE(subStatus) || (subStatus == U_USING_DEFAULT_WARNING) ) {
514 status = subStatus; // copy the failing status
515 #if FB_DEBUG
516 fprintf(stderr, "open BUNDLE %s : %s, %s\n", fromLocale.getBaseName(), "[exit]", u_errorName(status));
517 #endif
518 return; // leaves the builder empty, if you try to use it.
519 }
520 LocalUResourceBundlePointer exceptions(ures_getByKeyWithFallback(b.getAlias(), "exceptions", NULL, &subStatus));
521 if (U_FAILURE(subStatus) || (subStatus == U_USING_DEFAULT_WARNING) ) {
522 status = subStatus; // copy the failing status
523 #if FB_DEBUG
524 fprintf(stderr, "open EXCEPTIONS %s : %s, %s\n", fromLocale.getBaseName(), "[exit]", u_errorName(status));
525 #endif
526 return; // leaves the builder empty, if you try to use it.
527 }
528 LocalUResourceBundlePointer breaks(ures_getByKeyWithFallback(exceptions.getAlias(), "SentenceBreak", NULL, &subStatus));
529
530 #if FB_DEBUG
531 {
532 UErrorCode subsub = subStatus;
533 fprintf(stderr, "open SentenceBreak %s => %s, %s\n", fromLocale.getBaseName(), ures_getLocale(breaks.getAlias(), &subsub), u_errorName(subStatus));
534 }
535 #endif
536
537 if (U_FAILURE(subStatus) || (subStatus == U_USING_DEFAULT_WARNING) ) {
538 status = subStatus; // copy the failing status
539 #if FB_DEBUG
540 fprintf(stderr, "open %s : %s, %s\n", fromLocale.getBaseName(), "[exit]", u_errorName(status));
541 #endif
542 return; // leaves the builder empty, if you try to use it.
543 }
544
545 LocalUResourceBundlePointer strs;
546 subStatus = status; // Pick up inherited warning status now
547 do {
548 strs.adoptInstead(ures_getNextResource(breaks.getAlias(), strs.orphan(), &subStatus));
549 if(strs.isValid() && U_SUCCESS(subStatus)) {
550 UnicodeString str(ures_getUnicodeString(strs.getAlias(), &status));
551 suppressBreakAfter(str, status); // load the string
552 }
553 } while (strs.isValid() && U_SUCCESS(subStatus));
554 if(U_FAILURE(subStatus)&&subStatus!=U_INDEX_OUTOFBOUNDS_ERROR&&U_SUCCESS(status)) {
555 status = subStatus;
556 }
557 }
558 }
559
560 UBool
suppressBreakAfter(const UnicodeString & exception,UErrorCode & status)561 SimpleFilteredBreakIteratorBuilder::suppressBreakAfter(const UnicodeString& exception, UErrorCode& status)
562 {
563 UBool r = fSet.add(exception, status);
564 FB_TRACE("suppressBreakAfter",&exception,r,0);
565 return r;
566 }
567
568 UBool
unsuppressBreakAfter(const UnicodeString & exception,UErrorCode & status)569 SimpleFilteredBreakIteratorBuilder::unsuppressBreakAfter(const UnicodeString& exception, UErrorCode& status)
570 {
571 UBool r = fSet.remove(exception, status);
572 FB_TRACE("unsuppressBreakAfter",&exception,r,0);
573 return r;
574 }
575
576 /**
577 * Jitterbug 2974: MSVC has a bug whereby new X[0] behaves badly.
578 * Work around this.
579 *
580 * Note: "new UnicodeString[subCount]" ends up calling global operator new
581 * on MSVC2012 for some reason.
582 */
newUnicodeStringArray(size_t count)583 static inline UnicodeString* newUnicodeStringArray(size_t count) {
584 return new UnicodeString[count ? count : 1];
585 }
586
587 BreakIterator *
build(BreakIterator * adoptBreakIterator,UErrorCode & status)588 SimpleFilteredBreakIteratorBuilder::build(BreakIterator* adoptBreakIterator, UErrorCode& status) {
589 LocalPointer<BreakIterator> adopt(adoptBreakIterator);
590
591 LocalPointer<UCharsTrieBuilder> builder(new UCharsTrieBuilder(status), status);
592 LocalPointer<UCharsTrieBuilder> builder2(new UCharsTrieBuilder(status), status);
593 if(U_FAILURE(status)) {
594 return NULL;
595 }
596
597 int32_t revCount = 0;
598 int32_t fwdCount = 0;
599
600 int32_t subCount = fSet.size();
601
602 UnicodeString *ustrs_ptr = newUnicodeStringArray(subCount);
603
604 LocalArray<UnicodeString> ustrs(ustrs_ptr);
605
606 LocalMemory<int> partials;
607 partials.allocateInsteadAndReset(subCount);
608
609 LocalPointer<UCharsTrie> backwardsTrie; // i.e. ".srM" for Mrs.
610 LocalPointer<UCharsTrie> forwardsPartialTrie; // Has ".a" for "a.M."
611
612 int n=0;
613 for ( int32_t i = 0;
614 i<fSet.size();
615 i++) {
616 const UnicodeString *abbr = fSet.getStringAt(i);
617 if(abbr) {
618 FB_TRACE("build",abbr,TRUE,i);
619 ustrs[n] = *abbr; // copy by value
620 FB_TRACE("ustrs[n]",&ustrs[n],TRUE,i);
621 } else {
622 FB_TRACE("build",abbr,FALSE,i);
623 status = U_MEMORY_ALLOCATION_ERROR;
624 return NULL;
625 }
626 partials[n] = 0; // default: not partial
627 n++;
628 }
629 // first pass - find partials.
630 for(int i=0;i<subCount;i++) {
631 int nn = ustrs[i].indexOf(kFULLSTOP); // TODO: non-'.' abbreviations
632 if(nn>-1 && (nn+1)!=ustrs[i].length()) {
633 FB_TRACE("partial",&ustrs[i],FALSE,i);
634 // is partial.
635 // is it unique?
636 int sameAs = -1;
637 for(int j=0;j<subCount;j++) {
638 if(j==i) continue;
639 if(ustrs[i].compare(0,nn+1,ustrs[j],0,nn+1)==0) {
640 FB_TRACE("prefix",&ustrs[j],FALSE,nn+1);
641 //UBool otherIsPartial = ((nn+1)!=ustrs[j].length()); // true if ustrs[j] doesn't end at nn
642 if(partials[j]==0) { // hasn't been processed yet
643 partials[j] = kSuppressInReverse | kAddToForward;
644 FB_TRACE("suppressing",&ustrs[j],FALSE,j);
645 } else if(partials[j] & kSuppressInReverse) {
646 sameAs = j; // the other entry is already in the reverse table.
647 }
648 }
649 }
650 FB_TRACE("for partial same-",&ustrs[i],FALSE,sameAs);
651 FB_TRACE(" == partial #",&ustrs[i],FALSE,partials[i]);
652 UnicodeString prefix(ustrs[i], 0, nn+1);
653 if(sameAs == -1 && partials[i] == 0) {
654 // first one - add the prefix to the reverse table.
655 prefix.reverse();
656 builder->add(prefix, kPARTIAL, status);
657 revCount++;
658 FB_TRACE("Added partial",&prefix,FALSE, i);
659 FB_TRACE(u_errorName(status),&ustrs[i],FALSE,i);
660 partials[i] = kSuppressInReverse | kAddToForward;
661 } else {
662 FB_TRACE("NOT adding partial",&prefix,FALSE, i);
663 FB_TRACE(u_errorName(status),&ustrs[i],FALSE,i);
664 }
665 }
666 }
667 for(int i=0;i<subCount;i++) {
668 if(partials[i]==0) {
669 ustrs[i].reverse();
670 builder->add(ustrs[i], kMATCH, status);
671 revCount++;
672 FB_TRACE(u_errorName(status), &ustrs[i], FALSE, i);
673 } else {
674 FB_TRACE("Adding fwd",&ustrs[i], FALSE, i);
675
676 // an optimization would be to only add the portion after the '.'
677 // for example, for "Ph.D." we store ".hP" in the reverse table. We could just store "D." in the forward,
678 // instead of "Ph.D." since we already know the "Ph." part is a match.
679 // would need the trie to be able to hold 0-length strings, though.
680 builder2->add(ustrs[i], kMATCH, status); // forward
681 fwdCount++;
682 //ustrs[i].reverse();
683 ////if(debug2) u_printf("SUPPRESS- not Added(%d): /%S/ status=%s\n",partials[i], ustrs[i].getTerminatedBuffer(), u_errorName(status));
684 }
685 }
686 FB_TRACE("AbbrCount",NULL,FALSE, subCount);
687
688 if(revCount>0) {
689 backwardsTrie.adoptInstead(builder->build(USTRINGTRIE_BUILD_FAST, status));
690 if(U_FAILURE(status)) {
691 FB_TRACE(u_errorName(status),NULL,FALSE, -1);
692 return NULL;
693 }
694 }
695
696 if(fwdCount>0) {
697 forwardsPartialTrie.adoptInstead(builder2->build(USTRINGTRIE_BUILD_FAST, status));
698 if(U_FAILURE(status)) {
699 FB_TRACE(u_errorName(status),NULL,FALSE, -1);
700 return NULL;
701 }
702 }
703
704 return new SimpleFilteredSentenceBreakIterator(adopt.orphan(), forwardsPartialTrie.orphan(), backwardsTrie.orphan(), status);
705 }
706
707
708 // ----------- Base class implementation
709
FilteredBreakIteratorBuilder()710 FilteredBreakIteratorBuilder::FilteredBreakIteratorBuilder() {
711 }
712
~FilteredBreakIteratorBuilder()713 FilteredBreakIteratorBuilder::~FilteredBreakIteratorBuilder() {
714 }
715
716 FilteredBreakIteratorBuilder *
createInstance(const Locale & where,UErrorCode & status)717 FilteredBreakIteratorBuilder::createInstance(const Locale& where, UErrorCode& status) {
718 if(U_FAILURE(status)) return NULL;
719 LocalPointer<FilteredBreakIteratorBuilder> ret(new SimpleFilteredBreakIteratorBuilder(where, status), status);
720 return (U_SUCCESS(status))? ret.orphan(): NULL;
721 }
722
723 FilteredBreakIteratorBuilder *
createInstance(UErrorCode & status)724 FilteredBreakIteratorBuilder::createInstance(UErrorCode &status) {
725 return createEmptyInstance(status);
726 }
727
728 FilteredBreakIteratorBuilder *
createEmptyInstance(UErrorCode & status)729 FilteredBreakIteratorBuilder::createEmptyInstance(UErrorCode& status) {
730 if(U_FAILURE(status)) return NULL;
731 LocalPointer<FilteredBreakIteratorBuilder> ret(new SimpleFilteredBreakIteratorBuilder(status), status);
732 return (U_SUCCESS(status))? ret.orphan(): NULL;
733 }
734
735 U_NAMESPACE_END
736
737 #endif //#if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_FILTERED_BREAK_ITERATION
738