1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 *   Copyright (C) 2008-2015, International Business Machines
6 *   Corporation and others.  All Rights Reserved.
7 **********************************************************************
8 *   Date        Name        Description
9 *   05/11/2008  Andy Heninger  Port from Java
10 **********************************************************************
11 */
12 
13 #include <utility>
14 
15 #include "unicode/utypes.h"
16 
17 #if  !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION
18 
19 #include "unicode/brkiter.h"
20 #include "unicode/localpointer.h"
21 #include "unicode/uchar.h"
22 #include "unicode/unifilt.h"
23 #include "unicode/uniset.h"
24 
25 #include "brktrans.h"
26 #include "cmemory.h"
27 #include "mutex.h"
28 #include "uprops.h"
29 #include "uinvchar.h"
30 #include "util.h"
31 #include "uvectr32.h"
32 
33 U_NAMESPACE_BEGIN
34 
35 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(BreakTransliterator)
36 
37 static const UChar SPACE       = 32;  // ' '
38 
39 
/**
 * Constructs a break transliterator with the ID "Any-BreakInternal" and the
 * given (adopted) filter. The string inserted at boundaries starts out as a
 * single space.
 */
BreakTransliterator(UnicodeFilter * adoptedFilter)44 BreakTransliterator::BreakTransliterator(UnicodeFilter* adoptedFilter) :
45         Transliterator(UNICODE_STRING("Any-BreakInternal", 17), adoptedFilter),
46         cachedBI(NULL), cachedBoundaries(NULL), fInsertion(SPACE) {
47     }
48 
49 
50 /**
51  * Destructor.
52  */
~BreakTransliterator()53 BreakTransliterator::~BreakTransliterator() {
54 }
55 
56 /**
57  * Copy constructor.
58  */
BreakTransliterator(const BreakTransliterator & o)59 BreakTransliterator::BreakTransliterator(const BreakTransliterator& o) :
60         Transliterator(o), cachedBI(NULL), cachedBoundaries(NULL), fInsertion(o.fInsertion) {
61 }
62 
63 
64 /**
65  * Transliterator API.
66  */
clone() const67 BreakTransliterator* BreakTransliterator::clone() const {
68     return new BreakTransliterator(*this);
69 }
70 
71 /**
72  * Implements {@link Transliterator#handleTransliterate}.
73  */
handleTransliterate(Replaceable & text,UTransPosition & offsets,UBool isIncremental) const74 void BreakTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
75                                                     UBool isIncremental ) const {
76 
77         UErrorCode status = U_ZERO_ERROR;
78         LocalPointer<BreakIterator> bi;
79         LocalPointer<UVector32> boundaries;
80 
81         {
82             Mutex m;
83             BreakTransliterator *nonConstThis = const_cast<BreakTransliterator *>(this);
84             boundaries = std::move(nonConstThis->cachedBoundaries);
85             bi = std::move(nonConstThis->cachedBI);
86         }
87         if (bi.isNull()) {
88             bi.adoptInstead(BreakIterator::createWordInstance(Locale::getEnglish(), status));
89         }
90         if (boundaries.isNull()) {
91             boundaries.adoptInstead(new UVector32(status));
92         }
93 
94         if (bi.isNull() || boundaries.isNull() || U_FAILURE(status)) {
95             return;
96         }
97 
98         boundaries->removeAllElements();
99         UnicodeString sText = replaceableAsString(text);
100         bi->setText(sText);
101         bi->preceding(offsets.start);
102 
103         // To make things much easier, we will stack the boundaries, and then insert at the end.
104         // generally, we won't need too many, since we will be filtered.
105 
106         int32_t boundary;
107         for(boundary = bi->next(); boundary != UBRK_DONE && boundary < offsets.limit; boundary = bi->next()) {
108             if (boundary == 0) continue;
109             // HACK: Check to see that preceeding item was a letter
110 
111             UChar32 cp = sText.char32At(boundary-1);
112             int type = u_charType(cp);
113             //System.out.println(Integer.toString(cp,16) + " (before): " + type);
114             if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue;
115 
116             cp = sText.char32At(boundary);
117             type = u_charType(cp);
118             //System.out.println(Integer.toString(cp,16) + " (after): " + type);
119             if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue;
120 
121             boundaries->addElement(boundary, status);
122             // printf("Boundary at %d\n", boundary);
123         }
124 
125         int delta = 0;
126         int lastBoundary = 0;
127 
128         if (boundaries->size() != 0) { // if we found something, adjust
129             delta = boundaries->size() * fInsertion.length();
130             lastBoundary = boundaries->lastElementi();
131 
132             // we do this from the end backwards, so that we don't have to keep updating.
133 
134             while (boundaries->size() > 0) {
135                 boundary = boundaries->popi();
136                 text.handleReplaceBetween(boundary, boundary, fInsertion);
137             }
138         }
139 
140         // Now fix up the return values
141         offsets.contextLimit += delta;
142         offsets.limit += delta;
143         offsets.start = isIncremental ? lastBoundary + delta : offsets.limit;
144 
145         // Return break iterator & boundaries vector to the cache.
146         {
147             Mutex m;
148             BreakTransliterator *nonConstThis = const_cast<BreakTransliterator *>(this);
149             if (nonConstThis->cachedBI.isNull()) {
150                 nonConstThis->cachedBI = std::move(bi);
151             }
152             if (nonConstThis->cachedBoundaries.isNull()) {
153                 nonConstThis->cachedBoundaries = std::move(boundaries);
154             }
155         }
156 
157         // TODO:  do something with U_FAILURE(status);
158         //        (need to look at transliterators overall, not just here.)
159 }
160 
161 //
162 //  getInsertion()
163 //
getInsertion() const164 const UnicodeString &BreakTransliterator::getInsertion() const {
165     return fInsertion;
166 }
167 
168 //
169 //  setInsertion()
170 //
setInsertion(const UnicodeString & insertion)171 void BreakTransliterator::setInsertion(const UnicodeString &insertion) {
172     this->fInsertion = insertion;
173 }
174 
175 //
176 //   replaceableAsString   Hack to let break iterators work
177 //                         on the replaceable text from transliterators.
178 //                         In practice, the only real Replaceable type that we
179 //                         will be seeing is UnicodeString, so this function
180 //                         will normally be efficient.
181 //
replaceableAsString(Replaceable & r)182 UnicodeString BreakTransliterator::replaceableAsString(Replaceable &r) {
183     UnicodeString s;
184     UnicodeString *rs = dynamic_cast<UnicodeString *>(&r);
185     if (rs != NULL) {
186         s = *rs;
187     } else {
188         r.extractBetween(0, r.length(), s);
189     }
190     return s;
191 }
192 
193 U_NAMESPACE_END
194 
#endif /* #if !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION */
196