1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4  ************************************************************************************
5  * Copyright (C) 2006-2016, International Business Machines Corporation
6  * and others. All Rights Reserved.
7  ************************************************************************************
8  */
9 
10 #include "unicode/utypes.h"
11 
12 #if !UCONFIG_NO_BREAK_ITERATION
13 
14 #include "unicode/uchar.h"
15 #include "unicode/uniset.h"
16 #include "unicode/chariter.h"
17 #include "unicode/ures.h"
18 #include "unicode/udata.h"
19 #include "unicode/putil.h"
20 #include "unicode/ustring.h"
21 #include "unicode/uscript.h"
22 #include "unicode/ucharstrie.h"
23 #include "unicode/bytestrie.h"
24 
25 #include "brkeng.h"
26 #include "cmemory.h"
27 #include "dictbe.h"
28 #include "charstr.h"
29 #include "dictionarydata.h"
30 #include "mutex.h"
31 #include "uvector.h"
32 #include "umutex.h"
33 #include "uresimp.h"
34 #include "ubrkimpl.h"
35 
36 U_NAMESPACE_BEGIN
37 
38 /*
39  ******************************************************************
40  */
41 
LanguageBreakEngine()42 LanguageBreakEngine::LanguageBreakEngine() {
43 }
44 
~LanguageBreakEngine()45 LanguageBreakEngine::~LanguageBreakEngine() {
46 }
47 
48 /*
49  ******************************************************************
50  */
51 
LanguageBreakFactory()52 LanguageBreakFactory::LanguageBreakFactory() {
53 }
54 
~LanguageBreakFactory()55 LanguageBreakFactory::~LanguageBreakFactory() {
56 }
57 
58 /*
59  ******************************************************************
60  */
61 
UnhandledEngine(UErrorCode & status)62 UnhandledEngine::UnhandledEngine(UErrorCode &status) : fHandled(nullptr) {
63     (void)status;
64 }
65 
~UnhandledEngine()66 UnhandledEngine::~UnhandledEngine() {
67     delete fHandled;
68     fHandled = nullptr;
69 }
70 
71 UBool
handles(UChar32 c) const72 UnhandledEngine::handles(UChar32 c) const {
73     return fHandled && fHandled->contains(c);
74 }
75 
76 int32_t
findBreaks(UText * text,int32_t,int32_t endPos,UVector32 &) const77 UnhandledEngine::findBreaks( UText *text,
78                              int32_t /* startPos */,
79                              int32_t endPos,
80                              UVector32 &/*foundBreaks*/ ) const {
81     UChar32 c = utext_current32(text);
82     while((int32_t)utext_getNativeIndex(text) < endPos && fHandled->contains(c)) {
83         utext_next32(text);            // TODO:  recast loop to work with post-increment operations.
84         c = utext_current32(text);
85     }
86     return 0;
87 }
88 
89 void
handleCharacter(UChar32 c)90 UnhandledEngine::handleCharacter(UChar32 c) {
91     if (fHandled == nullptr) {
92         fHandled = new UnicodeSet();
93         if (fHandled == nullptr) {
94             return;
95         }
96     }
97     if (!fHandled->contains(c)) {
98         UErrorCode status = U_ZERO_ERROR;
99         // Apply the entire script of the character.
100         int32_t script = u_getIntPropertyValue(c, UCHAR_SCRIPT);
101         fHandled->applyIntPropertyValue(UCHAR_SCRIPT, script, status);
102     }
103 }
104 
105 /*
106  ******************************************************************
107  */
108 
ICULanguageBreakFactory(UErrorCode &)109 ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &/*status*/) {
110     fEngines = 0;
111 }
112 
~ICULanguageBreakFactory()113 ICULanguageBreakFactory::~ICULanguageBreakFactory() {
114     if (fEngines != 0) {
115         delete fEngines;
116     }
117 }
118 
119 U_NAMESPACE_END
120 U_CDECL_BEGIN
_deleteEngine(void * obj)121 static void U_CALLCONV _deleteEngine(void *obj) {
122     delete (const icu::LanguageBreakEngine *) obj;
123 }
124 U_CDECL_END
125 U_NAMESPACE_BEGIN
126 
127 const LanguageBreakEngine *
getEngineFor(UChar32 c)128 ICULanguageBreakFactory::getEngineFor(UChar32 c) {
129     const LanguageBreakEngine *lbe = NULL;
130     UErrorCode  status = U_ZERO_ERROR;
131 
132     static UMutex gBreakEngineMutex;
133     Mutex m(&gBreakEngineMutex);
134 
135     if (fEngines == NULL) {
136         UStack  *engines = new UStack(_deleteEngine, NULL, status);
137         if (U_FAILURE(status) || engines == NULL) {
138             // Note: no way to return error code to caller.
139             delete engines;
140             return NULL;
141         }
142         fEngines = engines;
143     } else {
144         int32_t i = fEngines->size();
145         while (--i >= 0) {
146             lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i));
147             if (lbe != NULL && lbe->handles(c)) {
148                 return lbe;
149             }
150         }
151     }
152 
153     // We didn't find an engine. Create one.
154     lbe = loadEngineFor(c);
155     if (lbe != NULL) {
156         fEngines->push((void *)lbe, status);
157     }
158     return lbe;
159 }
160 
161 const LanguageBreakEngine *
loadEngineFor(UChar32 c)162 ICULanguageBreakFactory::loadEngineFor(UChar32 c) {
163     UErrorCode status = U_ZERO_ERROR;
164     UScriptCode code = uscript_getScript(c, &status);
165     if (U_SUCCESS(status)) {
166         DictionaryMatcher *m = loadDictionaryMatcherFor(code);
167         if (m != NULL) {
168             const LanguageBreakEngine *engine = NULL;
169             switch(code) {
170             case USCRIPT_THAI:
171                 engine = new ThaiBreakEngine(m, status);
172                 break;
173             case USCRIPT_LAO:
174                 engine = new LaoBreakEngine(m, status);
175                 break;
176             case USCRIPT_MYANMAR:
177                 engine = new BurmeseBreakEngine(m, status);
178                 break;
179             case USCRIPT_KHMER:
180                 engine = new KhmerBreakEngine(m, status);
181                 break;
182 
183 #if !UCONFIG_NO_NORMALIZATION
184                 // CJK not available w/o normalization
185             case USCRIPT_HANGUL:
186                 engine = new CjkBreakEngine(m, kKorean, status);
187                 break;
188 
189             // use same BreakEngine and dictionary for both Chinese and Japanese
190             case USCRIPT_HIRAGANA:
191             case USCRIPT_KATAKANA:
192             case USCRIPT_HAN:
193                 engine = new CjkBreakEngine(m, kChineseJapanese, status);
194                 break;
195 #if 0
196             // TODO: Have to get some characters with script=common handled
197             // by CjkBreakEngine (e.g. U+309B). Simply subjecting
198             // them to CjkBreakEngine does not work. The engine has to
199             // special-case them.
200             case USCRIPT_COMMON:
201             {
202                 UBlockCode block = ublock_getCode(code);
203                 if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA)
204                    engine = new CjkBreakEngine(dict, kChineseJapanese, status);
205                 break;
206             }
207 #endif
208 #endif
209 
210             default:
211                 break;
212             }
213             if (engine == NULL) {
214                 delete m;
215             }
216             else if (U_FAILURE(status)) {
217                 delete engine;
218                 engine = NULL;
219             }
220             return engine;
221         }
222     }
223     return NULL;
224 }
225 
226 DictionaryMatcher *
loadDictionaryMatcherFor(UScriptCode script)227 ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script) {
228     UErrorCode status = U_ZERO_ERROR;
229     // open root from brkitr tree.
230     UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, "", &status);
231     b = ures_getByKeyWithFallback(b, "dictionaries", b, &status);
232     int32_t dictnlength = 0;
233     const UChar *dictfname =
234         ures_getStringByKeyWithFallback(b, uscript_getShortName(script), &dictnlength, &status);
235     if (U_FAILURE(status)) {
236         ures_close(b);
237         return NULL;
238     }
239     CharString dictnbuf;
240     CharString ext;
241     const UChar *extStart = u_memrchr(dictfname, 0x002e, dictnlength);  // last dot
242     if (extStart != NULL) {
243         int32_t len = (int32_t)(extStart - dictfname);
244         ext.appendInvariantChars(UnicodeString(FALSE, extStart + 1, dictnlength - len - 1), status);
245         dictnlength = len;
246     }
247     dictnbuf.appendInvariantChars(UnicodeString(FALSE, dictfname, dictnlength), status);
248     ures_close(b);
249 
250     UDataMemory *file = udata_open(U_ICUDATA_BRKITR, ext.data(), dictnbuf.data(), &status);
251     if (U_SUCCESS(status)) {
252         // build trie
253         const uint8_t *data = (const uint8_t *)udata_getMemory(file);
254         const int32_t *indexes = (const int32_t *)data;
255         const int32_t offset = indexes[DictionaryData::IX_STRING_TRIE_OFFSET];
256         const int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
257         DictionaryMatcher *m = NULL;
258         if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
259             const int32_t transform = indexes[DictionaryData::IX_TRANSFORM];
260             const char *characters = (const char *)(data + offset);
261             m = new BytesDictionaryMatcher(characters, transform, file);
262         }
263         else if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
264             const UChar *characters = (const UChar *)(data + offset);
265             m = new UCharsDictionaryMatcher(characters, file);
266         }
267         if (m == NULL) {
268             // no matcher exists to take ownership - either we are an invalid
269             // type or memory allocation failed
270             udata_close(file);
271         }
272         return m;
273     } else if (dictfname != NULL) {
274         // we don't have a dictionary matcher.
275         // returning NULL here will cause us to fail to find a dictionary break engine, as expected
276         status = U_ZERO_ERROR;
277         return NULL;
278     }
279     return NULL;
280 }
281 
282 U_NAMESPACE_END
283 
284 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
285