1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /**
4  *******************************************************************************
5  * Copyright (C) 2006-2014, International Business Machines Corporation   *
6  * and others. All Rights Reserved.                                            *
7  *******************************************************************************
8  */
9 
10 #ifndef DICTBE_H
11 #define DICTBE_H
12 
13 #include "unicode/utypes.h"
14 #include "unicode/uniset.h"
15 #include "unicode/utext.h"
16 
17 #include "brkeng.h"
18 
19 U_NAMESPACE_BEGIN
20 
21 class DictionaryMatcher;
22 class Normalizer2;
23 
24 /*******************************************************************
25  * DictionaryBreakEngine
26  */
27 
28 /**
29  * <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a
30  * dictionary to determine language-specific breaks.</p>
31  *
32  * <p>After it is constructed a DictionaryBreakEngine may be shared between
33  * threads without synchronization.</p>
34  */
35 class DictionaryBreakEngine : public LanguageBreakEngine {
36  private:
37     /**
38      * The set of characters handled by this engine
39      * @internal
40      */
41 
42   UnicodeSet    fSet;
43 
44     /**
45      * The set of break types handled by this engine
46      * @internal
47      */
48 
49   uint32_t      fTypes;
50 
51   /**
52    * <p>Default constructor.</p>
53    *
54    */
55   DictionaryBreakEngine();
56 
57  public:
58 
59   /**
60    * <p>Constructor setting the break types handled.</p>
61    *
62    * @param breakTypes A bitmap of types handled by the engine.
63    */
64   DictionaryBreakEngine( uint32_t breakTypes );
65 
66   /**
67    * <p>Virtual destructor.</p>
68    */
69   virtual ~DictionaryBreakEngine();
70 
71   /**
72    * <p>Indicate whether this engine handles a particular character for
73    * a particular kind of break.</p>
74    *
75    * @param c A character which begins a run that the engine might handle
76    * @param breakType The type of text break which the caller wants to determine
77    * @return TRUE if this engine handles the particular character and break
78    * type.
79    */
80   virtual UBool handles( UChar32 c, int32_t breakType ) const;
81 
82   /**
83    * <p>Find any breaks within a run in the supplied text.</p>
84    *
85    * @param text A UText representing the text. The iterator is left at
86    * the end of the run of characters which the engine is capable of handling
87    * that starts from the first (or last) character in the range.
88    * @param startPos The start of the run within the supplied text.
89    * @param endPos The end of the run within the supplied text.
90    * @param reverse Whether the caller is looking for breaks in a reverse
91    * direction.
92    * @param breakType The type of break desired, or -1.
93    * @param foundBreaks An allocated C array of the breaks found, if any
94    * @return The number of breaks found.
95    */
96   virtual int32_t findBreaks( UText *text,
97                               int32_t startPos,
98                               int32_t endPos,
99                               UBool reverse,
100                               int32_t breakType,
101                               UStack &foundBreaks ) const;
102 
103  protected:
104 
105  /**
106   * <p>Set the character set handled by this engine.</p>
107   *
108   * @param set A UnicodeSet of the set of characters handled by the engine
109   */
110   virtual void setCharacters( const UnicodeSet &set );
111 
112  /**
113   * <p>Set the break types handled by this engine.</p>
114   *
115   * @param breakTypes A bitmap of types handled by the engine.
116   */
117 //  virtual void setBreakTypes( uint32_t breakTypes );
118 
119  /**
120   * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
121   *
122   * @param text A UText representing the text
123   * @param rangeStart The start of the range of dictionary characters
124   * @param rangeEnd The end of the range of dictionary characters
125   * @param foundBreaks Output of C array of int32_t break positions, or 0
126   * @return The number of breaks found
127   */
128   virtual int32_t divideUpDictionaryRange( UText *text,
129                                            int32_t rangeStart,
130                                            int32_t rangeEnd,
131                                            UStack &foundBreaks ) const = 0;
132 
133 };
134 
135 /*******************************************************************
136  * ThaiBreakEngine
137  */
138 
139 /**
140  * <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a
141  * dictionary and heuristics to determine Thai-specific breaks.</p>
142  *
143  * <p>After it is constructed a ThaiBreakEngine may be shared between
144  * threads without synchronization.</p>
145  */
146 class ThaiBreakEngine : public DictionaryBreakEngine {
147  private:
148     /**
149      * The set of characters handled by this engine
150      * @internal
151      */
152 
153   UnicodeSet                fThaiWordSet;
154   UnicodeSet                fEndWordSet;
155   UnicodeSet                fBeginWordSet;
156   UnicodeSet                fSuffixSet;
157   UnicodeSet                fMarkSet;
158   DictionaryMatcher  *fDictionary;
159 
160  public:
161 
162   /**
163    * <p>Default constructor.</p>
164    *
165    * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
166    * engine is deleted.
167    */
168   ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
169 
170   /**
171    * <p>Virtual destructor.</p>
172    */
173   virtual ~ThaiBreakEngine();
174 
175  protected:
176  /**
177   * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
178   *
179   * @param text A UText representing the text
180   * @param rangeStart The start of the range of dictionary characters
181   * @param rangeEnd The end of the range of dictionary characters
182   * @param foundBreaks Output of C array of int32_t break positions, or 0
183   * @return The number of breaks found
184   */
185   virtual int32_t divideUpDictionaryRange( UText *text,
186                                            int32_t rangeStart,
187                                            int32_t rangeEnd,
188                                            UStack &foundBreaks ) const;
189 
190 };
191 
192 /*******************************************************************
193  * LaoBreakEngine
194  */
195 
196 /**
197  * <p>LaoBreakEngine is a kind of DictionaryBreakEngine that uses a
198  * dictionary and heuristics to determine Lao-specific breaks.</p>
199  *
200  * <p>After it is constructed a LaoBreakEngine may be shared between
201  * threads without synchronization.</p>
202  */
203 class LaoBreakEngine : public DictionaryBreakEngine {
204  private:
205     /**
206      * The set of characters handled by this engine
207      * @internal
208      */
209 
210   UnicodeSet                fLaoWordSet;
211   UnicodeSet                fEndWordSet;
212   UnicodeSet                fBeginWordSet;
213   UnicodeSet                fMarkSet;
214   DictionaryMatcher  *fDictionary;
215 
216  public:
217 
218   /**
219    * <p>Default constructor.</p>
220    *
221    * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
222    * engine is deleted.
223    */
224   LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
225 
226   /**
227    * <p>Virtual destructor.</p>
228    */
229   virtual ~LaoBreakEngine();
230 
231  protected:
232  /**
233   * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
234   *
235   * @param text A UText representing the text
236   * @param rangeStart The start of the range of dictionary characters
237   * @param rangeEnd The end of the range of dictionary characters
238   * @param foundBreaks Output of C array of int32_t break positions, or 0
239   * @return The number of breaks found
240   */
241   virtual int32_t divideUpDictionaryRange( UText *text,
242                                            int32_t rangeStart,
243                                            int32_t rangeEnd,
244                                            UStack &foundBreaks ) const;
245 
246 };
247 
248 /*******************************************************************
249  * BurmeseBreakEngine
250  */
251 
252 /**
253  * <p>BurmeseBreakEngine is a kind of DictionaryBreakEngine that uses a
254  * DictionaryMatcher and heuristics to determine Burmese-specific breaks.</p>
255  *
256  * <p>After it is constructed a BurmeseBreakEngine may be shared between
257  * threads without synchronization.</p>
258  */
259 class BurmeseBreakEngine : public DictionaryBreakEngine {
260  private:
261     /**
262      * The set of characters handled by this engine
263      * @internal
264      */
265 
266   UnicodeSet                fBurmeseWordSet;
267   UnicodeSet                fEndWordSet;
268   UnicodeSet                fBeginWordSet;
269   UnicodeSet                fMarkSet;
270   DictionaryMatcher  *fDictionary;
271 
272  public:
273 
274   /**
275    * <p>Default constructor.</p>
276    *
277    * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
278    * engine is deleted.
279    */
280   BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
281 
282   /**
283    * <p>Virtual destructor.</p>
284    */
285   virtual ~BurmeseBreakEngine();
286 
287  protected:
288  /**
289   * <p>Divide up a range of known dictionary characters.</p>
290   *
291   * @param text A UText representing the text
292   * @param rangeStart The start of the range of dictionary characters
293   * @param rangeEnd The end of the range of dictionary characters
294   * @param foundBreaks Output of C array of int32_t break positions, or 0
295   * @return The number of breaks found
296   */
297   virtual int32_t divideUpDictionaryRange( UText *text,
298                                            int32_t rangeStart,
299                                            int32_t rangeEnd,
300                                            UStack &foundBreaks ) const;
301 
302 };
303 
304 /*******************************************************************
305  * KhmerBreakEngine
306  */
307 
308 /**
309  * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a
310  * DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p>
311  *
312  * <p>After it is constructed a KhmerBreakEngine may be shared between
313  * threads without synchronization.</p>
314  */
315 class KhmerBreakEngine : public DictionaryBreakEngine {
316  private:
317     /**
318      * The set of characters handled by this engine
319      * @internal
320      */
321 
322   UnicodeSet                fKhmerWordSet;
323   UnicodeSet                fEndWordSet;
324   UnicodeSet                fBeginWordSet;
325   UnicodeSet                fMarkSet;
326   DictionaryMatcher  *fDictionary;
327 
328  public:
329 
330   /**
331    * <p>Default constructor.</p>
332    *
333    * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
334    * engine is deleted.
335    */
336   KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
337 
338   /**
339    * <p>Virtual destructor.</p>
340    */
341   virtual ~KhmerBreakEngine();
342 
343  protected:
344  /**
345   * <p>Divide up a range of known dictionary characters.</p>
346   *
347   * @param text A UText representing the text
348   * @param rangeStart The start of the range of dictionary characters
349   * @param rangeEnd The end of the range of dictionary characters
350   * @param foundBreaks Output of C array of int32_t break positions, or 0
351   * @return The number of breaks found
352   */
353   virtual int32_t divideUpDictionaryRange( UText *text,
354                                            int32_t rangeStart,
355                                            int32_t rangeEnd,
356                                            UStack &foundBreaks ) const;
357 
358 };
359 
360 #if !UCONFIG_NO_NORMALIZATION
361 
362 /*******************************************************************
363  * CjkBreakEngine
364  */
365 
/**
 * Indicates the language/script that a CjkBreakEngine will handle.
 * The numeric values are the ones the compiler would assign implicitly.
 */
enum LanguageType {
    kKorean          = 0,
    kChineseJapanese = 1
};
371 
372 /**
373  * <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a
374  * dictionary with costs associated with each word and
375  * Viterbi decoding to determine CJK-specific breaks.</p>
376  */
377 class CjkBreakEngine : public DictionaryBreakEngine {
378  protected:
379     /**
380      * The set of characters handled by this engine
381      * @internal
382      */
383   UnicodeSet                fHangulWordSet;
384   UnicodeSet                fHanWordSet;
385   UnicodeSet                fKatakanaWordSet;
386   UnicodeSet                fHiraganaWordSet;
387 
388   DictionaryMatcher        *fDictionary;
389   const Normalizer2        *nfkcNorm2;
390 
391  public:
392 
393     /**
394      * <p>Default constructor.</p>
395      *
396      * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
397      * engine is deleted. The DictionaryMatcher must contain costs for each word
398      * in order for the dictionary to work properly.
399      */
400   CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status);
401 
402     /**
403      * <p>Virtual destructor.</p>
404      */
405   virtual ~CjkBreakEngine();
406 
407  protected:
408     /**
409      * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
410      *
411      * @param text A UText representing the text
412      * @param rangeStart The start of the range of dictionary characters
413      * @param rangeEnd The end of the range of dictionary characters
414      * @param foundBreaks Output of C array of int32_t break positions, or 0
415      * @return The number of breaks found
416      */
417   virtual int32_t divideUpDictionaryRange( UText *text,
418           int32_t rangeStart,
419           int32_t rangeEnd,
420           UStack &foundBreaks ) const;
421 
422 };
423 
424 #endif
425 
426 U_NAMESPACE_END
427 
428     /* DICTBE_H */
429 #endif
430