1 /**
2  *******************************************************************************
3  * Copyright (C) 2006-2014, International Business Machines Corporation   *
4  * and others. All Rights Reserved.                                            *
5  *******************************************************************************
6  */
7 
8 #ifndef DICTBE_H
9 #define DICTBE_H
10 
11 #include "unicode/utypes.h"
12 #include "unicode/uniset.h"
13 #include "unicode/utext.h"
14 
15 #include "brkeng.h"
16 
17 U_NAMESPACE_BEGIN
18 
19 class DictionaryMatcher;
20 class Normalizer2;
21 
22 /*******************************************************************
23  * DictionaryBreakEngine
24  */
25 
26 /**
27  * <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a
28  * dictionary to determine language-specific breaks.</p>
29  *
30  * <p>After it is constructed a DictionaryBreakEngine may be shared between
31  * threads without synchronization.</p>
32  */
33 class DictionaryBreakEngine : public LanguageBreakEngine {
34  private:
35     /**
36      * The set of characters handled by this engine (see setCharacters()).
37      * @internal
38      */
39 
40   UnicodeSet    fSet;
41 
42     /**
43      * The break types handled by this engine; the public constructor documents its breakTypes argument as a bitmap of such types.
44      * @internal
45      */
46 
47   uint32_t      fTypes;
48 
49   /**
50    * <p>Default constructor. Declared in the private section, so it is not callable by derived classes or outside code.</p>
51    * NOTE(review): presumably declared only to rule out default construction - confirm against the implementation.
52    */
53   DictionaryBreakEngine();
54 
55  public:
56 
57   /**
58    * <p>Constructor setting the break types handled.</p>
59    *
60    * @param breakTypes A bitmap of types handled by the engine.
61    */
62   DictionaryBreakEngine( uint32_t breakTypes );
63 
64   /**
65    * <p>Virtual destructor.</p>
66    */
67   virtual ~DictionaryBreakEngine();
68 
69   /**
70    * <p>Indicate whether this engine handles a particular character for
71    * a particular kind of break.</p>
72    * NOTE(review): breakType is a single int32_t value whereas the constructor takes a bitmap of types; how the two are related is not visible here - confirm in the implementation.
73    * @param c A character which begins a run that the engine might handle
74    * @param breakType The type of text break which the caller wants to determine
75    * @return TRUE if this engine handles the particular character and break
76    * type.
77    */
78   virtual UBool handles( UChar32 c, int32_t breakType ) const;
79 
80   /**
81    * <p>Find any breaks within a run in the supplied text.</p>
82    *
83    * @param text A UText representing the text. The iterator is left at
84    * the end of the run of characters which the engine is capable of handling
85    * that starts from the first (or last) character in the range.
86    * @param startPos The start of the run within the supplied text.
87    * @param endPos The end of the run within the supplied text.
88    * @param reverse Whether the caller is looking for breaks in a reverse
89    * direction.
90    * @param breakType The type of break desired, or -1.
91    * @param foundBreaks A UStack, passed by reference, for the breaks found, if any
92    * @return The number of breaks found.
93    */
94   virtual int32_t findBreaks( UText *text,
95                               int32_t startPos,
96                               int32_t endPos,
97                               UBool reverse,
98                               int32_t breakType,
99                               UStack &foundBreaks ) const;
100 
101  protected:
102 
103  /**
104   * <p>Set the character set handled by this engine.</p>
105   *
106   * @param set A UnicodeSet of the set of characters handled by the engine
107   */
108   virtual void setCharacters( const UnicodeSet &set );
109 
110  /**
111   * <p>Set the break types handled by this engine.</p> <p>Note: the declaration this comment describes is commented out below, so no such method is declared by this class.</p>
112   *
113   * @param breakTypes A bitmap of types handled by the engine.
114   */
115 //  virtual void setBreakTypes( uint32_t breakTypes );
116 
117  /**
118   * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
119   * Pure virtual: this class declares it with "= 0", so every concrete engine must supply an implementation.
120   * @param text A UText representing the text
121   * @param rangeStart The start of the range of dictionary characters
122   * @param rangeEnd The end of the range of dictionary characters
123   * @param foundBreaks A UStack, passed by reference, for the output break positions
124   * @return The number of breaks found
125   */
126   virtual int32_t divideUpDictionaryRange( UText *text,
127                                            int32_t rangeStart,
128                                            int32_t rangeEnd,
129                                            UStack &foundBreaks ) const = 0;
130 
131 };
132 
133 /*******************************************************************
134  * ThaiBreakEngine
135  */
136 
137 /**
138  * <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a
139  * dictionary and heuristics to determine Thai-specific breaks.</p>
140  *
141  * <p>After it is constructed a ThaiBreakEngine may be shared between
142  * threads without synchronization.</p>
143  */
144 class ThaiBreakEngine : public DictionaryBreakEngine {
145  private:
146     /**
147      * UnicodeSet members of this engine, followed by its DictionaryMatcher pointer. NOTE(review): the role of each set is implied only by its name here - confirm in the implementation.
148      * @internal
149      */
150 
151   UnicodeSet                fThaiWordSet;
152   UnicodeSet                fEndWordSet;
153   UnicodeSet                fBeginWordSet;
154   UnicodeSet                fSuffixSet;
155   UnicodeSet                fMarkSet;
156   DictionaryMatcher  *fDictionary;  // presumably the matcher adopted by the constructor (documented as deleted with the engine). NOTE(review): raw pointer and no copy constructor/assignment is declared here - confirm engines are never copied.
157 
158  public:
159 
160   /**
161    * <p>Constructor adopting a dictionary.</p>
162    *
163    * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the engine is deleted.
164    * @param status A UErrorCode passed by reference; presumably receives any construction error (usual ICU convention) - confirm.
165    */
166   ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
167 
168   /**
169    * <p>Virtual destructor.</p>
170    */
171   virtual ~ThaiBreakEngine();
172 
173  protected:
174  /**
175   * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
176   * Declared with the same signature as the pure virtual in DictionaryBreakEngine, which it overrides.
177   * @param text A UText representing the text
178   * @param rangeStart The start of the range of dictionary characters
179   * @param rangeEnd The end of the range of dictionary characters
180   * @param foundBreaks A UStack, passed by reference, for the output break positions
181   * @return The number of breaks found
182   */
183   virtual int32_t divideUpDictionaryRange( UText *text,
184                                            int32_t rangeStart,
185                                            int32_t rangeEnd,
186                                            UStack &foundBreaks ) const;
187 
188 };
189 
190 /*******************************************************************
191  * LaoBreakEngine
192  */
193 
194 /**
195  * <p>LaoBreakEngine is a kind of DictionaryBreakEngine that uses a
196  * dictionary and heuristics to determine Lao-specific breaks.</p>
197  *
198  * <p>After it is constructed a LaoBreakEngine may be shared between
199  * threads without synchronization.</p>
200  */
201 class LaoBreakEngine : public DictionaryBreakEngine {
202  private:
203     /**
204      * UnicodeSet members of this engine, followed by its DictionaryMatcher pointer. NOTE(review): the role of each set is implied only by its name here - confirm in the implementation.
205      * @internal
206      */
207 
208   UnicodeSet                fLaoWordSet;
209   UnicodeSet                fEndWordSet;
210   UnicodeSet                fBeginWordSet;
211   UnicodeSet                fMarkSet;
212   DictionaryMatcher  *fDictionary;  // presumably the matcher adopted by the constructor (documented as deleted with the engine). NOTE(review): raw pointer and no copy constructor/assignment is declared here - confirm engines are never copied.
213 
214  public:
215 
216   /**
217    * <p>Constructor adopting a dictionary.</p>
218    *
219    * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the engine is deleted.
220    * @param status A UErrorCode passed by reference; presumably receives any construction error (usual ICU convention) - confirm.
221    */
222   LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
223 
224   /**
225    * <p>Virtual destructor.</p>
226    */
227   virtual ~LaoBreakEngine();
228 
229  protected:
230  /**
231   * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
232   * Declared with the same signature as the pure virtual in DictionaryBreakEngine, which it overrides.
233   * @param text A UText representing the text
234   * @param rangeStart The start of the range of dictionary characters
235   * @param rangeEnd The end of the range of dictionary characters
236   * @param foundBreaks A UStack, passed by reference, for the output break positions
237   * @return The number of breaks found
238   */
239   virtual int32_t divideUpDictionaryRange( UText *text,
240                                            int32_t rangeStart,
241                                            int32_t rangeEnd,
242                                            UStack &foundBreaks ) const;
243 
244 };
245 
246 /*******************************************************************
247  * BurmeseBreakEngine
248  */
249 
250 /**
251  * <p>BurmeseBreakEngine is a kind of DictionaryBreakEngine that uses a
252  * DictionaryMatcher and heuristics to determine Burmese-specific breaks.</p>
253  *
254  * <p>After it is constructed a BurmeseBreakEngine may be shared between
255  * threads without synchronization.</p>
256  */
257 class BurmeseBreakEngine : public DictionaryBreakEngine {
258  private:
259     /**
260      * UnicodeSet members of this engine, followed by its DictionaryMatcher pointer. NOTE(review): the role of each set is implied only by its name here - confirm in the implementation.
261      * @internal
262      */
263 
264   UnicodeSet                fBurmeseWordSet;
265   UnicodeSet                fEndWordSet;
266   UnicodeSet                fBeginWordSet;
267   UnicodeSet                fMarkSet;
268   DictionaryMatcher  *fDictionary;  // presumably the matcher adopted by the constructor (documented as deleted with the engine). NOTE(review): raw pointer and no copy constructor/assignment is declared here - confirm engines are never copied.
269 
270  public:
271 
272   /**
273    * <p>Constructor adopting a dictionary.</p>
274    *
275    * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the engine is deleted.
276    * @param status A UErrorCode passed by reference; presumably receives any construction error (usual ICU convention) - confirm.
277    */
278   BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
279 
280   /**
281    * <p>Virtual destructor.</p>
282    */
283   virtual ~BurmeseBreakEngine();
284 
285  protected:
286  /**
287   * <p>Divide up a range of known dictionary characters.</p>
288   * Declared with the same signature as the pure virtual in DictionaryBreakEngine, which it overrides.
289   * @param text A UText representing the text
290   * @param rangeStart The start of the range of dictionary characters
291   * @param rangeEnd The end of the range of dictionary characters
292   * @param foundBreaks A UStack, passed by reference, for the output break positions
293   * @return The number of breaks found
294   */
295   virtual int32_t divideUpDictionaryRange( UText *text,
296                                            int32_t rangeStart,
297                                            int32_t rangeEnd,
298                                            UStack &foundBreaks ) const;
299 
300 };
301 
302 /*******************************************************************
303  * KhmerBreakEngine
304  */
305 
306 /**
307  * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a
308  * DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p>
309  *
310  * <p>After it is constructed a KhmerBreakEngine may be shared between
311  * threads without synchronization.</p>
312  */
313 class KhmerBreakEngine : public DictionaryBreakEngine {
314  private:
315     /**
316      * UnicodeSet members of this engine, followed by its DictionaryMatcher pointer. NOTE(review): the role of each set is implied only by its name here - confirm in the implementation.
317      * @internal
318      */
319 
320   UnicodeSet                fKhmerWordSet;
321   UnicodeSet                fEndWordSet;
322   UnicodeSet                fBeginWordSet;
323   UnicodeSet                fMarkSet;
324   DictionaryMatcher  *fDictionary;  // presumably the matcher adopted by the constructor (documented as deleted with the engine). NOTE(review): raw pointer and no copy constructor/assignment is declared here - confirm engines are never copied.
325 
326  public:
327 
328   /**
329    * <p>Constructor adopting a dictionary.</p>
330    *
331    * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the engine is deleted.
332    * @param status A UErrorCode passed by reference; presumably receives any construction error (usual ICU convention) - confirm.
333    */
334   KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
335 
336   /**
337    * <p>Virtual destructor.</p>
338    */
339   virtual ~KhmerBreakEngine();
340 
341  protected:
342  /**
343   * <p>Divide up a range of known dictionary characters.</p>
344   * Declared with the same signature as the pure virtual in DictionaryBreakEngine, which it overrides.
345   * @param text A UText representing the text
346   * @param rangeStart The start of the range of dictionary characters
347   * @param rangeEnd The end of the range of dictionary characters
348   * @param foundBreaks A UStack, passed by reference, for the output break positions
349   * @return The number of breaks found
350   */
351   virtual int32_t divideUpDictionaryRange( UText *text,
352                                            int32_t rangeStart,
353                                            int32_t rangeEnd,
354                                            UStack &foundBreaks ) const;
355 
356 };
357 
358 #if !UCONFIG_NO_NORMALIZATION
359 
360 /*******************************************************************
361  * CjkBreakEngine
362  */
363 
364 // Indicates the language/script that a CjkBreakEngine will handle; passed as the `type` argument of the CjkBreakEngine constructor.
365 enum LanguageType {
366     kKorean,            // first enumerator (implicit value 0)
367     kChineseJapanese    // second enumerator (implicit value 1); a single value covers both Chinese and Japanese
368 };
369 
370 /**
371  * <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a
372  * dictionary with costs associated with each word and
373  * Viterbi decoding to determine CJK-specific breaks.</p>
374  */
375 class CjkBreakEngine : public DictionaryBreakEngine {
376  protected:
377     /**
378      * UnicodeSet members (Hangul, Han, Katakana and Hiragana, going by their names), followed by the dictionary and normalizer pointers. Unlike the sibling engines, these members are protected rather than private.
379      * @internal
380      */
381   UnicodeSet                fHangulWordSet;
382   UnicodeSet                fHanWordSet;
383   UnicodeSet                fKatakanaWordSet;
384   UnicodeSet                fHiraganaWordSet;
385 
386   DictionaryMatcher        *fDictionary;  // presumably the matcher adopted by the constructor (documented as deleted with the engine). NOTE(review): raw pointer and no copy constructor/assignment is declared here - confirm engines are never copied.
387   const Normalizer2        *nfkcNorm2;    // pointer to a const Normalizer2; the name suggests an NFKC normalizer - confirm where it is obtained and who owns it
388 
389  public:
390 
391     /**
392      * <p>Constructor adopting a dictionary for a given language type.</p>
393      * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the engine is deleted.
394      * The DictionaryMatcher must contain costs for each word in order for the dictionary to work properly.
395      * @param type The LanguageType (kKorean or kChineseJapanese) indicating the language/script this engine will handle.
396      * @param status A UErrorCode passed by reference; presumably receives any construction error (usual ICU convention) - confirm.
397      */
398   CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status);
399 
400     /**
401      * <p>Virtual destructor.</p>
402      */
403   virtual ~CjkBreakEngine();
404 
405  protected:
406     /**
407      * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
408      * Declared with the same signature as the pure virtual in DictionaryBreakEngine, which it overrides.
409      * @param text A UText representing the text
410      * @param rangeStart The start of the range of dictionary characters
411      * @param rangeEnd The end of the range of dictionary characters
412      * @param foundBreaks A UStack, passed by reference, for the output break positions
413      * @return The number of breaks found
414      */
415   virtual int32_t divideUpDictionaryRange( UText *text,
416           int32_t rangeStart,
417           int32_t rangeEnd,
418           UStack &foundBreaks ) const;
419 
420 };
421 
422 #endif
423 
424 U_NAMESPACE_END
425 
426     /* DICTBE_H */
427 #endif
428