1 //
2 //  rbbirb.h
3 //
4 //  Copyright (C) 2002-2008, International Business Machines Corporation and others.
5 //  All Rights Reserved.
6 //
7 //  This file contains declarations for several classes from the
8 //    Rule Based Break Iterator rule builder.
9 //
10 
11 
12 #ifndef RBBIRB_H
13 #define RBBIRB_H
14 
15 #include "unicode/utypes.h"
16 #include "unicode/uobject.h"
17 #include "unicode/rbbi.h"
18 #include "unicode/uniset.h"
19 #include "unicode/parseerr.h"
20 #include "uhash.h"
21 #include "uvector.h"
22 #include "unicode/symtable.h"// For UnicodeSet parsing, is the interface that
23                           //    looks up references to $variables within a set.
24 
25 
26 
27 U_NAMESPACE_BEGIN
28 
// Forward declarations of other rule-builder types.  Wherever this header
//   uses one of them it is only through a pointer, so the full definitions
//   are not needed here.
struct RBBIRuleTableEl;
class RBBINode;
class RBBIRuleScanner;
class RBBISetBuilder;
class RBBITableBuilder;
34 
35 
36 
37 //--------------------------------------------------------------------------------
38 //
39 //   RBBISymbolTable.    Implements SymbolTable interface that is used by the
40 //                       UnicodeSet parser to resolve references to $variables.
41 //
42 //--------------------------------------------------------------------------------
43 class RBBISymbolTableEntry : public UMemory { // The symbol table hash table contains one
44 public:                                       //   of these structs for each entry.
45     RBBISymbolTableEntry();
46     UnicodeString          key;
47     RBBINode               *val;
48     ~RBBISymbolTableEntry();
49 
50 private:
51     RBBISymbolTableEntry(const RBBISymbolTableEntry &other); // forbid copying of this class
52     RBBISymbolTableEntry &operator=(const RBBISymbolTableEntry &other); // forbid copying of this class
53 };
54 
55 
56 class RBBISymbolTable : public UMemory, public SymbolTable {
57 private:
58     const UnicodeString      &fRules;
59     UHashtable               *fHashTable;
60     RBBIRuleScanner          *fRuleScanner;
61 
62     // These next two fields are part of the mechanism for passing references to
63     //   already-constructed UnicodeSets back to the UnicodeSet constructor
64     //   when the pattern includes $variable references.
65     const UnicodeString      ffffString;      // = "/uffff"
66     UnicodeSet              *fCachedSetLookup;
67 
68 public:
69     //  API inherited from class SymbolTable
70     virtual const UnicodeString*  lookup(const UnicodeString& s) const;
71     virtual const UnicodeFunctor* lookupMatcher(UChar32 ch) const;
72     virtual UnicodeString parseReference(const UnicodeString& text,
73                                          ParsePosition& pos, int32_t limit) const;
74 
75     //  Additional Functions
76     RBBISymbolTable(RBBIRuleScanner *, const UnicodeString &fRules, UErrorCode &status);
77     virtual ~RBBISymbolTable();
78 
79     virtual RBBINode *lookupNode(const UnicodeString &key) const;
80     virtual void      addEntry  (const UnicodeString &key, RBBINode *val, UErrorCode &err);
81 
82 #ifdef RBBI_DEBUG
83     virtual void      rbbiSymtablePrint() const;
84 #else
85     // A do-nothing inline function for non-debug builds.  Member funcs can't be empty
86     //  or the call sites won't compile.
87     int32_t fFakeField;
88     #define rbbiSymtablePrint() fFakeField=0;
89 #endif
90 
91 private:
92     RBBISymbolTable(const RBBISymbolTable &other); // forbid copying of this class
93     RBBISymbolTable &operator=(const RBBISymbolTable &other); // forbid copying of this class
94 };
95 
96 
97 //--------------------------------------------------------------------------------
98 //
99 //  class RBBIRuleBuilder       The top-level class handling RBBI rule compiling.
100 //
101 //--------------------------------------------------------------------------------
102 class RBBIRuleBuilder : public UMemory {
103 public:
104 
105     //  Create a rule based break iterator from a set of rules.
106     //  This function is the main entry point into the rule builder.  The
107     //   public ICU API for creating RBBIs uses this function to do the actual work.
108     //
109     static BreakIterator * createRuleBasedBreakIterator( const UnicodeString    &rules,
110                                     UParseError      *parseError,
111                                     UErrorCode       &status);
112 
113 public:
114     // The "public" functions and data members that appear below are accessed
115     //  (and shared) by the various parts that make up the rule builder.  They
116     //  are NOT intended to be accessed by anything outside of the
117     //  rule builder implementation.
118     RBBIRuleBuilder(const UnicodeString  &rules,
119                     UParseError          *parseErr,
120                     UErrorCode           &status
121         );
122 
123     virtual    ~RBBIRuleBuilder();
124     char                          *fDebugEnv;        // controls debug trace output
125     UErrorCode                    *fStatus;          // Error reporting.  Keeping status
126     UParseError                   *fParseError;      //   here avoids passing it everywhere.
127     const UnicodeString           &fRules;           // The rule string that we are compiling
128 
129     RBBIRuleScanner               *fScanner;         // The scanner.
130     RBBINode                      *fForwardTree;     // The parse trees, generated by the scanner,
131     RBBINode                      *fReverseTree;     //   then manipulated by subsequent steps.
132     RBBINode                      *fSafeFwdTree;
133     RBBINode                      *fSafeRevTree;
134 
135     RBBINode                      **fDefaultTree;    // For rules not qualified with a !
136                                                      //   the tree to which they belong to.
137 
138     UBool                         fChainRules;       // True for chained Unicode TR style rules.
139                                                      // False for traditional regexp rules.
140 
141     UBool                         fLBCMNoChain;      // True:  suppress chaining of rules on
142                                                      //   chars with LineBreak property == CM.
143 
144     UBool                         fLookAheadHardBreak;  // True:  Look ahead matches cause an
145                                                      // immediate break, no continuing for the
146                                                      // longest match.
147 
148     RBBISetBuilder                *fSetBuilder;      // Set and Character Category builder.
149     UVector                       *fUSetNodes;       // Vector of all uset nodes.
150 
151     RBBITableBuilder              *fForwardTables;   // State transition tables
152     RBBITableBuilder              *fReverseTables;
153     RBBITableBuilder              *fSafeFwdTables;
154     RBBITableBuilder              *fSafeRevTables;
155 
156     UVector                       *fRuleStatusVals;  // The values that can be returned
157                                                      //   from getRuleStatus().
158 
159     RBBIDataHeader                *flattenData();    // Create the flattened (runtime format)
160                                                      // data tables..
161 private:
162     RBBIRuleBuilder(const RBBIRuleBuilder &other); // forbid copying of this class
163     RBBIRuleBuilder &operator=(const RBBIRuleBuilder &other); // forbid copying of this class
164 };
165 
166 
167 
168 
169 //----------------------------------------------------------------------------
170 //
171 //   RBBISetTableEl   is an entry in the hash table of UnicodeSets that have
172 //                    been encountered.  The val Node will be of nodetype uset
173 //                    and contain pointers to the actual UnicodeSets.
174 //                    The Key is the source string for initializing the set.
175 //
176 //                    The hash table is used to avoid creating duplicate
177 //                    unnamed (not $var references) UnicodeSets.
178 //
179 //                    Memory Management:
180 //                       The Hash Table owns these RBBISetTableEl structs and
181 //                            the key strings.  It does NOT own the val nodes.
182 //
183 //----------------------------------------------------------------------------
184 struct RBBISetTableEl {
185     UnicodeString *key;
186     RBBINode      *val;
187 };
188 
189 
190 //----------------------------------------------------------------------------
191 //
192 //   RBBIDebugPrintf    Printf equivalent, for debugging output.
193 //                      Conditional compilation of the implementation lets us
194 //                      get rid of the stdio dependency in environments where it
195 //                      is unavailable.
196 //
197 //----------------------------------------------------------------------------
198 #ifdef RBBI_DEBUG
199 #include <stdio.h>
200 #define RBBIDebugPrintf printf
201 #define RBBIDebugPuts puts
202 #else
203 #undef RBBIDebugPrintf
204 #define RBBIDebugPuts(arg)
205 #endif
206 
207 U_NAMESPACE_END
208 #endif
209 
210 
211 
212