1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 //
4 //  rbbitblb.h
5 //
6 
7 /*
8 **********************************************************************
9 *   Copyright (c) 2002-2016, International Business Machines
10 *   Corporation and others.  All Rights Reserved.
11 **********************************************************************
12 */
13 
14 #ifndef RBBITBLB_H
15 #define RBBITBLB_H
16 
17 #include "unicode/utypes.h"
18 
19 #if !UCONFIG_NO_BREAK_ITERATION
20 
21 #include "unicode/uobject.h"
22 #include "unicode/rbbi.h"
23 #include "rbbidata.h"
24 #include "rbbirb.h"
25 #include "rbbinode.h"
26 
27 
28 U_NAMESPACE_BEGIN
29 
30 class RBBIRuleScanner;
31 class RBBIRuleBuilder;
32 class UVector32;
33 
34 //
35 //  class RBBITableBuilder is part of the RBBI rule compiler.
36 //                         It builds the state transition table used by the RBBI runtime
37 //                         from the expression syntax tree generated by the rule scanner.
38 //
39 //                         This class is part of the RBBI implementation only.
40 //                         There is no user-visible public API here.
41 //
42 
43 class RBBITableBuilder : public UMemory {
44 public:
45     RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode, UErrorCode &status);
46     ~RBBITableBuilder();
47 
48     void     buildForwardTable();
49 
50     /** Return the runtime size in bytes of the built state table.  */
51     int32_t  getTableSize() const;
52 
53     /** Fill in the runtime state table. Sufficient memory must exist at the specified location.
54      */
55     void     exportTable(void *where);
56 
57     /** Use 8 bits to encode the forward table */
58     bool     use8BitsForTable() const;
59 
60     /**
61      *  Find duplicate (redundant) character classes. Begin looking with categories.first.
62      *  Duplicate, if found are returned in the categories parameter.
63      *  This is an iterator-like function, used to identify character classes
64      *  (state table columns) that can be eliminated.
65      *  @param categories in/out parameter, specifies where to start looking for duplicates,
66      *                and returns the first pair of duplicates found, if any.
67      *  @return true if duplicate char classes were found, false otherwise.
68      */
69     bool     findDuplCharClassFrom(IntPair *categories);
70 
71     /** Remove a column from the state table. Used when two character categories
72      *  have been found equivalent, and merged together, to eliminate the uneeded table column.
73      */
74     void     removeColumn(int32_t column);
75 
76     /**
77      * Check for, and remove dupicate states (table rows).
78      * @return the number of states removed.
79      */
80     int32_t  removeDuplicateStates();
81 
82     /** Build the safe reverse table from the already-constructed forward table. */
83     void     buildSafeReverseTable(UErrorCode &status);
84 
85     /** Return the runtime size in bytes of the built safe reverse state table. */
86     int32_t  getSafeTableSize() const;
87 
88     /** Fill in the runtime safe state table. Sufficient memory must exist at the specified location.
89      */
90     void     exportSafeTable(void *where);
91 
92     /** Use 8 bits to encode the safe reverse table */
93     bool     use8BitsForSafeTable() const;
94 
95 private:
96     void     calcNullable(RBBINode *n);
97     void     calcFirstPos(RBBINode *n);
98     void     calcLastPos(RBBINode  *n);
99     void     calcFollowPos(RBBINode *n);
100     void     calcChainedFollowPos(RBBINode *n, RBBINode *endMarkNode);
101     void     bofFixup();
102     void     buildStateTable();
103     void     mapLookAheadRules();
104     void     flagAcceptingStates();
105     void     flagLookAheadStates();
106     void     flagTaggedStates();
107     void     mergeRuleStatusVals();
108 
109     /**
110      * Merge redundant state table columns, eliminating character classes with identical behavior.
111      * Done after the state tables are generated, just before converting to their run-time format.
112      */
113     int32_t  mergeColumns();
114 
115     void     addRuleRootNodes(UVector *dest, RBBINode *node);
116 
117     /**
118      *  Find duplicate (redundant) states, beginning at the specified pair,
119      *  within this state table. This is an iterator-like function, used to
120      *  identify states (state table rows) that can be eliminated.
121      *  @param states in/out parameter, specifies where to start looking for duplicates,
122      *                and returns the first pair of duplicates found, if any.
123      *  @return true if duplicate states were found, false otherwise.
124      */
125     bool findDuplicateState(IntPair *states);
126 
127     /** Remove a duplicate state.
128      * @param duplStates The duplicate states. The first is kept, the second is removed.
129      *                   All references to the second in the state table are retargeted
130      *                   to the first.
131      */
132     void removeState(IntPair duplStates);
133 
134     /** Find the next duplicate state in the safe reverse table. An iterator function.
135      *  @param states in/out parameter, specifies where to start looking for duplicates,
136      *                and returns the first pair of duplicates found, if any.
137      *  @return true if a duplicate pair of states was found.
138      */
139     bool findDuplicateSafeState(IntPair *states);
140 
141     /** Remove a duplicate state from the safe table.
142      * @param duplStates The duplicate states. The first is kept, the second is removed.
143      *                   All references to the second in the state table are retargeted
144      *                   to the first.
145      */
146     void removeSafeState(IntPair duplStates);
147 
148     // Set functions for UVector.
149     //   TODO:  make a USet subclass of UVector
150 
151     void     setAdd(UVector *dest, UVector *source);
152     UBool    setEquals(UVector *a, UVector *b);
153 
154     void     sortedAdd(UVector **dest, int32_t val);
155 
156 public:
157 #ifdef RBBI_DEBUG
158     void     printSet(UVector *s);
159     void     printPosSets(RBBINode *n /* = NULL*/);
160     void     printStates();
161     void     printRuleStatusTable();
162     void     printReverseTable();
163 #else
164     #define  printSet(s)
165     #define  printPosSets(n)
166     #define  printStates()
167     #define  printRuleStatusTable()
168     #define  printReverseTable()
169 #endif
170 
171 private:
172     RBBIRuleBuilder  *fRB;
173     RBBINode         *&fTree;              // The root node of the parse tree to build a
174                                            //   table for.
175     UErrorCode       *fStatus;
176 
177     /** State Descriptors, UVector<RBBIStateDescriptor> */
178     UVector          *fDStates;            //  D states (Aho's terminology)
179                                            //  Index is state number
180                                            //  Contents are RBBIStateDescriptor pointers.
181 
182     /** Synthesized safe table, UVector of UnicodeString, one string per table row.   */
183     UVector          *fSafeTable;
184 
185     /** Map from rule number (fVal in look ahead nodes) to sequential lookahead index. */
186     UVector32        *fLookAheadRuleMap = nullptr;
187 
188     /* Counter used when assigning lookahead rule numbers.
189      * Contains the last look-ahead number already in use.
190      * The first look-ahead number is 2; Number 1 (ACCEPTING_UNCONDITIONAL) is reserved
191      * for non-lookahead accepting states. See the declarations of RBBIStateTableRowT.   */
192     int32_t          fLASlotsInUse = ACCEPTING_UNCONDITIONAL;
193 
194 
195     RBBITableBuilder(const RBBITableBuilder &other) = delete; // forbid copying of this class
196     RBBITableBuilder &operator=(const RBBITableBuilder &other) = delete; // forbid copying of this class
197 };
198 
199 //
200 //  RBBIStateDescriptor - The DFA is constructed as a set of these descriptors,
201 //                        one for each state.
202 class RBBIStateDescriptor : public UMemory {
203 public:
204     UBool            fMarked;
205     uint32_t         fAccepting;
206     uint32_t         fLookAhead;
207     UVector          *fTagVals;
208     int32_t          fTagsIdx;
209     UVector          *fPositions;          // Set of parse tree positions associated
210                                            //   with this state.  Unordered (it's a set).
211                                            //   UVector contents are RBBINode *
212 
213     UVector32        *fDtran;              // Transitions out of this state.
214                                            //   indexed by input character
215                                            //   contents is int index of dest state
216                                            //   in RBBITableBuilder.fDStates
217 
218     RBBIStateDescriptor(int maxInputSymbol,  UErrorCode *fStatus);
219     ~RBBIStateDescriptor();
220 
221 private:
222     RBBIStateDescriptor(const RBBIStateDescriptor &other); // forbid copying of this class
223     RBBIStateDescriptor &operator=(const RBBIStateDescriptor &other); // forbid copying of this class
224 };
225 
226 
227 
228 U_NAMESPACE_END
229 
230 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
231 
232 #endif
233