1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 //
4 //  rbbitblb.h
5 //
6 
7 /*
8 **********************************************************************
9 *   Copyright (c) 2002-2016, International Business Machines
10 *   Corporation and others.  All Rights Reserved.
11 **********************************************************************
12 */
13 
14 #ifndef RBBITBLB_H
15 #define RBBITBLB_H
16 
17 #include "unicode/utypes.h"
18 
19 #if !UCONFIG_NO_BREAK_ITERATION
20 
21 #include "unicode/uobject.h"
22 #include "unicode/rbbi.h"
23 #include "rbbirb.h"
24 #include "rbbinode.h"
25 
26 
27 U_NAMESPACE_BEGIN
28 
29 class RBBIRuleScanner;
30 class RBBIRuleBuilder;
31 class UVector32;
32 
33 //
34 //  class RBBITableBuilder is part of the RBBI rule compiler.
35 //                         It builds the state transition table used by the RBBI runtime
36 //                         from the expression syntax tree generated by the rule scanner.
37 //
38 //                         This class is part of the RBBI implementation only.
39 //                         There is no user-visible public API here.
40 //
41 
42 class RBBITableBuilder : public UMemory {
43 public:
44     RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode, UErrorCode &status);
45     ~RBBITableBuilder();
46 
47     void     buildForwardTable();
48 
49     /** Return the runtime size in bytes of the built state table.  */
50     int32_t  getTableSize() const;
51 
52     /** Fill in the runtime state table. Sufficient memory must exist at the specified location.
53      */
54     void     exportTable(void *where);
55 
56     /**
57      *  Find duplicate (redundant) character classes. Begin looking with categories.first.
58      *  Duplicate, if found are returned in the categories parameter.
59      *  This is an iterator-like function, used to identify character classes
60      *  (state table columns) that can be eliminated.
61      *  @param categories in/out parameter, specifies where to start looking for duplicates,
62      *                and returns the first pair of duplicates found, if any.
63      *  @return true if duplicate char classes were found, false otherwise.
64      */
65     bool     findDuplCharClassFrom(IntPair *categories);
66 
67     /** Remove a column from the state table. Used when two character categories
68      *  have been found equivalent, and merged together, to eliminate the uneeded table column.
69      */
70     void     removeColumn(int32_t column);
71 
72     /**
73      * Check for, and remove dupicate states (table rows).
74      * @return the number of states removed.
75      */
76     int32_t  removeDuplicateStates();
77 
78     /** Build the safe reverse table from the already-constructed forward table. */
79     void     buildSafeReverseTable(UErrorCode &status);
80 
81     /** Return the runtime size in bytes of the built safe reverse state table. */
82     int32_t  getSafeTableSize() const;
83 
84     /** Fill in the runtime safe state table. Sufficient memory must exist at the specified location.
85      */
86     void     exportSafeTable(void *where);
87 
88 
89 private:
90     void     calcNullable(RBBINode *n);
91     void     calcFirstPos(RBBINode *n);
92     void     calcLastPos(RBBINode  *n);
93     void     calcFollowPos(RBBINode *n);
94     void     calcChainedFollowPos(RBBINode *n, RBBINode *endMarkNode);
95     void     bofFixup();
96     void     buildStateTable();
97     void     mapLookAheadRules();
98     void     flagAcceptingStates();
99     void     flagLookAheadStates();
100     void     flagTaggedStates();
101     void     mergeRuleStatusVals();
102 
103     /**
104      * Merge redundant state table columns, eliminating character classes with identical behavior.
105      * Done after the state tables are generated, just before converting to their run-time format.
106      */
107     int32_t  mergeColumns();
108 
109     void     addRuleRootNodes(UVector *dest, RBBINode *node);
110 
111     /**
112      *  Find duplicate (redundant) states, beginning at the specified pair,
113      *  within this state table. This is an iterator-like function, used to
114      *  identify states (state table rows) that can be eliminated.
115      *  @param states in/out parameter, specifies where to start looking for duplicates,
116      *                and returns the first pair of duplicates found, if any.
117      *  @return true if duplicate states were found, false otherwise.
118      */
119     bool findDuplicateState(IntPair *states);
120 
121     /** Remove a duplicate state.
122      * @param duplStates The duplicate states. The first is kept, the second is removed.
123      *                   All references to the second in the state table are retargeted
124      *                   to the first.
125      */
126     void removeState(IntPair duplStates);
127 
128     /** Find the next duplicate state in the safe reverse table. An iterator function.
129      *  @param states in/out parameter, specifies where to start looking for duplicates,
130      *                and returns the first pair of duplicates found, if any.
131      *  @return true if a duplicate pair of states was found.
132      */
133     bool findDuplicateSafeState(IntPair *states);
134 
135     /** Remove a duplicate state from the safe table.
136      * @param duplStates The duplicate states. The first is kept, the second is removed.
137      *                   All references to the second in the state table are retargeted
138      *                   to the first.
139      */
140     void removeSafeState(IntPair duplStates);
141 
142     // Set functions for UVector.
143     //   TODO:  make a USet subclass of UVector
144 
145     void     setAdd(UVector *dest, UVector *source);
146     UBool    setEquals(UVector *a, UVector *b);
147 
148     void     sortedAdd(UVector **dest, int32_t val);
149 
150 public:
151 #ifdef RBBI_DEBUG
152     void     printSet(UVector *s);
153     void     printPosSets(RBBINode *n /* = NULL*/);
154     void     printStates();
155     void     printRuleStatusTable();
156     void     printReverseTable();
157 #else
158     #define  printSet(s)
159     #define  printPosSets(n)
160     #define  printStates()
161     #define  printRuleStatusTable()
162     #define  printReverseTable()
163 #endif
164 
165 private:
166     RBBIRuleBuilder  *fRB;
167     RBBINode         *&fTree;              // The root node of the parse tree to build a
168                                            //   table for.
169     UErrorCode       *fStatus;
170 
171     /** State Descriptors, UVector<RBBIStateDescriptor> */
172     UVector          *fDStates;            //  D states (Aho's terminology)
173                                            //  Index is state number
174                                            //  Contents are RBBIStateDescriptor pointers.
175 
176     /** Synthesized safe table, UVector of UnicodeString, one string per table row.   */
177     UVector          *fSafeTable;
178 
179     /** Map from rule number (fVal in look ahead nodes) to sequential lookahead index. */
180     UVector32        *fLookAheadRuleMap = nullptr;
181 
182 
183     RBBITableBuilder(const RBBITableBuilder &other); // forbid copying of this class
184     RBBITableBuilder &operator=(const RBBITableBuilder &other); // forbid copying of this class
185 };
186 
187 //
188 //  RBBIStateDescriptor - The DFA is constructed as a set of these descriptors,
189 //                        one for each state.
190 class RBBIStateDescriptor : public UMemory {
191 public:
192     UBool            fMarked;
193     int32_t          fAccepting;
194     int32_t          fLookAhead;
195     UVector          *fTagVals;
196     int32_t          fTagsIdx;
197     UVector          *fPositions;          // Set of parse tree positions associated
198                                            //   with this state.  Unordered (it's a set).
199                                            //   UVector contents are RBBINode *
200 
201     UVector32        *fDtran;              // Transitions out of this state.
202                                            //   indexed by input character
203                                            //   contents is int index of dest state
204                                            //   in RBBITableBuilder.fDStates
205 
206     RBBIStateDescriptor(int maxInputSymbol,  UErrorCode *fStatus);
207     ~RBBIStateDescriptor();
208 
209 private:
210     RBBIStateDescriptor(const RBBIStateDescriptor &other); // forbid copying of this class
211     RBBIStateDescriptor &operator=(const RBBIStateDescriptor &other); // forbid copying of this class
212 };
213 
214 
215 
216 U_NAMESPACE_END
217 
218 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
219 
220 #endif
221