1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 // 4 // rbbitblb.h 5 // 6 7 /* 8 ********************************************************************** 9 * Copyright (c) 2002-2016, International Business Machines 10 * Corporation and others. All Rights Reserved. 11 ********************************************************************** 12 */ 13 14 #ifndef RBBITBLB_H 15 #define RBBITBLB_H 16 17 #include "unicode/utypes.h" 18 19 #if !UCONFIG_NO_BREAK_ITERATION 20 21 #include "unicode/uobject.h" 22 #include "unicode/rbbi.h" 23 #include "rbbirb.h" 24 #include "rbbinode.h" 25 26 27 U_NAMESPACE_BEGIN 28 29 class RBBIRuleScanner; 30 class RBBIRuleBuilder; 31 class UVector32; 32 33 // 34 // class RBBITableBuilder is part of the RBBI rule compiler. 35 // It builds the state transition table used by the RBBI runtime 36 // from the expression syntax tree generated by the rule scanner. 37 // 38 // This class is part of the RBBI implementation only. 39 // There is no user-visible public API here. 40 // 41 42 class RBBITableBuilder : public UMemory { 43 public: 44 RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode, UErrorCode &status); 45 ~RBBITableBuilder(); 46 47 void buildForwardTable(); 48 49 /** Return the runtime size in bytes of the built state table. */ 50 int32_t getTableSize() const; 51 52 /** Fill in the runtime state table. Sufficient memory must exist at the specified location. 53 */ 54 void exportTable(void *where); 55 56 /** 57 * Find duplicate (redundant) character classes. Begin looking with categories.first. 58 * Duplicate, if found are returned in the categories parameter. 59 * This is an iterator-like function, used to identify character classes 60 * (state table columns) that can be eliminated. 61 * @param categories in/out parameter, specifies where to start looking for duplicates, 62 * and returns the first pair of duplicates found, if any. 63 * @return true if duplicate char classes were found, false otherwise. 64 */ 65 bool findDuplCharClassFrom(IntPair *categories); 66 67 /** Remove a column from the state table. Used when two character categories 68 * have been found equivalent, and merged together, to eliminate the uneeded table column. 69 */ 70 void removeColumn(int32_t column); 71 72 /** 73 * Check for, and remove dupicate states (table rows). 74 * @return the number of states removed. 75 */ 76 int32_t removeDuplicateStates(); 77 78 /** Build the safe reverse table from the already-constructed forward table. */ 79 void buildSafeReverseTable(UErrorCode &status); 80 81 /** Return the runtime size in bytes of the built safe reverse state table. */ 82 int32_t getSafeTableSize() const; 83 84 /** Fill in the runtime safe state table. Sufficient memory must exist at the specified location. 85 */ 86 void exportSafeTable(void *where); 87 88 89 private: 90 void calcNullable(RBBINode *n); 91 void calcFirstPos(RBBINode *n); 92 void calcLastPos(RBBINode *n); 93 void calcFollowPos(RBBINode *n); 94 void calcChainedFollowPos(RBBINode *n, RBBINode *endMarkNode); 95 void bofFixup(); 96 void buildStateTable(); 97 void mapLookAheadRules(); 98 void flagAcceptingStates(); 99 void flagLookAheadStates(); 100 void flagTaggedStates(); 101 void mergeRuleStatusVals(); 102 103 /** 104 * Merge redundant state table columns, eliminating character classes with identical behavior. 105 * Done after the state tables are generated, just before converting to their run-time format. 106 */ 107 int32_t mergeColumns(); 108 109 void addRuleRootNodes(UVector *dest, RBBINode *node); 110 111 /** 112 * Find duplicate (redundant) states, beginning at the specified pair, 113 * within this state table. This is an iterator-like function, used to 114 * identify states (state table rows) that can be eliminated. 115 * @param states in/out parameter, specifies where to start looking for duplicates, 116 * and returns the first pair of duplicates found, if any. 117 * @return true if duplicate states were found, false otherwise. 118 */ 119 bool findDuplicateState(IntPair *states); 120 121 /** Remove a duplicate state. 122 * @param duplStates The duplicate states. The first is kept, the second is removed. 123 * All references to the second in the state table are retargeted 124 * to the first. 125 */ 126 void removeState(IntPair duplStates); 127 128 /** Find the next duplicate state in the safe reverse table. An iterator function. 129 * @param states in/out parameter, specifies where to start looking for duplicates, 130 * and returns the first pair of duplicates found, if any. 131 * @return true if a duplicate pair of states was found. 132 */ 133 bool findDuplicateSafeState(IntPair *states); 134 135 /** Remove a duplicate state from the safe table. 136 * @param duplStates The duplicate states. The first is kept, the second is removed. 137 * All references to the second in the state table are retargeted 138 * to the first. 139 */ 140 void removeSafeState(IntPair duplStates); 141 142 // Set functions for UVector. 143 // TODO: make a USet subclass of UVector 144 145 void setAdd(UVector *dest, UVector *source); 146 UBool setEquals(UVector *a, UVector *b); 147 148 void sortedAdd(UVector **dest, int32_t val); 149 150 public: 151 #ifdef RBBI_DEBUG 152 void printSet(UVector *s); 153 void printPosSets(RBBINode *n /* = NULL*/); 154 void printStates(); 155 void printRuleStatusTable(); 156 void printReverseTable(); 157 #else 158 #define printSet(s) 159 #define printPosSets(n) 160 #define printStates() 161 #define printRuleStatusTable() 162 #define printReverseTable() 163 #endif 164 165 private: 166 RBBIRuleBuilder *fRB; 167 RBBINode *&fTree; // The root node of the parse tree to build a 168 // table for. 169 UErrorCode *fStatus; 170 171 /** State Descriptors, UVector<RBBIStateDescriptor> */ 172 UVector *fDStates; // D states (Aho's terminology) 173 // Index is state number 174 // Contents are RBBIStateDescriptor pointers. 175 176 /** Synthesized safe table, UVector of UnicodeString, one string per table row. */ 177 UVector *fSafeTable; 178 179 /** Map from rule number (fVal in look ahead nodes) to sequential lookahead index. */ 180 UVector32 *fLookAheadRuleMap = nullptr; 181 182 183 RBBITableBuilder(const RBBITableBuilder &other); // forbid copying of this class 184 RBBITableBuilder &operator=(const RBBITableBuilder &other); // forbid copying of this class 185 }; 186 187 // 188 // RBBIStateDescriptor - The DFA is constructed as a set of these descriptors, 189 // one for each state. 190 class RBBIStateDescriptor : public UMemory { 191 public: 192 UBool fMarked; 193 int32_t fAccepting; 194 int32_t fLookAhead; 195 UVector *fTagVals; 196 int32_t fTagsIdx; 197 UVector *fPositions; // Set of parse tree positions associated 198 // with this state. Unordered (it's a set). 199 // UVector contents are RBBINode * 200 201 UVector32 *fDtran; // Transitions out of this state. 202 // indexed by input character 203 // contents is int index of dest state 204 // in RBBITableBuilder.fDStates 205 206 RBBIStateDescriptor(int maxInputSymbol, UErrorCode *fStatus); 207 ~RBBIStateDescriptor(); 208 209 private: 210 RBBIStateDescriptor(const RBBIStateDescriptor &other); // forbid copying of this class 211 RBBIStateDescriptor &operator=(const RBBIStateDescriptor &other); // forbid copying of this class 212 }; 213 214 215 216 U_NAMESPACE_END 217 218 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 219 220 #endif 221