1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 //
4 //  rbbisetb.h
5 /*
6 **********************************************************************
7 *   Copyright (c) 2001-2005, International Business Machines
8 *   Corporation and others.  All Rights Reserved.
9 **********************************************************************
10 */
11 
12 #ifndef RBBISETB_H
13 #define RBBISETB_H
14 
15 #include "unicode/utypes.h"
16 
17 #if !UCONFIG_NO_BREAK_ITERATION
18 
19 #include "unicode/ucptrie.h"
20 #include "unicode/umutablecptrie.h"
21 #include "unicode/uobject.h"
22 #include "rbbirb.h"
23 #include "uvector.h"
24 
25 U_NAMESPACE_BEGIN
26 
27 //
//  RBBISetBuilder   Derives the character categories used by the runtime RBBI engine
//                   from the Unicode Sets appearing in the source RBBI rules, and
//                   creates the TRIE table used to map from Unicode code points to
//                   the character categories.
32 //
33 
34 
35 //
36 //  RangeDescriptor
37 //
38 //     Each of the non-overlapping character ranges gets one of these descriptors.
39 //     All of them are strung together in a linked list, which is kept in order
40 //     (by character)
41 //
42 class RangeDescriptor : public UMemory {
43 public:
44     UChar32            fStartChar {};            // Start of range, unicode 32 bit value.
45     UChar32            fEndChar {};              // End of range, unicode 32 bit value.
46     int32_t            fNum {0};                 // runtime-mapped input value for this range.
47     bool               fIncludesDict {false};    // True if the range includes $dictionary.
48     bool               fFirstInGroup {false};    // True if first range in a group with the same fNum.
49     UVector           *fIncludesSets {nullptr};  // vector of the the original
50                                                  //   Unicode sets that include this range.
51                                                  //    (Contains ptrs to uset nodes)
52     RangeDescriptor   *fNext {nullptr};          // Next RangeDescriptor in the linked list.
53 
54     RangeDescriptor(UErrorCode &status);
55     RangeDescriptor(const RangeDescriptor &other, UErrorCode &status);
56     ~RangeDescriptor();
57     void split(UChar32 where, UErrorCode &status);   // Spit this range in two at "where", with
58                                         //   where appearing in the second (higher) part.
59     bool isDictionaryRange();           // Check whether this range appears as part of
60                                         //   the Unicode set named "dictionary"
61 
62     RangeDescriptor(const RangeDescriptor &other) = delete; // forbid default copying of this class
63     RangeDescriptor &operator=(const RangeDescriptor &other) = delete; // forbid assigning of this class
64 };
65 
66 
67 //
68 //  RBBISetBuilder   Handles processing of Unicode Sets from RBBI rules.
69 //
//      Starting with the rules parse tree from the scanner, this class will:
//
//                   -  Enumerate the set of UnicodeSets that are referenced
//                      by the RBBI rules.
//                   -  Compute a derived set of non-overlapping UnicodeSets
//                      that will correspond to columns in the state table for
//                      the RBBI execution engine.
//                   -  Construct the trie table that maps input characters
//                      to set numbers in the non-overlapping set of sets.
79 //
80 
81 
82 class RBBISetBuilder : public UMemory {
83 public:
84     RBBISetBuilder(RBBIRuleBuilder *rb);
85     ~RBBISetBuilder();
86 
87     void     buildRanges();
88     void     buildTrie();
89     void     addValToSets(UVector *sets,      uint32_t val);
90     void     addValToSet (RBBINode *usetNode, uint32_t val);
91     int32_t  getNumCharCategories() const;   // CharCategories are the same as input symbol set to the
92                                              //    runtime state machine, which are the same as
93                                              //    columns in the DFA state table
94     int32_t  getDictCategoriesStart() const; // First char category that includes $dictionary, or
95                                              // last category + 1 if there are no dictionary categories.
96     int32_t  getTrieSize() /*const*/;        // Size in bytes of the serialized Trie.
97     void     serializeTrie(uint8_t *where);  // write out the serialized Trie.
98     UChar32  getFirstChar(int32_t  val) const;
99     UBool    sawBOF() const;                 // Indicate whether any references to the {bof} pseudo
100                                              //   character were encountered.
101     /**
102      * Merge two character categories that have been identified as having equivalent behavior.
103      * The ranges belonging to the second category (table column) will be added to the first.
104      * @param categories the pair of categories to be merged.
105      */
106     void     mergeCategories(IntPair categories);
107 
108 #ifdef RBBI_DEBUG
109     void     printSets();
110     void     printRanges();
111     void     printRangeGroups();
112 #else
113     #define printSets()
114     #define printRanges()
115     #define printRangeGroups()
116 #endif
117 
118 private:
119     RBBIRuleBuilder       *fRB;             // The RBBI Rule Compiler that owns us.
120     UErrorCode            *fStatus;
121 
122     RangeDescriptor       *fRangeList;      // Head of the linked list of RangeDescriptors
123 
124     UMutableCPTrie        *fMutableTrie;    // The mapping TRIE that is the end result of processing
125     UCPTrie               *fTrie;           //  the Unicode Sets.
126     uint32_t               fTrieSize;
127 
128     // Number of range groups, which are groups of ranges that are in the same original UnicodeSets.
129     int32_t               fGroupCount;
130 
131     // The number of the first dictionary char category.
132     // If there are no Dictionary categories, set to the last category + 1.
133     int32_t               fDictCategoriesStart;
134 
135     UBool                 fSawBOF;
136 
137     RBBISetBuilder(const RBBISetBuilder &other); // forbid copying of this class
138     RBBISetBuilder &operator=(const RBBISetBuilder &other); // forbid copying of this class
139 };
140 
141 
142 
143 U_NAMESPACE_END
144 
145 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
146 
147 #endif
148