1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 //
4 //  file:  rbbirb.cpp
5 //
6 //  Copyright (C) 2002-2011, International Business Machines Corporation and others.
7 //  All Rights Reserved.
8 //
9 //  This file contains the RBBIRuleBuilder class implementation.  This is the main class for
10 //    building (compiling) break rules into the tables required by the runtime
11 //    RBBI engine.
12 //
13 
14 #include "unicode/utypes.h"
15 
16 #if !UCONFIG_NO_BREAK_ITERATION
17 
18 #include "unicode/brkiter.h"
19 #include "unicode/rbbi.h"
20 #include "unicode/ubrk.h"
21 #include "unicode/unistr.h"
22 #include "unicode/uniset.h"
23 #include "unicode/uchar.h"
24 #include "unicode/uchriter.h"
25 #include "unicode/ustring.h"
26 #include "unicode/parsepos.h"
27 #include "unicode/parseerr.h"
28 
29 #include "cmemory.h"
30 #include "cstring.h"
31 #include "rbbirb.h"
32 #include "rbbinode.h"
33 #include "rbbiscan.h"
34 #include "rbbisetb.h"
35 #include "rbbitblb.h"
36 #include "rbbidata.h"
37 #include "uassert.h"
38 
39 
40 U_NAMESPACE_BEGIN
41 
42 
43 //----------------------------------------------------------------------------------------
44 //
45 //  Constructor.
46 //
47 //----------------------------------------------------------------------------------------
RBBIRuleBuilder(const UnicodeString & rules,UParseError * parseErr,UErrorCode & status)48 RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString   &rules,
49                                        UParseError     *parseErr,
50                                        UErrorCode      &status)
51  : fRules(rules), fStrippedRules(rules)
52 {
53     fStatus = &status; // status is checked below
54     fParseError = parseErr;
55     fDebugEnv   = NULL;
56 #ifdef RBBI_DEBUG
57     fDebugEnv   = getenv("U_RBBIDEBUG");
58 #endif
59 
60 
61     fForwardTree        = NULL;
62     fReverseTree        = NULL;
63     fSafeFwdTree        = NULL;
64     fSafeRevTree        = NULL;
65     fDefaultTree        = &fForwardTree;
66     fForwardTable       = NULL;
67     fRuleStatusVals     = NULL;
68     fChainRules         = FALSE;
69     fLBCMNoChain        = FALSE;
70     fLookAheadHardBreak = FALSE;
71     fUSetNodes          = NULL;
72     fRuleStatusVals     = NULL;
73     fScanner            = NULL;
74     fSetBuilder         = NULL;
75     if (parseErr) {
76         uprv_memset(parseErr, 0, sizeof(UParseError));
77     }
78 
79     if (U_FAILURE(status)) {
80         return;
81     }
82 
83     fUSetNodes          = new UVector(status); // bcos status gets overwritten here
84     fRuleStatusVals     = new UVector(status);
85     fScanner            = new RBBIRuleScanner(this);
86     fSetBuilder         = new RBBISetBuilder(this);
87     if (U_FAILURE(status)) {
88         return;
89     }
90     if(fSetBuilder == 0 || fScanner == 0 || fUSetNodes == 0 || fRuleStatusVals == 0) {
91         status = U_MEMORY_ALLOCATION_ERROR;
92     }
93 }
94 
95 
96 
97 //----------------------------------------------------------------------------------------
98 //
99 //  Destructor
100 //
101 //----------------------------------------------------------------------------------------
~RBBIRuleBuilder()102 RBBIRuleBuilder::~RBBIRuleBuilder() {
103 
104     int        i;
105     for (i=0; ; i++) {
106         RBBINode *n = (RBBINode *)fUSetNodes->elementAt(i);
107         if (n==NULL) {
108             break;
109         }
110         delete n;
111     }
112 
113     delete fUSetNodes;
114     delete fSetBuilder;
115     delete fForwardTable;
116     delete fForwardTree;
117     delete fReverseTree;
118     delete fSafeFwdTree;
119     delete fSafeRevTree;
120     delete fScanner;
121     delete fRuleStatusVals;
122 }
123 
124 
125 
126 
127 
128 //----------------------------------------------------------------------------------------
129 //
130 //   flattenData() -  Collect up the compiled RBBI rule data and put it into
131 //                    the format for saving in ICU data files,
132 //                    which is also the format needed by the RBBI runtime engine.
133 //
134 //----------------------------------------------------------------------------------------
align8(int32_t i)135 static int32_t align8(int32_t i) {return (i+7) & 0xfffffff8;}
136 
flattenData()137 RBBIDataHeader *RBBIRuleBuilder::flattenData() {
138     int32_t    i;
139 
140     if (U_FAILURE(*fStatus)) {
141         return NULL;
142     }
143 
144     // Remove whitespace from the rules to make it smaller.
145     // The rule parser has already removed comments.
146     fStrippedRules = fScanner->stripRules(fStrippedRules);
147 
148     // Calculate the size of each section in the data.
149     //   Sizes here are padded up to a multiple of 8 for better memory alignment.
150     //   Sections sizes actually stored in the header are for the actual data
151     //     without the padding.
152     //
153     int32_t headerSize        = align8(sizeof(RBBIDataHeader));
154     int32_t forwardTableSize  = align8(fForwardTable->getTableSize());
155     int32_t reverseTableSize  = align8(fForwardTable->getSafeTableSize());
156     int32_t trieSize          = align8(fSetBuilder->getTrieSize());
157     int32_t statusTableSize   = align8(fRuleStatusVals->size() * sizeof(int32_t));
158 
159     int32_t rulesLengthInUTF8 = 0;
160     u_strToUTF8WithSub(0, 0, &rulesLengthInUTF8,
161                        fStrippedRules.getBuffer(), fStrippedRules.length(),
162                        0xfffd, nullptr, fStatus);
163     *fStatus = U_ZERO_ERROR;
164 
165     int32_t rulesSize         = align8((rulesLengthInUTF8+1));
166 
167     int32_t         totalSize = headerSize
168                                 + forwardTableSize
169                                 + reverseTableSize
170                                 + statusTableSize + trieSize + rulesSize;
171 
172 #ifdef RBBI_DEBUG
173     if (fDebugEnv && uprv_strstr(fDebugEnv, "size")) {
174         RBBIDebugPrintf("Header Size:        %8d\n", headerSize);
175         RBBIDebugPrintf("Forward Table Size: %8d\n", forwardTableSize);
176         RBBIDebugPrintf("Reverse Table Size: %8d\n", reverseTableSize);
177         RBBIDebugPrintf("Trie Size:          %8d\n", trieSize);
178         RBBIDebugPrintf("Status Table Size:  %8d\n", statusTableSize);
179         RBBIDebugPrintf("Rules Size:         %8d\n", rulesSize);
180         RBBIDebugPrintf("-----------------------------\n");
181         RBBIDebugPrintf("Total Size:         %8d\n", totalSize);
182     }
183 #endif
184 
185     RBBIDataHeader  *data     = (RBBIDataHeader *)uprv_malloc(totalSize);
186     if (data == NULL) {
187         *fStatus = U_MEMORY_ALLOCATION_ERROR;
188         return NULL;
189     }
190     uprv_memset(data, 0, totalSize);
191 
192 
193     data->fMagic            = 0xb1a0;
194     data->fFormatVersion[0] = RBBI_DATA_FORMAT_VERSION[0];
195     data->fFormatVersion[1] = RBBI_DATA_FORMAT_VERSION[1];
196     data->fFormatVersion[2] = RBBI_DATA_FORMAT_VERSION[2];
197     data->fFormatVersion[3] = RBBI_DATA_FORMAT_VERSION[3];
198     data->fLength           = totalSize;
199     data->fCatCount         = fSetBuilder->getNumCharCategories();
200 
201     data->fFTable        = headerSize;
202     data->fFTableLen     = forwardTableSize;
203 
204     data->fRTable        = data->fFTable  + data->fFTableLen;
205     data->fRTableLen     = reverseTableSize;
206 
207     data->fTrie          = data->fRTable + data->fRTableLen;
208     data->fTrieLen       = trieSize;
209     data->fStatusTable   = data->fTrie    + data->fTrieLen;
210     data->fStatusTableLen= statusTableSize;
211     data->fRuleSource    = data->fStatusTable + statusTableSize;
212     data->fRuleSourceLen = rulesLengthInUTF8;
213 
214     uprv_memset(data->fReserved, 0, sizeof(data->fReserved));
215 
216     fForwardTable->exportTable((uint8_t *)data + data->fFTable);
217     fForwardTable->exportSafeTable((uint8_t *)data + data->fRTable);
218     fSetBuilder->serializeTrie ((uint8_t *)data + data->fTrie);
219 
220     int32_t *ruleStatusTable = (int32_t *)((uint8_t *)data + data->fStatusTable);
221     for (i=0; i<fRuleStatusVals->size(); i++) {
222         ruleStatusTable[i] = fRuleStatusVals->elementAti(i);
223     }
224 
225     u_strToUTF8WithSub((char *)data+data->fRuleSource, rulesSize, &rulesLengthInUTF8,
226                        fStrippedRules.getBuffer(), fStrippedRules.length(),
227                        0xfffd, nullptr, fStatus);
228     if (U_FAILURE(*fStatus)) {
229         return NULL;
230     }
231 
232     return data;
233 }
234 
235 
236 //----------------------------------------------------------------------------------------
237 //
238 //  createRuleBasedBreakIterator    construct from source rules that are passed in
239 //                                  in a UnicodeString
240 //
241 //----------------------------------------------------------------------------------------
242 BreakIterator *
createRuleBasedBreakIterator(const UnicodeString & rules,UParseError * parseError,UErrorCode & status)243 RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString    &rules,
244                                     UParseError      *parseError,
245                                     UErrorCode       &status)
246 {
247     //
248     // Read the input rules, generate a parse tree, symbol table,
249     // and list of all Unicode Sets referenced by the rules.
250     //
251     RBBIRuleBuilder  builder(rules, parseError, status);
252     if (U_FAILURE(status)) { // status checked here bcos build below doesn't
253         return NULL;
254     }
255 
256     RBBIDataHeader *data = builder.build(status);
257 
258     if (U_FAILURE(status)) {
259         return nullptr;
260     }
261 
262     //
263     //  Create a break iterator from the compiled rules.
264     //     (Identical to creation from stored pre-compiled rules)
265     //
266     // status is checked after init in construction.
267     RuleBasedBreakIterator *This = new RuleBasedBreakIterator(data, status);
268     if (U_FAILURE(status)) {
269         delete This;
270         This = NULL;
271     }
272     else if(This == NULL) { // test for NULL
273         status = U_MEMORY_ALLOCATION_ERROR;
274     }
275     return This;
276 }
277 
build(UErrorCode & status)278 RBBIDataHeader *RBBIRuleBuilder::build(UErrorCode &status) {
279     if (U_FAILURE(status)) {
280         return nullptr;
281     }
282 
283     fScanner->parse();
284     if (U_FAILURE(status)) {
285         return nullptr;
286     }
287 
288     //
289     // UnicodeSet processing.
290     //    Munge the Unicode Sets to create an initial set of character categories.
291     //
292     fSetBuilder->buildRanges();
293 
294     //
295     //   Generate the DFA state transition table.
296     //
297     fForwardTable = new RBBITableBuilder(this, &fForwardTree, status);
298     if (fForwardTable == nullptr) {
299         status = U_MEMORY_ALLOCATION_ERROR;
300         return nullptr;
301     }
302 
303     fForwardTable->buildForwardTable();
304 
305     // State table and character category optimization.
306     // Merge equivalent rows and columns.
307     // Note that this process alters the initial set of character categories,
308     // causing the representation of UnicodeSets in the parse tree to become invalid.
309 
310     optimizeTables();
311     fForwardTable->buildSafeReverseTable(status);
312 
313 
314 #ifdef RBBI_DEBUG
315     if (fDebugEnv && uprv_strstr(fDebugEnv, "states")) {
316         fForwardTable->printStates();
317         fForwardTable->printRuleStatusTable();
318         fForwardTable->printReverseTable();
319     }
320 #endif
321 
322     //    Generate the mapping tables (TRIE) from input code points to
323     //    the character categories.
324     //
325     fSetBuilder->buildTrie();
326 
327     //
328     //   Package up the compiled data into a memory image
329     //      in the run-time format.
330     //
331     RBBIDataHeader *data = flattenData(); // returns NULL if error
332     if (U_FAILURE(status)) {
333         return nullptr;
334     }
335     return data;
336 }
337 
optimizeTables()338 void RBBIRuleBuilder::optimizeTables() {
339     bool didSomething;
340     do {
341         didSomething = false;
342 
343         // Begin looking for duplicates with char class 3.
344         // Classes 0, 1 and 2 are special; they are unused, {bof} and {eof} respectively,
345         // and should not have other categories merged into them.
346         IntPair duplPair = {3, 0};
347         while (fForwardTable->findDuplCharClassFrom(&duplPair)) {
348             fSetBuilder->mergeCategories(duplPair);
349             fForwardTable->removeColumn(duplPair.second);
350             didSomething = true;
351         }
352 
353         while (fForwardTable->removeDuplicateStates() > 0) {
354             didSomething = true;
355         }
356     } while (didSomething);
357 }
358 
359 U_NAMESPACE_END
360 
361 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
362