1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 //
4 //  rbbiscan.h
5 //
6 //  Copyright (C) 2002-2016, International Business Machines Corporation and others.
7 //  All Rights Reserved.
8 //
9 //  This file contains declarations for class RBBIRuleScanner
10 //
11 
12 
13 #ifndef RBBISCAN_H
14 #define RBBISCAN_H
15 
16 #include "unicode/utypes.h"
17 #include "unicode/uobject.h"
18 #include "unicode/rbbi.h"
19 #include "unicode/uniset.h"
20 #include "unicode/parseerr.h"
21 #include "uhash.h"
22 #include "uvector.h"
23 #include "unicode/symtable.h"// For UnicodeSet parsing, is the interface that
24                           //    looks up references to $variables within a set.
25 #include "rbbinode.h"
26 #include "rbbirpt.h"
27 
28 U_NAMESPACE_BEGIN
29 
30 class   RBBIRuleBuilder;
31 class   RBBISymbolTable;
32 
33 
34 //--------------------------------------------------------------------------------
35 //
36 //  class RBBIRuleScanner does the lowest level, character-at-a-time
37 //                        scanning of break iterator rules.
38 //
39 //                        The output of the scanner is parse trees for
40 //                        the rule expressions and a list of all Unicode Sets
41 //                        encountered.
42 //
43 //--------------------------------------------------------------------------------
44 
45 class RBBIRuleScanner : public UMemory {
46 public:
47 
48     enum {
49         kStackSize = 100            // The size of the state stack for
50     };                              //   rules parsing.  Corresponds roughly
51                                     //   to the depth of parentheses nesting
52                                     //   that is allowed in the rules.
53 
54     struct RBBIRuleChar {
55         UChar32             fChar;
56         UBool               fEscaped;
RBBIRuleCharRBBIRuleChar57         RBBIRuleChar() : fChar(0), fEscaped(false) {}
58     };
59 
60     RBBIRuleScanner(RBBIRuleBuilder  *rb);
61 
62 
63     virtual    ~RBBIRuleScanner();
64 
65     void        nextChar(RBBIRuleChar &c);          // Get the next char from the input stream.
66                                                     // Return false if at end.
67 
68     UBool       push(const RBBIRuleChar &c);        // Push (unget) one character.
69                                                     //   Only a single character may be pushed.
70 
71     void        parse();                            // Parse the rules, generating two parse
72                                                     //   trees, one each for the forward and
73                                                     //   reverse rules,
74                                                     //   and a list of UnicodeSets encountered.
75 
76     int32_t     numRules();                         // Return the number of rules that have been seen.
77 
78     /**
79      * Return a rules string without unnecessary
80      * characters.
81      */
82     static UnicodeString stripRules(const UnicodeString &rules);
83 private:
84 
85     UBool       doParseActions(int32_t a);
86     void        error(UErrorCode e);                   // error reporting convenience function.
87     void        fixOpStack(RBBINode::OpPrecedence p);
88                                                        //   a character.
89     void        findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *setToAdopt = NULL);
90 
91     UChar32     nextCharLL();
92 #ifdef RBBI_DEBUG
93     void        printNodeStack(const char *title);
94 #endif
95     RBBINode    *pushNewNode(RBBINode::NodeType  t);
96     void        scanSet();
97 
98 
99     RBBIRuleBuilder               *fRB;              // The rule builder that we are part of.
100 
101     int32_t                       fScanIndex;        // Index of current character being processed
102                                                      //   in the rule input string.
103     int32_t                       fNextIndex;        // Index of the next character, which
104                                                      //   is the first character not yet scanned.
105     UBool                         fQuoteMode;        // Scan is in a 'quoted region'
106     int32_t                       fLineNum;          // Line number in input file.
107     int32_t                       fCharNum;          // Char position within the line.
108     UChar32                       fLastChar;         // Previous char, needed to count CR-LF
109                                                      //   as a single line, not two.
110 
111     RBBIRuleChar                  fC;                // Current char for parse state machine
112                                                      //   processing.
113     UnicodeString                 fVarName;          // $variableName, valid when we've just
114                                                      //   scanned one.
115 
116     RBBIRuleTableEl               **fStateTable;     // State Transition Table for RBBI Rule
117                                                      //   parsing.  index by p[state][char-class]
118 
119     uint16_t                      fStack[kStackSize];  // State stack, holds state pushes
120     int32_t                       fStackPtr;           //  and pops as specified in the state
121                                                        //  transition rules.
122 
123     RBBINode                      *fNodeStack[kStackSize]; // Node stack, holds nodes created
124                                                            //  during the parse of a rule
125     int32_t                        fNodeStackPtr;
126 
127 
128     UBool                          fReverseRule;     // True if the rule currently being scanned
129                                                      //  is a reverse direction rule (if it
130                                                      //  starts with a '!')
131 
132     UBool                          fLookAheadRule;   // True if the rule includes a '/'
133                                                      //   somewhere within it.
134 
135     UBool                          fNoChainInRule;   // True if the current rule starts with a '^'.
136 
137     RBBISymbolTable               *fSymbolTable;     // symbol table, holds definitions of
138                                                      //   $variable symbols.
139 
140     UHashtable                    *fSetTable;        // UnicocodeSet hash table, holds indexes to
141                                                      //   the sets created while parsing rules.
142                                                      //   The key is the string used for creating
143                                                      //   the set.
144 
145     UnicodeSet                     fRuleSets[10];    // Unicode Sets that are needed during
146                                                      //  the scanning of RBBI rules.  The
147                                                      //  indicies for these are assigned by the
148                                                      //  perl script that builds the state tables.
149                                                      //  See rbbirpt.h.
150 
151     int32_t                        fRuleNum;         // Counts each rule as it is scanned.
152 
153     int32_t                        fOptionStart;     // Input index of start of a !!option
154                                                      //   keyword, while being scanned.
155 
156     UnicodeSet *gRuleSet_rule_char;
157     UnicodeSet *gRuleSet_white_space;
158     UnicodeSet *gRuleSet_name_char;
159     UnicodeSet *gRuleSet_name_start_char;
160 
161     RBBIRuleScanner(const RBBIRuleScanner &other); // forbid copying of this class
162     RBBIRuleScanner &operator=(const RBBIRuleScanner &other); // forbid copying of this class
163 };
164 
165 U_NAMESPACE_END
166 
167 #endif
168