1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 *   Copyright (C) 1999-2014 International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *******************************************************************************
10 *   file name:  rbbidata.h
11 *   encoding:   UTF-8
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   RBBI data formats  Includes
16 *
*                          Structs that describe the format of the Binary RBBI data,
18 *                          as it is stored in ICU's data file.
19 *
20 *      RBBIDataWrapper  -  Instances of this class sit between the
21 *                          raw data structs and the RulesBasedBreakIterator objects
22 *                          that are created by applications.  The wrapper class
23 *                          provides reference counting for the underlying data,
24 *                          and direct pointers to data that would not otherwise
25 *                          be accessible without ugly pointer arithmetic.  The
26 *                          wrapper does not attempt to provide any higher level
27 *                          abstractions for the data itself.
28 *
29 *                          There will be only one instance of RBBIDataWrapper for any
30 *                          set of RBBI run time data being shared by instances
31 *                          (clones) of RulesBasedBreakIterator.
32 */
33 
34 #ifndef __RBBIDATA_H__
35 #define __RBBIDATA_H__
36 
37 #include "unicode/utypes.h"
38 #include "unicode/udata.h"
39 #include "udataswp.h"
40 
41 /**
42  * Swap RBBI data. See udataswp.h.
43  * @internal
44  */
45 U_CAPI int32_t U_EXPORT2
46 ubrk_swap(const UDataSwapper *ds,
47           const void *inData, int32_t length, void *outData,
48           UErrorCode *pErrorCode);
49 
50 #ifdef __cplusplus
51 
52 #include "unicode/ucptrie.h"
53 #include "unicode/uobject.h"
54 #include "unicode/unistr.h"
55 #include "unicode/uversion.h"
56 #include "umutex.h"
57 
58 
59 U_NAMESPACE_BEGIN
60 
// Format version of the RBBI binary data that this header describes,
// expressed as four uint8_t components.
static const uint8_t RBBI_DATA_FORMAT_VERSION[] = {
    6, 0, 0, 0
};
63 
64 /*
 *   The following structs map exactly onto the raw data from the ICU common data file.
66  */
67 struct RBBIDataHeader {
68     uint32_t         fMagic;           /*  == 0xbla0                                               */
69     UVersionInfo     fFormatVersion;   /* Data Format.  Same as the value in struct UDataInfo      */
70                                        /*   if there is one associated with this data.             */
71                                        /*     (version originates in rbbi, is copied to UDataInfo) */
72     uint32_t         fLength;          /*  Total length in bytes of this RBBI Data,                */
73                                        /*      including all sections, not just the header.        */
74     uint32_t         fCatCount;        /*  Number of character categories.                         */
75 
76     /*                                                                        */
77     /*  Offsets and sizes of each of the subsections within the RBBI data.    */
78     /*  All offsets are bytes from the start of the RBBIDataHeader.           */
79     /*  All sizes are in bytes.                                               */
80     /*                                                                        */
81     uint32_t         fFTable;         /*  forward state transition table. */
82     uint32_t         fFTableLen;
83     uint32_t         fRTable;         /*  Offset to the reverse state transition table. */
84     uint32_t         fRTableLen;
85     uint32_t         fTrie;           /*  Offset to Trie data for character categories */
86     uint32_t         fTrieLen;
87     uint32_t         fRuleSource;     /*  Offset to the source for for the break */
88     uint32_t         fRuleSourceLen;  /*    rules.  Stored UChar *. */
89     uint32_t         fStatusTable;    /* Offset to the table of rule status values */
90     uint32_t         fStatusTableLen;
91 
92     uint32_t         fReserved[6];    /*  Reserved for expansion */
93 
94 };
95 
96 
97 
// One row of a state table. T is the integer type of every element in the
// row (see the RBBIStateTableRow8 / RBBIStateTableRow16 instantiations).
template <class T>
struct RBBIStateTableRowT {
    // Non-zero if this row is for an accepting state.
    //   Value 0: not an accepting state.
    //         1: (ACCEPTING_UNCONDITIONAL) Unconditional accepting state.
    //        >1: Look-ahead match has completed.
    //            The actual boundary position happened earlier.
    //            The value here == fLookAhead in the earlier state,
    //            at the actual boundary position.
    T fAccepting;

    // Non-zero if this row is for a state that corresponds to a '/' in the
    // rule source. The value is the same as the fAccepting value for the
    // rule (which will appear in a different state).
    T fLookAhead;

    // Non-zero if this row covers a {tagged} position from a rule.
    // The value is the index in the StatusTable of the set of matching
    // tags (rule status values).
    T fTagsIdx;

    // Next state, indexed by char category.
    // Variable-length array, declared with length 1 to disable bounds
    // checkers. The array size is actually fData->fHeader->fCatCount.
    // CAUTION: see RBBITableBuilder::getTableSize() before changing
    //          anything here.
    T fNextState[1];
};
123 
124 typedef RBBIStateTableRowT<uint8_t> RBBIStateTableRow8;
125 typedef RBBIStateTableRowT<uint16_t> RBBIStateTableRow16;
126 
// Value constant for RBBIStateTableRow::fAccepting:
// marks a row as an unconditional accepting state.
constexpr uint16_t ACCEPTING_UNCONDITIONAL{1};
128 
129 union RBBIStateTableRow {
130   RBBIStateTableRow16 r16;
131   RBBIStateTableRow8 r8;
132 };
133 
// Fixed-size prefix of a state table, followed directly by the row data.
struct RBBIStateTable {
    // Number of states.
    uint32_t fNumStates;

    // Length of a state table row, in bytes.
    uint32_t fRowLen;

    // Char category number of the first dictionary char class,
    // or the largest category number + 1 if there are no dictionary
    // categories.
    uint32_t fDictCategoriesStart;

    // Size of the run-time array required for holding look-ahead results.
    // That array is indexed by row.fLookAhead.
    uint32_t fLookAheadResultsSize;

    // Option flags for this state table.
    uint32_t fFlags;

    // The first RBBIStateTableRow begins here.
    // Variable-length array, declared with length 1 to disable bounds
    // checkers. (Making it char[] simplifies the ugly address arithmetic
    // needed for indexing variable-length rows.)
    char fTableData[1];
};
149 
// Option flag bits; each constant is a distinct power of two.
// NOTE(review): presumably these are the bits stored in
// RBBIStateTable::fFlags -- confirm against the code that tests them.
constexpr uint32_t RBBI_LOOKAHEAD_HARD_BREAK = 1u << 0;
constexpr uint32_t RBBI_BOF_REQUIRED         = 1u << 1;
constexpr uint32_t RBBI_8BITS_ROWS           = 1u << 2;
153 
154 
155 /*                                        */
156 /*   The reference counting wrapper class */
157 /*                                        */
158 class RBBIDataWrapper : public UMemory {
159 public:
160     enum EDontAdopt {
161         kDontAdopt
162     };
163     RBBIDataWrapper(const RBBIDataHeader *data, UErrorCode &status);
164     RBBIDataWrapper(const RBBIDataHeader *data, enum EDontAdopt dontAdopt, UErrorCode &status);
165     RBBIDataWrapper(UDataMemory* udm, UErrorCode &status);
166     ~RBBIDataWrapper();
167 
168     static UBool          isDataVersionAcceptable(const UVersionInfo version);
169 
170     void                  init0();
171     void                  init(const RBBIDataHeader *data, UErrorCode &status);
172     RBBIDataWrapper      *addReference();
173     void                  removeReference();
174     UBool                 operator ==(const RBBIDataWrapper &other) const;
175     int32_t               hashCode();
176     const UnicodeString  &getRuleSourceString() const;
177     void                  printData();
178     void                  printTable(const char *heading, const RBBIStateTable *table);
179 
180     /*                                     */
181     /*   Pointers to items within the data */
182     /*                                     */
183     const RBBIDataHeader     *fHeader;
184     const RBBIStateTable     *fForwardTable;
185     const RBBIStateTable     *fReverseTable;
186     const char               *fRuleSource;
187     const int32_t            *fRuleStatusTable;
188 
189     /* number of int32_t values in the rule status table.   Used to sanity check indexing */
190     int32_t             fStatusMaxIdx;
191 
192     UCPTrie             *fTrie;
193 
194 private:
195     u_atomic_int32_t    fRefCount;
196     UDataMemory        *fUDataMem;
197     UnicodeString       fRuleString;
198     UBool               fDontFreeData;
199 
200     RBBIDataWrapper(const RBBIDataWrapper &other) = delete; /*  forbid copying of this class */
201     RBBIDataWrapper &operator=(const RBBIDataWrapper &other) = delete; /*  forbid copying of this class */
202 };
203 
204 
205 
206 U_NAMESPACE_END
207 
// Cleanup entry point for this module. It is declared after U_NAMESPACE_END,
// i.e. outside the ICU namespace, using the U_CFUNC linkage macro.
// NOTE(review): the definition, what it releases, and the meaning of the
// UBool result are not visible in this header -- confirm in the
// implementation file.
U_CFUNC UBool rbbi_cleanup(void);
209 
210 #endif /* C++ */
211 
212 #endif
213