1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * 6 * Copyright (C) 1999-2014 International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ******************************************************************************* 10 * file name: rbbidata.h 11 * encoding: UTF-8 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * RBBI data formats Includes 16 * 17 * Structs that describes the format of the Binary RBBI data, 18 * as it is stored in ICU's data file. 19 * 20 * RBBIDataWrapper - Instances of this class sit between the 21 * raw data structs and the RulesBasedBreakIterator objects 22 * that are created by applications. The wrapper class 23 * provides reference counting for the underlying data, 24 * and direct pointers to data that would not otherwise 25 * be accessible without ugly pointer arithmetic. The 26 * wrapper does not attempt to provide any higher level 27 * abstractions for the data itself. 28 * 29 * There will be only one instance of RBBIDataWrapper for any 30 * set of RBBI run time data being shared by instances 31 * (clones) of RulesBasedBreakIterator. 32 */ 33 34 #ifndef __RBBIDATA_H__ 35 #define __RBBIDATA_H__ 36 37 #include "unicode/utypes.h" 38 #include "unicode/udata.h" 39 #include "udataswp.h" 40 41 /** 42 * Swap RBBI data. See udataswp.h. 43 * @internal 44 */ 45 U_CAPI int32_t U_EXPORT2 46 ubrk_swap(const UDataSwapper *ds, 47 const void *inData, int32_t length, void *outData, 48 UErrorCode *pErrorCode); 49 50 #ifdef __cplusplus 51 52 #include "unicode/ucptrie.h" 53 #include "unicode/uobject.h" 54 #include "unicode/unistr.h" 55 #include "unicode/uversion.h" 56 #include "umutex.h" 57 58 59 U_NAMESPACE_BEGIN 60 61 // The current RBBI data format version. 62 static const uint8_t RBBI_DATA_FORMAT_VERSION[] = {6, 0, 0, 0}; 63 64 /* 65 * The following structs map exactly onto the raw data from ICU common data file. 66 */ 67 struct RBBIDataHeader { 68 uint32_t fMagic; /* == 0xbla0 */ 69 UVersionInfo fFormatVersion; /* Data Format. Same as the value in struct UDataInfo */ 70 /* if there is one associated with this data. */ 71 /* (version originates in rbbi, is copied to UDataInfo) */ 72 uint32_t fLength; /* Total length in bytes of this RBBI Data, */ 73 /* including all sections, not just the header. */ 74 uint32_t fCatCount; /* Number of character categories. */ 75 76 /* */ 77 /* Offsets and sizes of each of the subsections within the RBBI data. */ 78 /* All offsets are bytes from the start of the RBBIDataHeader. */ 79 /* All sizes are in bytes. */ 80 /* */ 81 uint32_t fFTable; /* forward state transition table. */ 82 uint32_t fFTableLen; 83 uint32_t fRTable; /* Offset to the reverse state transition table. */ 84 uint32_t fRTableLen; 85 uint32_t fTrie; /* Offset to Trie data for character categories */ 86 uint32_t fTrieLen; 87 uint32_t fRuleSource; /* Offset to the source for for the break */ 88 uint32_t fRuleSourceLen; /* rules. Stored UChar *. */ 89 uint32_t fStatusTable; /* Offset to the table of rule status values */ 90 uint32_t fStatusTableLen; 91 92 uint32_t fReserved[6]; /* Reserved for expansion */ 93 94 }; 95 96 97 98 template <typename T> 99 struct RBBIStateTableRowT { 100 T fAccepting; // Non-zero if this row is for an accepting state. 101 // Value 0: not an accepting state. 102 // 1: (ACCEPTING_UNCONDITIONAL) Unconditional Accepting state. 103 // >1: Look-ahead match has completed. 104 // Actual boundary position happened earlier. 105 // Value here == fLookAhead in earlier 106 // state, at actual boundary pos. 107 T fLookAhead; // Non-zero if this row is for a state that 108 // corresponds to a '/' in the rule source. 109 // Value is the same as the fAccepting 110 // value for the rule (which will appear 111 // in a different state. 112 T fTagsIdx; // Non-zero if this row covers a {tagged} position 113 // from a rule. Value is the index in the 114 // StatusTable of the set of matching 115 // tags (rule status values) 116 T fNextState[1]; // Next State, indexed by char category. 117 // Variable-length array declared with length 1 118 // to disable bounds checkers. 119 // Array Size is actually fData->fHeader->fCatCount 120 // CAUTION: see RBBITableBuilder::getTableSize() 121 // before changing anything here. 122 }; 123 124 typedef RBBIStateTableRowT<uint8_t> RBBIStateTableRow8; 125 typedef RBBIStateTableRowT<uint16_t> RBBIStateTableRow16; 126 127 constexpr uint16_t ACCEPTING_UNCONDITIONAL = 1; // Value constant for RBBIStateTableRow::fAccepting 128 129 union RBBIStateTableRow { 130 RBBIStateTableRow16 r16; 131 RBBIStateTableRow8 r8; 132 }; 133 134 struct RBBIStateTable { 135 uint32_t fNumStates; // Number of states. 136 uint32_t fRowLen; // Length of a state table row, in bytes. 137 uint32_t fDictCategoriesStart; // Char category number of the first dictionary 138 // char class, or the the largest category number + 1 139 // if there are no dictionary categories. 140 uint32_t fLookAheadResultsSize; // Size of run-time array required for holding 141 // look-ahead results. Indexed by row.fLookAhead. 142 uint32_t fFlags; // Option Flags for this state table. 143 char fTableData[1]; // First RBBIStateTableRow begins here. 144 // Variable-length array declared with length 1 145 // to disable bounds checkers. 146 // (making it char[] simplifies ugly address 147 // arithmetic for indexing variable length rows.) 148 }; 149 150 constexpr uint32_t RBBI_LOOKAHEAD_HARD_BREAK = 1; 151 constexpr uint32_t RBBI_BOF_REQUIRED = 2; 152 constexpr uint32_t RBBI_8BITS_ROWS = 4; 153 154 155 /* */ 156 /* The reference counting wrapper class */ 157 /* */ 158 class RBBIDataWrapper : public UMemory { 159 public: 160 enum EDontAdopt { 161 kDontAdopt 162 }; 163 RBBIDataWrapper(const RBBIDataHeader *data, UErrorCode &status); 164 RBBIDataWrapper(const RBBIDataHeader *data, enum EDontAdopt dontAdopt, UErrorCode &status); 165 RBBIDataWrapper(UDataMemory* udm, UErrorCode &status); 166 ~RBBIDataWrapper(); 167 168 static UBool isDataVersionAcceptable(const UVersionInfo version); 169 170 void init0(); 171 void init(const RBBIDataHeader *data, UErrorCode &status); 172 RBBIDataWrapper *addReference(); 173 void removeReference(); 174 UBool operator ==(const RBBIDataWrapper &other) const; 175 int32_t hashCode(); 176 const UnicodeString &getRuleSourceString() const; 177 void printData(); 178 void printTable(const char *heading, const RBBIStateTable *table); 179 180 /* */ 181 /* Pointers to items within the data */ 182 /* */ 183 const RBBIDataHeader *fHeader; 184 const RBBIStateTable *fForwardTable; 185 const RBBIStateTable *fReverseTable; 186 const char *fRuleSource; 187 const int32_t *fRuleStatusTable; 188 189 /* number of int32_t values in the rule status table. Used to sanity check indexing */ 190 int32_t fStatusMaxIdx; 191 192 UCPTrie *fTrie; 193 194 private: 195 u_atomic_int32_t fRefCount; 196 UDataMemory *fUDataMem; 197 UnicodeString fRuleString; 198 UBool fDontFreeData; 199 200 RBBIDataWrapper(const RBBIDataWrapper &other) = delete; /* forbid copying of this class */ 201 RBBIDataWrapper &operator=(const RBBIDataWrapper &other) = delete; /* forbid copying of this class */ 202 }; 203 204 205 206 U_NAMESPACE_END 207 208 U_CFUNC UBool rbbi_cleanup(void); 209 210 #endif /* C++ */ 211 212 #endif 213