1 // Copyright 2014 Google Inc. All Rights Reserved. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #ifndef CLD2_INTERNAL_CLD2_DYNAMIC_DATA_H_ 16 #define CLD2_INTERNAL_CLD2_DYNAMIC_DATA_H_ 17 18 #include "integral_types.h" 19 #include "cld2tablesummary.h" 20 #include "utf8statetable.h" 21 #include "scoreonescriptspan.h" 22 23 /* 24 There are two primary parts to a CLD2 dynamic data file: 25 1. A header, wherein trivial data, block lengths and block offsets are kept 26 2. A data block, wherein the large binary blocks are kept 27 28 By reading the header, an application can determine the offsets and lengths of 29 all the data blocks for all tables. Offsets in the header are expressed 30 relative to the first byte of the file, inclusive of the header itself; thus, 31 any offset whose value is less than the length of the header is invalid. 32 33 Any offset whose value is zero indicates a field that is null in the 34 underlying CLD2 data; a real example of this is the fast_state field of the 35 UTF8PropObj, which may be null. 36 37 The size of the header can be precalculated by calling calculateHeaderSize(), 38 which will indicate the exact size of the header for a data file that contains 39 a given number of CLD2TableSummary objects. 40 41 Notes on endianness: 42 The data format is only suitable for little-endian machines. 
For big-endian 43 systems, a tedious transformation would need to be made first to reverse the 44 byte order of significant portions of the binary - not just the lengths, but 45 also some of the underlying table data. 46 47 Note on 32/64 bit: 48 The data format is agnostic to 32/64 bit pointers. All the offsets within the 49 data blob itself are 32-bit values relative to the start of the file, and the 50 file should certainly never be gigabytes in size! 51 When the file is ultimately read by the loading code and mmap()'d, new 52 pointers are generated at whatever size the system uses, initialized to the 53 start of the mmap, and incremented by the 32-bit offset. This should be safe 54 regardless of 32- or 64-bit architectures. 55 56 -------------------------------------------------------------------- 57 FIELD 58 -------------------------------------------------------------------- 59 DATA_FILE_MARKER (no null terminator) 60 total file size (sanity check, uint32) 61 -------------------------------------------------------------------- 62 UTF8PropObj: const uint32 state0 63 UTF8PropObj: const uint32 state0_size 64 UTF8PropObj: const uint32 total_size 65 UTF8PropObj: const int max_expand 66 UTF8PropObj: const int entry_shift (coerced to 32 bits) 67 UTF8PropObj: const int bytes_per_entry (coerced to 32 bits) 68 UTF8PropObj: const uint32 losub 69 UTF8PropObj: const uint32 hiadd 70 offset of UTF8PropObj: const uint8* state_table 71 length of UTF8PropObj: const uint8* state_table 72 offset of UTF8PropObj: const RemapEntry* remap_base (4-byte struct) 73 length of UTF8PropObj: const RemapEntry* remap_base (4-byte struct) 74 offset of UTF8PropObj: const uint8* remap_string 75 length of UTF8PropObj: const uint8* remap_string 76 offset of UTF8PropObj: const uint8* fast_state 77 length of UTF8PropObj: const uint8* fast_state 78 -------------------------------------------------------------------- 79 start of const short kAvgDeltaOctaScore[] 80 length of const short 
kAvgDeltaOctaScore[] 81 -------------------------------------------------------------------- 82 number of CLD2TableSummary objects encoded (n) 83 [Table 1]: CLD2TableSummary: uint32 kCLDTableSizeOne 84 [Table 1]: CLD2TableSummary: uint32 kCLDTableSize 85 [Table 1]: CLD2TableSummary: uint32 kCLDTableKeyMask 86 [Table 1]: CLD2TableSummary: uint32 kCLDTableBuildDate 87 [Table 1]: offset of CLD2TableSummary: const IndirectProbBucket4* kCLDTable 88 [Table 1]: length of CLD2TableSummary: const IndirectProbBucket4* kCLDTable 89 [Table 1]: offset of CLD2TableSummary: const uint32* kCLDTableInd 90 [Table 1]: length of CLD2TableSummary: const uint32* kCLDTableInd 91 [Table 1]: offset of CLD2TableSummary: const char* kRecognizedLangScripts 92 [Table 1]: length of CLD2TableSummary: const char* kRecognizedLangScripts + 1 93 . 94 . 95 . 96 [Table n]: CLD2TableSummary: uint32 kCLDTableSizeOne 97 [Table n]: CLD2TableSummary: uint32 kCLDTableSize 98 [Table n]: CLD2TableSummary: uint32 kCLDTableKeyMask 99 [Table n]: CLD2TableSummary: uint32 kCLDTableBuildDate 100 [Table n]: offset of CLD2TableSummary: const IndirectProbBucket4* kCLDTable 101 [Table n]: length of CLD2TableSummary: const IndirectProbBucket4* kCLDTable 102 [Table n]: offset of CLD2TableSummary: const uint32* kCLDTableInd 103 [Table n]: length of CLD2TableSummary: const uint32* kCLDTableInd 104 [Table n]: offset of CLD2TableSummary: const char* kRecognizedLangScripts 105 [Table n]: length of CLD2TableSummary: const char* kRecognizedLangScripts + 1 106 -------------------------------------------------------------------- 107 108 109 Immediately after the header fields comes the data block. 
The data block has
the following content, in this order (note that padding is applied in order to
keep lookups word-aligned):

  UTF8PropObj: const uint8* state_table
  UTF8PropObj: const RemapEntry* remap_base (4-byte struct)
  UTF8PropObj: const uint8* remap_string
  UTF8PropObj: const uint8* fast_state
  const short kAvgDeltaOctaScore[]
  [Table 1]: CLD2TableSummary: const IndirectProbBucket4* kCLDTable
  [Table 1]: CLD2TableSummary: const uint32* kCLDTableInd
  [Table 1]: CLD2TableSummary: const char* kRecognizedLangScripts (with null terminator)
  .
  .
  .
  [Table n]: CLD2TableSummary: const IndirectProbBucket4* kCLDTable
  [Table n]: CLD2TableSummary: const uint32* kCLDTableInd
  [Table n]: CLD2TableSummary: const char* kRecognizedLangScripts (with null terminator)


It is STRONGLY recommended that the chunks within the data block be kept
128-bit aligned for efficiency reasons, although the code will work without
such alignment: the main lookup tables have randomly-accessed groups of four
4-byte entries, and these must be 16-byte aligned to avoid the performance
cost of multiple cache misses per group.
*/
namespace CLD2DynamicData {

// Marker string that forms the first field of the file header; it is stored
// in the file without its null terminator.
// Both the characters and the pointer itself are const, so no translation
// unit that includes this header can rebind its private copy of the marker.
static const char* const DATA_FILE_MARKER = "cld2_data_file00";
// Number of characters in DATA_FILE_MARKER, excluding the null terminator.
static const int DATA_FILE_MARKER_LENGTH = 16; // Keep aligned to 128 bits

// Nicer version of memcmp that shows the offset at which bytes differ.
// data1, data2: the two buffers to compare.
// length: how much of each buffer to compare.
// NOTE(review): presumably returns true when the buffers are identical over
// the compared range - confirm against the implementation.
bool mem_compare(const void* data1, const void* data2, const int length);

// Enable or disable debugging; 0 to disable, 1 to enable
void setDebug(int debug);

// Lower-level structure for individual tables. There are n table headers in
// a given file header.
148 typedef struct { 149 CLD2::uint32 kCLDTableSizeOne; 150 CLD2::uint32 kCLDTableSize; 151 CLD2::uint32 kCLDTableKeyMask; 152 CLD2::uint32 kCLDTableBuildDate; 153 CLD2::uint32 startOf_kCLDTable; 154 CLD2::uint32 lengthOf_kCLDTable; 155 CLD2::uint32 startOf_kCLDTableInd; 156 CLD2::uint32 lengthOf_kCLDTableInd; 157 CLD2::uint32 startOf_kRecognizedLangScripts; 158 CLD2::uint32 lengthOf_kRecognizedLangScripts; 159 } TableHeader; 160 161 162 // Top-level structure for a CLD2 Data File Header. 163 // Contains all the primitive fields for the header as well as an array of 164 // headers for the individual tables. 165 typedef struct { 166 // Marker fields help recognize and verify the data file 167 char sanityString[DATA_FILE_MARKER_LENGTH]; 168 CLD2::uint32 totalFileSizeBytes; 169 170 // UTF8 primitives 171 CLD2::uint32 utf8PropObj_state0; 172 CLD2::uint32 utf8PropObj_state0_size; 173 CLD2::uint32 utf8PropObj_total_size; 174 CLD2::uint32 utf8PropObj_max_expand; 175 CLD2::uint32 utf8PropObj_entry_shift; 176 CLD2::uint32 utf8PropObj_bytes_per_entry; 177 CLD2::uint32 utf8PropObj_losub; 178 CLD2::uint32 utf8PropObj_hiadd; 179 CLD2::uint32 startOf_utf8PropObj_state_table; 180 CLD2::uint32 lengthOf_utf8PropObj_state_table; 181 CLD2::uint32 startOf_utf8PropObj_remap_base; 182 CLD2::uint32 lengthOf_utf8PropObj_remap_base; 183 CLD2::uint32 startOf_utf8PropObj_remap_string; 184 CLD2::uint32 lengthOf_utf8PropObj_remap_string; 185 CLD2::uint32 startOf_utf8PropObj_fast_state; 186 CLD2::uint32 lengthOf_utf8PropObj_fast_state; 187 188 // Average delta-octa-score bits 189 CLD2::uint32 startOf_kAvgDeltaOctaScore; 190 CLD2::uint32 lengthOf_kAvgDeltaOctaScore; 191 192 // Table bits 193 CLD2::uint32 numTablesEncoded; 194 TableHeader* tableHeaders; 195 } FileHeader; 196 197 // Calculate the exact size of a header that encodes the specified number of 198 // tables. This can be used to reserve space within the data file, 199 // calculate offsets, and so on. 
CLD2::uint32 calculateHeaderSize(CLD2::uint32 numTables);

// Dump a given header to stdout as a human-readable string.
void dumpHeader(FileHeader* header);

// Verify that a given pair of scoring tables match precisely.
// realData: the reference tables; loadedData: the tables to check against them.
// The result is a bool, not an error message.
// NOTE(review): presumably true means the tables match and false means a
// problem was found - confirm against the implementation.
bool verify(const CLD2::ScoringTables* realData, const CLD2::ScoringTables* loadedData);

// Return true iff the program is running in little-endian mode.
// (The data format described at the top of this file is only suitable for
// little-endian machines.)
bool isLittleEndian();

// Return true iff the core size assumptions are ok on this platform.
bool coreAssumptionsOk();

} // End namespace CLD2DynamicData
#endif // CLD2_INTERNAL_CLD2_DYNAMIC_DATA_H_