// Copyright 2010-2018, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 30 #ifndef MOZC_DATA_MANAGER_DATA_MANAGER_H_ 31 #define MOZC_DATA_MANAGER_DATA_MANAGER_H_ 32 33 #include <iosfwd> 34 #include <string> 35 #include <utility> 36 #include <vector> 37 38 #include "base/mmap.h" 39 #include "base/port.h" 40 #include "base/string_piece.h" 41 #include "data_manager/data_manager_interface.h" 42 43 namespace mozc { 44 45 class DataSetReader; // Forward-declare this as it is used privately. 46 47 // This data manager parses a data set file image and extracts each data 48 // (dictionary, LM, etc.). 49 // TODO(noriyukit): Migrate all the embedded data managers, such as 50 // oss/oss_data_manager.h, to this one. 51 class DataManager : public DataManagerInterface { 52 public: 53 // Return status for initialization. 54 enum class Status { 55 OK = 0, 56 ENGINE_VERSION_MISMATCH = 1, 57 DATA_MISSING = 2, 58 DATA_BROKEN = 3, 59 MMAP_FAILURE = 4, 60 UNKNOWN = 5, 61 }; 62 63 static string StatusCodeToString(Status code); 64 65 DataManager(); 66 ~DataManager() override; 67 68 // Parses |array| and extracts byte blocks of data set. The |array| must 69 // outlive this instance. The second version specifies a custom magic number 70 // to expect (e.g., mock data set has a different magic number). 71 Status InitFromArray(StringPiece array); 72 Status InitFromArray(StringPiece array, StringPiece magic); 73 74 // The same as above InitFromArray() but the data is loaded using mmap, which 75 // is owned in this instance. 76 Status InitFromFile(const string &path); 77 Status InitFromFile(const string &path, StringPiece magic); 78 79 // The same as above InitFromArray() but only parses data set for user pos 80 // manager. For mozc runtime modules, use InitFromArray() because this method 81 // is only for build tools, e.g., rewriter/dictionary_generator.cc (some build 82 // tools depend on user pos data to create outputs, so we need to handle 83 // partial data set). 
84 Status InitUserPosManagerDataFromArray(StringPiece array, StringPiece magic); 85 Status InitUserPosManagerDataFromFile(const string &path, StringPiece magic); 86 87 // Implementation of DataManagerInterface. 88 const uint16 *GetPOSMatcherData() const override; 89 void GetUserPOSData(StringPiece *token_array_data, 90 StringPiece *string_array_data) const override; 91 void GetConnectorData(const char **data, size_t *size) const override; 92 void GetSystemDictionaryData(const char **data, int *size) const override; 93 void GetCollocationData(const char **array, size_t *size) const override; 94 void GetCollocationSuppressionData(const char **array, 95 size_t *size) const override; 96 void GetSuggestionFilterData(const char **data, size_t *size) const override; 97 const uint8 *GetPosGroupData() const override; 98 void GetSegmenterData(size_t *l_num_elements, size_t *r_num_elements, 99 const uint16 **l_table, const uint16 **r_table, 100 size_t *bitarray_num_bytes, const char **bitarray_data, 101 const uint16 **boundary_data) const override; 102 void GetCounterSuffixSortedArray(const char **array, 103 size_t *size) const override; 104 void GetSuffixDictionaryData(StringPiece *key_array_data, 105 StringPiece *value_array_data, 106 const uint32 **token_array) const override; 107 void GetReadingCorrectionData( 108 StringPiece *value_array_data, StringPiece *error_array_data, 109 StringPiece *correction_array_data) const override; 110 void GetSymbolRewriterData(StringPiece *token_array_data, 111 StringPiece *string_array_data) const override; 112 void GetEmoticonRewriterData(StringPiece *token_array_data, 113 StringPiece *string_array_data) const override; 114 void GetEmojiRewriterData(StringPiece *token_array_data, 115 StringPiece *string_array_data) const override; 116 void GetSingleKanjiRewriterData( 117 StringPiece *token_array_data, 118 StringPiece *string_array_data, 119 StringPiece *variant_type_array_data, 120 StringPiece *variant_token_array_data, 121 StringPiece 
*variant_string_array_data, 122 StringPiece *noun_prefix_token_array_data, 123 StringPiece *noun_prefix_string_array_data) const override; 124 void GetZeroQueryData( 125 StringPiece *zero_query_token_array_data, 126 StringPiece *zero_query_string_array_data, 127 StringPiece *zero_query_number_token_array_data, 128 StringPiece *zero_query_number_string_array_data) const override; 129 130 #ifndef NO_USAGE_REWRITER 131 void GetUsageRewriterData( 132 StringPiece *base_conjugation_suffix_data, 133 StringPiece *conjugation_suffix_data, 134 StringPiece *conjugation_index_data, 135 StringPiece *usage_items_data, 136 StringPiece *string_array_data) const override; 137 #endif // NO_USAGE_REWRITER 138 139 StringPiece GetTypingModel(const string &name) const override; 140 StringPiece GetDataVersion() const override; 141 142 private: 143 Status InitFromReader(const DataSetReader &reader); 144 145 Mmap mmap_; 146 StringPiece pos_matcher_data_; 147 StringPiece user_pos_token_array_data_; 148 StringPiece user_pos_string_array_data_; 149 StringPiece connection_data_; 150 StringPiece dictionary_data_; 151 StringPiece suggestion_filter_data_; 152 StringPiece collocation_data_; 153 StringPiece collocation_suppression_data_; 154 StringPiece pos_group_data_; 155 StringPiece boundary_data_; 156 size_t segmenter_compressed_lsize_; 157 size_t segmenter_compressed_rsize_; 158 StringPiece segmenter_ltable_; 159 StringPiece segmenter_rtable_; 160 StringPiece segmenter_bitarray_; 161 StringPiece counter_suffix_data_; 162 StringPiece suffix_key_array_data_; 163 StringPiece suffix_value_array_data_; 164 StringPiece suffix_token_array_data_; 165 StringPiece reading_correction_value_array_data_; 166 StringPiece reading_correction_error_array_data_; 167 StringPiece reading_correction_correction_array_data_; 168 StringPiece symbol_token_array_data_; 169 StringPiece symbol_string_array_data_; 170 StringPiece emoticon_token_array_data_; 171 StringPiece emoticon_string_array_data_; 172 StringPiece 
emoji_token_array_data_; 173 StringPiece emoji_string_array_data_; 174 StringPiece single_kanji_token_array_data_; 175 StringPiece single_kanji_string_array_data_; 176 StringPiece single_kanji_variant_type_data_; 177 StringPiece single_kanji_variant_token_array_data_; 178 StringPiece single_kanji_variant_string_array_data_; 179 StringPiece single_kanji_noun_prefix_token_array_data_; 180 StringPiece single_kanji_noun_prefix_string_array_data_; 181 StringPiece zero_query_token_array_data_; 182 StringPiece zero_query_string_array_data_; 183 StringPiece zero_query_number_token_array_data_; 184 StringPiece zero_query_number_string_array_data_; 185 StringPiece usage_base_conjugation_suffix_data_; 186 StringPiece usage_conjugation_suffix_data_; 187 StringPiece usage_conjugation_index_data_; 188 StringPiece usage_items_data_; 189 StringPiece usage_string_array_data_; 190 std::vector<std::pair<string, StringPiece>> typing_model_data_; 191 StringPiece data_version_; 192 193 DISALLOW_COPY_AND_ASSIGN(DataManager); 194 }; 195 196 // Print helper for DataManager::Status. Logging, e.g., CHECK_EQ(), requires 197 // arguments to be printable. 198 std::ostream &operator<<(std::ostream &os, DataManager::Status status); 199 200 } // namespace mozc 201 202 #endif // MOZC_DATA_MANAGER_DATA_MANAGER_H_ 203