1 // Copyright 2010-2018, Google Inc.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
6 // met:
7 //
8 //     * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 //     * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following disclaimer
12 // in the documentation and/or other materials provided with the
13 // distribution.
14 //     * Neither the name of Google Inc. nor the names of its
15 // contributors may be used to endorse or promote products derived from
16 // this software without specific prior written permission.
17 //
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 
30 #ifndef MOZC_DATA_MANAGER_DATA_MANAGER_H_
31 #define MOZC_DATA_MANAGER_DATA_MANAGER_H_
32 
33 #include <iosfwd>
34 #include <string>
35 #include <utility>
36 #include <vector>
37 
38 #include "base/mmap.h"
39 #include "base/port.h"
40 #include "base/string_piece.h"
41 #include "data_manager/data_manager_interface.h"
42 
43 namespace mozc {
44 
// Forward declaration: in this header DataSetReader only appears as a
// const-reference parameter of a private member function of DataManager, so
// its full definition is not needed here.
class DataSetReader;
46 
47 // This data manager parses a data set file image and extracts each data
48 // (dictionary, LM, etc.).
49 // TODO(noriyukit): Migrate all the embedded data managers, such as
50 // oss/oss_data_manager.h, to this one.
51 class DataManager : public DataManagerInterface {
52  public:
53   // Return status for initialization.
54   enum class Status {
55     OK = 0,
56     ENGINE_VERSION_MISMATCH = 1,
57     DATA_MISSING = 2,
58     DATA_BROKEN = 3,
59     MMAP_FAILURE = 4,
60     UNKNOWN = 5,
61   };
62 
63   static string StatusCodeToString(Status code);
64 
65   DataManager();
66   ~DataManager() override;
67 
68   // Parses |array| and extracts byte blocks of data set.  The |array| must
69   // outlive this instance.  The second version specifies a custom magic number
70   // to expect (e.g., mock data set has a different magic number).
71   Status InitFromArray(StringPiece array);
72   Status InitFromArray(StringPiece array, StringPiece magic);
73 
74   // The same as above InitFromArray() but the data is loaded using mmap, which
75   // is owned in this instance.
76   Status InitFromFile(const string &path);
77   Status InitFromFile(const string &path, StringPiece magic);
78 
79   // The same as above InitFromArray() but only parses data set for user pos
80   // manager.  For mozc runtime modules, use InitFromArray() because this method
81   // is only for build tools, e.g., rewriter/dictionary_generator.cc (some build
82   // tools depend on user pos data to create outputs, so we need to handle
83   // partial data set).
84   Status InitUserPosManagerDataFromArray(StringPiece array, StringPiece magic);
85   Status InitUserPosManagerDataFromFile(const string &path, StringPiece magic);
86 
87   // Implementation of DataManagerInterface.
88   const uint16 *GetPOSMatcherData() const override;
89   void GetUserPOSData(StringPiece *token_array_data,
90                       StringPiece *string_array_data) const override;
91   void GetConnectorData(const char **data, size_t *size) const override;
92   void GetSystemDictionaryData(const char **data, int *size) const override;
93   void GetCollocationData(const char **array, size_t *size) const override;
94   void GetCollocationSuppressionData(const char **array,
95                                      size_t *size) const override;
96   void GetSuggestionFilterData(const char **data, size_t *size) const override;
97   const uint8 *GetPosGroupData() const override;
98   void GetSegmenterData(size_t *l_num_elements, size_t *r_num_elements,
99                         const uint16 **l_table, const uint16 **r_table,
100                         size_t *bitarray_num_bytes, const char **bitarray_data,
101                         const uint16 **boundary_data) const override;
102   void GetCounterSuffixSortedArray(const char **array,
103                                    size_t *size) const override;
104   void GetSuffixDictionaryData(StringPiece *key_array_data,
105                                StringPiece *value_array_data,
106                                const uint32 **token_array) const override;
107   void GetReadingCorrectionData(
108       StringPiece *value_array_data, StringPiece *error_array_data,
109       StringPiece *correction_array_data) const override;
110   void GetSymbolRewriterData(StringPiece *token_array_data,
111                              StringPiece *string_array_data) const override;
112   void GetEmoticonRewriterData(StringPiece *token_array_data,
113                                StringPiece *string_array_data) const override;
114   void GetEmojiRewriterData(StringPiece *token_array_data,
115                             StringPiece *string_array_data) const override;
116   void GetSingleKanjiRewriterData(
117       StringPiece *token_array_data,
118       StringPiece *string_array_data,
119       StringPiece *variant_type_array_data,
120       StringPiece *variant_token_array_data,
121       StringPiece *variant_string_array_data,
122       StringPiece *noun_prefix_token_array_data,
123       StringPiece *noun_prefix_string_array_data) const override;
124   void GetZeroQueryData(
125       StringPiece *zero_query_token_array_data,
126       StringPiece *zero_query_string_array_data,
127       StringPiece *zero_query_number_token_array_data,
128       StringPiece *zero_query_number_string_array_data) const override;
129 
130 #ifndef NO_USAGE_REWRITER
131   void GetUsageRewriterData(
132       StringPiece *base_conjugation_suffix_data,
133       StringPiece *conjugation_suffix_data,
134       StringPiece *conjugation_index_data,
135       StringPiece *usage_items_data,
136       StringPiece *string_array_data) const override;
137 #endif  // NO_USAGE_REWRITER
138 
139   StringPiece GetTypingModel(const string &name) const override;
140   StringPiece GetDataVersion() const override;
141 
142  private:
143   Status InitFromReader(const DataSetReader &reader);
144 
145   Mmap mmap_;
146   StringPiece pos_matcher_data_;
147   StringPiece user_pos_token_array_data_;
148   StringPiece user_pos_string_array_data_;
149   StringPiece connection_data_;
150   StringPiece dictionary_data_;
151   StringPiece suggestion_filter_data_;
152   StringPiece collocation_data_;
153   StringPiece collocation_suppression_data_;
154   StringPiece pos_group_data_;
155   StringPiece boundary_data_;
156   size_t segmenter_compressed_lsize_;
157   size_t segmenter_compressed_rsize_;
158   StringPiece segmenter_ltable_;
159   StringPiece segmenter_rtable_;
160   StringPiece segmenter_bitarray_;
161   StringPiece counter_suffix_data_;
162   StringPiece suffix_key_array_data_;
163   StringPiece suffix_value_array_data_;
164   StringPiece suffix_token_array_data_;
165   StringPiece reading_correction_value_array_data_;
166   StringPiece reading_correction_error_array_data_;
167   StringPiece reading_correction_correction_array_data_;
168   StringPiece symbol_token_array_data_;
169   StringPiece symbol_string_array_data_;
170   StringPiece emoticon_token_array_data_;
171   StringPiece emoticon_string_array_data_;
172   StringPiece emoji_token_array_data_;
173   StringPiece emoji_string_array_data_;
174   StringPiece single_kanji_token_array_data_;
175   StringPiece single_kanji_string_array_data_;
176   StringPiece single_kanji_variant_type_data_;
177   StringPiece single_kanji_variant_token_array_data_;
178   StringPiece single_kanji_variant_string_array_data_;
179   StringPiece single_kanji_noun_prefix_token_array_data_;
180   StringPiece single_kanji_noun_prefix_string_array_data_;
181   StringPiece zero_query_token_array_data_;
182   StringPiece zero_query_string_array_data_;
183   StringPiece zero_query_number_token_array_data_;
184   StringPiece zero_query_number_string_array_data_;
185   StringPiece usage_base_conjugation_suffix_data_;
186   StringPiece usage_conjugation_suffix_data_;
187   StringPiece usage_conjugation_index_data_;
188   StringPiece usage_items_data_;
189   StringPiece usage_string_array_data_;
190   std::vector<std::pair<string, StringPiece>> typing_model_data_;
191   StringPiece data_version_;
192 
193   DISALLOW_COPY_AND_ASSIGN(DataManager);
194 };
195 
// Stream insertion operator for DataManager::Status.  Logging, e.g.,
// CHECK_EQ(), requires arguments to be printable, so this overload allows
// Status values to be passed to such macros directly.  Only the declaration
// lives in this header; the printed format is decided by the definition.
std::ostream &operator<<(std::ostream &os, DataManager::Status status);
199 
200 }  // namespace mozc
201 
202 #endif  // MOZC_DATA_MANAGER_DATA_MANAGER_H_
203