1 // Copyright 2010-2018, Google Inc.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
6 // met:
7 //
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following disclaimer
12 // in the documentation and/or other materials provided with the
13 // distribution.
14 // * Neither the name of Google Inc. nor the names of its
15 // contributors may be used to endorse or promote products derived from
16 // this software without specific prior written permission.
17 //
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
30 #include "rewriter/user_boundary_history_rewriter.h"
31
32 #include <algorithm>
33 #include <deque>
34 #include <string>
35 #include <utility>
36 #include <vector>
37
38 #include "base/config_file_stream.h"
39 #include "base/file_util.h"
40 #include "base/logging.h"
41 #include "base/util.h"
42 #include "config/config_handler.h"
43 #include "converter/converter_interface.h"
44 #include "converter/segments.h"
45 #include "protocol/config.pb.h"
46 #include "request/conversion_request.h"
47 #include "rewriter/rewriter_interface.h"
48 #include "storage/lru_storage.h"
49 #include "usage_stats/usage_stats.h"
50
51 namespace mozc {
52
53 using storage::LRUStorage;
54
55 namespace {
56 const int kValueSize = 4;
57 const uint32 kLRUSize = 5000;
58 const uint32 kSeedValue = 0x761fea81;
59
60 const char kFileName[] = "user://boundary.db";
61
62 enum { INSERT, RESIZE };
63
64 class LengthArray {
65 public:
ToUCharArray(uint8 * array) const66 void ToUCharArray(uint8 *array) const {
67 array[0] = length0_;
68 array[1] = length1_;
69 array[2] = length2_;
70 array[3] = length3_;
71 array[4] = length4_;
72 array[5] = length5_;
73 array[6] = length6_;
74 array[7] = length7_;
75 }
76
CopyFromUCharArray(const uint8 * array)77 void CopyFromUCharArray(const uint8 *array) {
78 length0_ = array[0];
79 length1_ = array[1];
80 length2_ = array[2];
81 length3_ = array[3];
82 length4_ = array[4];
83 length5_ = array[5];
84 length6_ = array[6];
85 length7_ = array[7];
86 }
87
Equal(const LengthArray & r) const88 bool Equal(const LengthArray &r) const {
89 return (length0_ == r.length0_ &&
90 length1_ == r.length1_ &&
91 length2_ == r.length2_ &&
92 length3_ == r.length3_ &&
93 length4_ == r.length4_ &&
94 length5_ == r.length5_ &&
95 length6_ == r.length6_ &&
96 length7_ == r.length7_);
97 }
98
99 private:
100 uint8 length0_ : 4;
101 uint8 length1_ : 4;
102 uint8 length2_ : 4;
103 uint8 length3_ : 4;
104 uint8 length4_ : 4;
105 uint8 length5_ : 4;
106 uint8 length6_ : 4;
107 uint8 length7_ : 4;
108 };
109 } // namespace
110
UserBoundaryHistoryRewriter(const ConverterInterface * parent_converter)111 UserBoundaryHistoryRewriter::UserBoundaryHistoryRewriter(
112 const ConverterInterface *parent_converter)
113 : parent_converter_(parent_converter),
114 storage_(new LRUStorage) {
115 DCHECK(parent_converter_);
116 Reload();
117 }
118
~UserBoundaryHistoryRewriter()119 UserBoundaryHistoryRewriter::~UserBoundaryHistoryRewriter() {}
120
Finish(const ConversionRequest & request,Segments * segments)121 void UserBoundaryHistoryRewriter::Finish(const ConversionRequest &request,
122 Segments *segments) {
123 if (segments->request_type() != Segments::CONVERSION) {
124 return;
125 }
126
127 if (request.config().incognito_mode()) {
128 VLOG(2) << "incognito mode";
129 return;
130 }
131
132 if (request.config().history_learning_level() !=
133 config::Config::DEFAULT_HISTORY) {
134 VLOG(2) << "history_learning_level is not DEFAULT_HISTORY";
135 return;
136 }
137
138 if (!segments->user_history_enabled()) {
139 VLOG(2) << "!user_history_enabled";
140 return;
141 }
142
143 if (storage_.get() == NULL) {
144 VLOG(2) << "storage is NULL";
145 return;
146 }
147
148 if (segments->resized()) {
149 ResizeOrInsert(segments, request, INSERT);
150 #ifdef OS_ANDROID
151 // TODO(hidehiko): UsageStats requires some functionalities, e.g. network,
152 // which are not needed for mozc's main features.
153 // So, to focus on the main features' developping, we just skip it for now.
154 // Note: we can #ifdef inside SetInteger, but to build it we need to build
155 // other methods in usage_stats as well. So we'll exclude the method here
156 // for now.
157 #else
158 // update usage stats here
159 usage_stats::UsageStats::SetInteger(
160 "UserBoundaryHistoryEntrySize",
161 static_cast<int>(storage_->used_size()));
162 #endif
163 }
164 }
165
Rewrite(const ConversionRequest & request,Segments * segments) const166 bool UserBoundaryHistoryRewriter::Rewrite(
167 const ConversionRequest &request, Segments *segments) const {
168 if (request.config().incognito_mode()) {
169 VLOG(2) << "incognito mode";
170 return false;
171 }
172
173 if (request.config().history_learning_level() == config::Config::NO_HISTORY) {
174 VLOG(2) << "history_learning_level is NO_HISTORY";
175 return false;
176 }
177
178 if (!segments->user_history_enabled()) {
179 VLOG(2) << "!user_history_enabled";
180 return false;
181 }
182
183 if (storage_.get() == NULL) {
184 VLOG(2) << "storage is NULL";
185 return false;
186 }
187
188 if (request.skip_slow_rewriters()) {
189 return false;
190 }
191
192 if (!segments->resized()) {
193 return ResizeOrInsert(segments, request, RESIZE);
194 }
195
196 return false;
197 }
198
Reload()199 bool UserBoundaryHistoryRewriter::Reload() {
200 const string filename = ConfigFileStream::GetFileName(kFileName);
201 if (!storage_->OpenOrCreate(filename.c_str(),
202 kValueSize, kLRUSize, kSeedValue)) {
203 LOG(WARNING) << "cannot initialize UserBoundaryHistoryRewriter";
204 storage_.reset();
205 return false;
206 }
207
208 const char kFileSuffix[] = ".merge_pending";
209 const string merge_pending_file = filename + kFileSuffix;
210
211 // merge pending file does not always exist.
212 if (FileUtil::FileExists(merge_pending_file)) {
213 storage_->Merge(merge_pending_file.c_str());
214 FileUtil::Unlink(merge_pending_file);
215 }
216
217 return true;
218 }
219
220 // TODO(taku): split Reize/Insert into different functions
ResizeOrInsert(Segments * segments,const ConversionRequest & request,int type) const221 bool UserBoundaryHistoryRewriter::ResizeOrInsert(
222 Segments *segments, const ConversionRequest &request, int type) const {
223 bool result = false;
224 uint8 length_array[8];
225
226 const size_t history_segments_size = segments->history_segments_size();
227
228 // resize segments in [history_segments_size .. target_segments_size - 1]
229 size_t target_segments_size = segments->segments_size();
230
231 // when INSERTING new history,
232 // Get the prefix of segments having FIXED_VALUE state.
233 if (type == INSERT) {
234 target_segments_size = history_segments_size;
235 for (size_t i = history_segments_size; i < segments->segments_size(); ++i) {
236 const Segment &segment = segments->segment(i);
237 if (segment.segment_type() == Segment::FIXED_VALUE) {
238 ++target_segments_size;
239 }
240 }
241 }
242
243 // No effective segments found
244 if (target_segments_size <= history_segments_size) {
245 return false;
246 }
247
248 std::deque<std::pair<string, size_t>> keys(target_segments_size -
249 history_segments_size);
250 for (size_t i = history_segments_size; i < target_segments_size; ++i) {
251 const Segment &segment = segments->segment(i);
252 keys[i - history_segments_size].first = segment.key();
253 const size_t length = Util::CharsLen(segment.key());
254 if (length > 255) { // too long segment
255 VLOG(2) << "too long segment";
256 return false;
257 }
258 keys[i - history_segments_size].second = length;
259 }
260
261 for (size_t i = history_segments_size; i < target_segments_size; ++i) {
262 const size_t kMaxKeysSize = 5;
263 const size_t keys_size = std::min(kMaxKeysSize, keys.size());
264 string key;
265 memset(length_array, 0, sizeof(length_array));
266 for (size_t k = 0; k < keys_size; ++k) {
267 key += keys[k].first;
268 length_array[k] = static_cast<uint8>(keys[k].second);
269 }
270 for (int j = static_cast<int>(keys_size) - 1; j >= 0; --j) {
271 if (type == RESIZE) {
272 const LengthArray *value =
273 reinterpret_cast<const LengthArray *>(storage_->Lookup(key));
274 if (value != NULL) {
275 LengthArray orig_value;
276 orig_value.CopyFromUCharArray(length_array);
277 if (!value->Equal(orig_value)) {
278 value->ToUCharArray(length_array);
279 const int old_segments_size =
280 static_cast<int>(target_segments_size);
281 VLOG(2) << "ResizeSegment key: " << key << " "
282 << i - history_segments_size << " " << j + 1
283 << " " << static_cast<int>(length_array[0])
284 << " " << static_cast<int>(length_array[1])
285 << " " << static_cast<int>(length_array[2])
286 << " " << static_cast<int>(length_array[3])
287 << " " << static_cast<int>(length_array[4])
288 << " " << static_cast<int>(length_array[5])
289 << " " << static_cast<int>(length_array[6])
290 << " " << static_cast<int>(length_array[7]);
291 parent_converter_->ResizeSegment(segments,
292 request,
293 i - history_segments_size,
294 j + 1,
295 length_array, 8);
296 i += (j + target_segments_size - old_segments_size);
297 result = true;
298 break;
299 }
300 }
301 } else if (type == INSERT) {
302 VLOG(2) << "InserteSegment key: " << key << " "
303 << i - history_segments_size << " " << j + 1
304 << " " << static_cast<int>(length_array[0])
305 << " " << static_cast<int>(length_array[1])
306 << " " << static_cast<int>(length_array[2])
307 << " " << static_cast<int>(length_array[3])
308 << " " << static_cast<int>(length_array[4])
309 << " " << static_cast<int>(length_array[5])
310 << " " << static_cast<int>(length_array[6])
311 << " " << static_cast<int>(length_array[7]);
312 LengthArray inserted_value;
313 inserted_value.CopyFromUCharArray(length_array);
314 storage_->Insert(key, reinterpret_cast<const char *>(&inserted_value));
315 }
316
317 length_array[j] = 0;
318 key.erase(key.size() - keys[j].first.size());
319 }
320
321 keys.pop_front(); // delete first item
322 }
323
324 return result;
325 }
326
Clear()327 void UserBoundaryHistoryRewriter::Clear() {
328 if (storage_.get() != NULL) {
329 VLOG(1) << "Clearing user segment data";
330 storage_->Clear();
331 }
332 }
333
334 } // namespace mozc
335