1 // Copyright 2010-2018, Google Inc.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
6 // met:
7 //
8 //     * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 //     * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following disclaimer
12 // in the documentation and/or other materials provided with the
13 // distribution.
14 //     * Neither the name of Google Inc. nor the names of its
15 // contributors may be used to endorse or promote products derived from
16 // this software without specific prior written permission.
17 //
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 
30 #include "rewriter/user_boundary_history_rewriter.h"
31 
32 #include <algorithm>
33 #include <deque>
34 #include <string>
35 #include <utility>
36 #include <vector>
37 
38 #include "base/config_file_stream.h"
39 #include "base/file_util.h"
40 #include "base/logging.h"
41 #include "base/util.h"
42 #include "config/config_handler.h"
43 #include "converter/converter_interface.h"
44 #include "converter/segments.h"
45 #include "protocol/config.pb.h"
46 #include "request/conversion_request.h"
47 #include "rewriter/rewriter_interface.h"
48 #include "storage/lru_storage.h"
49 #include "usage_stats/usage_stats.h"
50 
51 namespace mozc {
52 
53 using storage::LRUStorage;
54 
55 namespace {
56 const int kValueSize  = 4;
57 const uint32 kLRUSize = 5000;
58 const uint32 kSeedValue = 0x761fea81;
59 
60 const char kFileName[] = "user://boundary.db";
61 
62 enum { INSERT, RESIZE };
63 
64 class LengthArray {
65  public:
ToUCharArray(uint8 * array) const66   void ToUCharArray(uint8 *array) const {
67     array[0] = length0_;
68     array[1] = length1_;
69     array[2] = length2_;
70     array[3] = length3_;
71     array[4] = length4_;
72     array[5] = length5_;
73     array[6] = length6_;
74     array[7] = length7_;
75   }
76 
CopyFromUCharArray(const uint8 * array)77   void CopyFromUCharArray(const uint8 *array) {
78     length0_ = array[0];
79     length1_ = array[1];
80     length2_ = array[2];
81     length3_ = array[3];
82     length4_ = array[4];
83     length5_ = array[5];
84     length6_ = array[6];
85     length7_ = array[7];
86   }
87 
Equal(const LengthArray & r) const88   bool Equal(const LengthArray &r) const {
89     return (length0_ == r.length0_ &&
90             length1_ == r.length1_ &&
91             length2_ == r.length2_ &&
92             length3_ == r.length3_ &&
93             length4_ == r.length4_ &&
94             length5_ == r.length5_ &&
95             length6_ == r.length6_ &&
96             length7_ == r.length7_);
97   }
98 
99  private:
100   uint8 length0_ : 4;
101   uint8 length1_ : 4;
102   uint8 length2_ : 4;
103   uint8 length3_ : 4;
104   uint8 length4_ : 4;
105   uint8 length5_ : 4;
106   uint8 length6_ : 4;
107   uint8 length7_ : 4;
108 };
109 }  // namespace
110 
UserBoundaryHistoryRewriter(const ConverterInterface * parent_converter)111 UserBoundaryHistoryRewriter::UserBoundaryHistoryRewriter(
112     const ConverterInterface *parent_converter)
113     : parent_converter_(parent_converter),
114       storage_(new LRUStorage) {
115   DCHECK(parent_converter_);
116   Reload();
117 }
118 
~UserBoundaryHistoryRewriter()119 UserBoundaryHistoryRewriter::~UserBoundaryHistoryRewriter() {}
120 
Finish(const ConversionRequest & request,Segments * segments)121 void UserBoundaryHistoryRewriter::Finish(const ConversionRequest &request,
122                                          Segments *segments) {
123   if (segments->request_type() != Segments::CONVERSION) {
124     return;
125   }
126 
127   if (request.config().incognito_mode()) {
128     VLOG(2) << "incognito mode";
129     return;
130   }
131 
132   if (request.config().history_learning_level() !=
133       config::Config::DEFAULT_HISTORY) {
134     VLOG(2) << "history_learning_level is not DEFAULT_HISTORY";
135     return;
136   }
137 
138   if (!segments->user_history_enabled()) {
139     VLOG(2) << "!user_history_enabled";
140     return;
141   }
142 
143   if (storage_.get() == NULL) {
144     VLOG(2) << "storage is NULL";
145     return;
146   }
147 
148   if (segments->resized()) {
149     ResizeOrInsert(segments, request, INSERT);
150 #ifdef OS_ANDROID
151     // TODO(hidehiko): UsageStats requires some functionalities, e.g. network,
152     // which are not needed for mozc's main features.
153     // So, to focus on the main features' developping, we just skip it for now.
154     // Note: we can #ifdef inside SetInteger, but to build it we need to build
155     // other methods in usage_stats as well. So we'll exclude the method here
156     // for now.
157 #else
158     // update usage stats here
159     usage_stats::UsageStats::SetInteger(
160         "UserBoundaryHistoryEntrySize",
161         static_cast<int>(storage_->used_size()));
162 #endif
163   }
164 }
165 
Rewrite(const ConversionRequest & request,Segments * segments) const166 bool UserBoundaryHistoryRewriter::Rewrite(
167     const ConversionRequest &request, Segments *segments) const {
168   if (request.config().incognito_mode()) {
169     VLOG(2) << "incognito mode";
170     return false;
171   }
172 
173   if (request.config().history_learning_level() == config::Config::NO_HISTORY) {
174     VLOG(2) << "history_learning_level is NO_HISTORY";
175     return false;
176   }
177 
178   if (!segments->user_history_enabled()) {
179     VLOG(2) << "!user_history_enabled";
180     return false;
181   }
182 
183   if (storage_.get() == NULL) {
184     VLOG(2) << "storage is NULL";
185     return false;
186   }
187 
188   if (request.skip_slow_rewriters()) {
189     return false;
190   }
191 
192   if (!segments->resized()) {
193     return ResizeOrInsert(segments, request, RESIZE);
194   }
195 
196   return false;
197 }
198 
Reload()199 bool UserBoundaryHistoryRewriter::Reload() {
200   const string filename = ConfigFileStream::GetFileName(kFileName);
201   if (!storage_->OpenOrCreate(filename.c_str(),
202                               kValueSize, kLRUSize, kSeedValue)) {
203     LOG(WARNING) << "cannot initialize UserBoundaryHistoryRewriter";
204     storage_.reset();
205     return false;
206   }
207 
208   const char kFileSuffix[] = ".merge_pending";
209   const string merge_pending_file = filename + kFileSuffix;
210 
211   // merge pending file does not always exist.
212   if (FileUtil::FileExists(merge_pending_file)) {
213     storage_->Merge(merge_pending_file.c_str());
214     FileUtil::Unlink(merge_pending_file);
215   }
216 
217   return true;
218 }
219 
220 // TODO(taku): split Reize/Insert into different functions
ResizeOrInsert(Segments * segments,const ConversionRequest & request,int type) const221 bool UserBoundaryHistoryRewriter::ResizeOrInsert(
222     Segments *segments, const ConversionRequest &request, int type) const {
223   bool result = false;
224   uint8 length_array[8];
225 
226   const size_t history_segments_size = segments->history_segments_size();
227 
228   // resize segments in [history_segments_size .. target_segments_size - 1]
229   size_t target_segments_size = segments->segments_size();
230 
231   // when INSERTING new history,
232   // Get the prefix of segments having FIXED_VALUE state.
233   if (type == INSERT) {
234     target_segments_size = history_segments_size;
235     for (size_t i = history_segments_size; i < segments->segments_size(); ++i) {
236       const Segment &segment = segments->segment(i);
237       if (segment.segment_type() == Segment::FIXED_VALUE) {
238         ++target_segments_size;
239       }
240     }
241   }
242 
243   // No effective segments found
244   if (target_segments_size <= history_segments_size) {
245     return false;
246   }
247 
248   std::deque<std::pair<string, size_t>> keys(target_segments_size -
249                                    history_segments_size);
250   for (size_t i = history_segments_size; i < target_segments_size; ++i) {
251     const Segment &segment = segments->segment(i);
252     keys[i - history_segments_size].first = segment.key();
253     const size_t length = Util::CharsLen(segment.key());
254     if (length > 255) {   // too long segment
255       VLOG(2) << "too long segment";
256       return false;
257     }
258     keys[i - history_segments_size].second = length;
259   }
260 
261   for (size_t i = history_segments_size; i < target_segments_size; ++i) {
262     const size_t kMaxKeysSize = 5;
263     const size_t keys_size = std::min(kMaxKeysSize, keys.size());
264     string key;
265     memset(length_array, 0, sizeof(length_array));
266     for (size_t k = 0; k < keys_size; ++k) {
267       key += keys[k].first;
268       length_array[k] = static_cast<uint8>(keys[k].second);
269     }
270     for (int j = static_cast<int>(keys_size) - 1; j >= 0; --j) {
271       if (type == RESIZE) {
272         const LengthArray *value =
273             reinterpret_cast<const LengthArray *>(storage_->Lookup(key));
274         if (value != NULL) {
275           LengthArray orig_value;
276           orig_value.CopyFromUCharArray(length_array);
277           if (!value->Equal(orig_value)) {
278             value->ToUCharArray(length_array);
279             const int old_segments_size =
280                 static_cast<int>(target_segments_size);
281             VLOG(2) << "ResizeSegment key: " << key << " "
282                     << i - history_segments_size << " " << j + 1
283                     << " " << static_cast<int>(length_array[0])
284                     << " " << static_cast<int>(length_array[1])
285                     << " " << static_cast<int>(length_array[2])
286                     << " " << static_cast<int>(length_array[3])
287                     << " " << static_cast<int>(length_array[4])
288                     << " " << static_cast<int>(length_array[5])
289                     << " " << static_cast<int>(length_array[6])
290                     << " " << static_cast<int>(length_array[7]);
291             parent_converter_->ResizeSegment(segments,
292                                              request,
293                                              i - history_segments_size,
294                                              j + 1,
295                                              length_array, 8);
296             i += (j + target_segments_size - old_segments_size);
297             result = true;
298             break;
299           }
300         }
301       } else if (type == INSERT) {
302         VLOG(2) << "InserteSegment key: " << key << " "
303                 << i - history_segments_size << " " << j + 1
304                 << " " << static_cast<int>(length_array[0])
305                 << " " << static_cast<int>(length_array[1])
306                 << " " << static_cast<int>(length_array[2])
307                 << " " << static_cast<int>(length_array[3])
308                 << " " << static_cast<int>(length_array[4])
309                 << " " << static_cast<int>(length_array[5])
310                 << " " << static_cast<int>(length_array[6])
311                 << " " << static_cast<int>(length_array[7]);
312         LengthArray inserted_value;
313         inserted_value.CopyFromUCharArray(length_array);
314         storage_->Insert(key, reinterpret_cast<const char *>(&inserted_value));
315       }
316 
317       length_array[j] = 0;
318       key.erase(key.size() - keys[j].first.size());
319     }
320 
321     keys.pop_front();  // delete first item
322   }
323 
324   return result;
325 }
326 
Clear()327 void UserBoundaryHistoryRewriter::Clear() {
328   if (storage_.get() != NULL) {
329     VLOG(1) << "Clearing user segment data";
330     storage_->Clear();
331   }
332 }
333 
334 }  // namespace mozc
335