1 // Copyright 2010-2018, Google Inc.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
6 // met:
7 //
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following disclaimer
12 // in the documentation and/or other materials provided with the
13 // distribution.
14 // * Neither the name of Google Inc. nor the names of its
15 // contributors may be used to endorse or promote products derived from
16 // this software without specific prior written permission.
17 //
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
30 #include "rewriter/single_kanji_rewriter.h"
31
32 #include <algorithm>
33 #include <iterator>
34 #include <memory>
35 #include <set>
36 #include <string>
37 #include <vector>
38
39 #include "base/logging.h"
40 #include "base/util.h"
41 #include "config/config_handler.h"
42 #include "converter/segments.h"
43 #include "dictionary/pos_matcher.h"
44 #include "protocol/commands.pb.h"
45 #include "protocol/config.pb.h"
46 #include "request/conversion_request.h"
47 #include "rewriter/rewriter_interface.h"
48
49 using mozc::dictionary::POSMatcher;
50
51 namespace mozc {
52 namespace {
53
54 // A random access iterator over uint32 array that increments a pointer by N:
55 // iter -> array[0]
56 // iter + 1 -> array[N]
57 // iter + 2 -> array[2 * N]
58 // ...
59 template <size_t N>
60 class Uint32ArrayIterator
61 : public std::iterator<std::random_access_iterator_tag, uint32> {
62 public:
Uint32ArrayIterator(const uint32 * ptr)63 explicit Uint32ArrayIterator(const uint32 *ptr) : ptr_(ptr) {}
64
operator *() const65 uint32 operator*() const { return *ptr_; }
66
operator [](size_t i) const67 uint32 operator[](size_t i) const {
68 DCHECK_LT(i, N);
69 return ptr_[i];
70 }
71
swap(Uint32ArrayIterator & x)72 void swap(Uint32ArrayIterator &x) {
73 using std::swap;
74 swap(ptr_, x.ptr_);
75 }
76
swap(Uint32ArrayIterator & x,Uint32ArrayIterator & y)77 friend void swap(Uint32ArrayIterator &x, Uint32ArrayIterator &y) {
78 x.swap(y);
79 }
80
operator ++()81 Uint32ArrayIterator &operator++() {
82 ptr_ += N;
83 return *this;
84 }
85
operator ++(int)86 Uint32ArrayIterator operator++(int) {
87 const uint32 *tmp = ptr_;
88 ptr_ += N;
89 return Uint32ArrayIterator(tmp);
90 }
91
operator --()92 Uint32ArrayIterator &operator--() {
93 ptr_ -= N;
94 return *this;
95 }
96
operator --(int)97 Uint32ArrayIterator operator--(int) {
98 const uint32 *tmp = ptr_;
99 ptr_ -= N;
100 return Uint32ArrayIterator(tmp);
101 }
102
operator +=(ptrdiff_t n)103 Uint32ArrayIterator &operator+=(ptrdiff_t n) {
104 ptr_ += n * N;
105 return *this;
106 }
107
operator -=(ptrdiff_t n)108 Uint32ArrayIterator &operator-=(ptrdiff_t n) {
109 ptr_ -= n * N;
110 return *this;
111 }
112
operator +(Uint32ArrayIterator x,ptrdiff_t n)113 friend Uint32ArrayIterator operator+(Uint32ArrayIterator x, ptrdiff_t n) {
114 return Uint32ArrayIterator(x.ptr_ + n * N);
115 }
116
operator +(ptrdiff_t n,Uint32ArrayIterator x)117 friend Uint32ArrayIterator operator+(ptrdiff_t n, Uint32ArrayIterator x) {
118 return Uint32ArrayIterator(x.ptr_ + n * N);
119 }
120
operator -(Uint32ArrayIterator x,ptrdiff_t n)121 friend Uint32ArrayIterator operator-(Uint32ArrayIterator x, ptrdiff_t n) {
122 return Uint32ArrayIterator(x.ptr_ - n * N);
123 }
124
operator -(Uint32ArrayIterator x,Uint32ArrayIterator y)125 friend ptrdiff_t operator-(Uint32ArrayIterator x, Uint32ArrayIterator y) {
126 return (x.ptr_ - y.ptr_) / N;
127 }
128
operator ==(Uint32ArrayIterator x,Uint32ArrayIterator y)129 friend bool operator==(Uint32ArrayIterator x, Uint32ArrayIterator y) {
130 return x.ptr_ == y.ptr_;
131 }
132
operator !=(Uint32ArrayIterator x,Uint32ArrayIterator y)133 friend bool operator!=(Uint32ArrayIterator x, Uint32ArrayIterator y) {
134 return x.ptr_ != y.ptr_;
135 }
136
operator <(Uint32ArrayIterator x,Uint32ArrayIterator y)137 friend bool operator<(Uint32ArrayIterator x, Uint32ArrayIterator y) {
138 return x.ptr_ < y.ptr_;
139 }
140
operator <=(Uint32ArrayIterator x,Uint32ArrayIterator y)141 friend bool operator<=(Uint32ArrayIterator x, Uint32ArrayIterator y) {
142 return x.ptr_ <= y.ptr_;
143 }
144
operator >(Uint32ArrayIterator x,Uint32ArrayIterator y)145 friend bool operator>(Uint32ArrayIterator x, Uint32ArrayIterator y) {
146 return x.ptr_ > y.ptr_;
147 }
148
operator >=(Uint32ArrayIterator x,Uint32ArrayIterator y)149 friend bool operator>=(Uint32ArrayIterator x, Uint32ArrayIterator y) {
150 return x.ptr_ >= y.ptr_;
151 }
152
153 private:
154 const uint32 *ptr_;
155 };
156
157 // Looks up single kanji list from key (reading). Returns false if not found.
158 // The underlying token array, |single_kanji_token_array|, has the following
159 // format:
160 //
161 // +------------------+
162 // | index of key 0 |
163 // +------------------+
164 // | index of value 0 |
165 // +------------------+
166 // | index of key 1 |
167 // +------------------+
168 // | index of value 1 |
169 // +------------------+
170 // | ... |
171 //
172 // Here, each element is of uint32 type. Each of actual string values are
173 // stored in |single_kanji_string_array| at its index.
LookupKanjiList(StringPiece single_kanji_token_array,const SerializedStringArray & single_kanji_string_array,const string & key,std::vector<string> * kanji_list)174 bool LookupKanjiList(StringPiece single_kanji_token_array,
175 const SerializedStringArray &single_kanji_string_array,
176 const string &key, std::vector<string> *kanji_list) {
177 DCHECK(kanji_list);
178 const uint32* token_array =
179 reinterpret_cast<const uint32*>(single_kanji_token_array.data());
180 const size_t token_array_size =
181 single_kanji_token_array.size() / sizeof(uint32);
182
183 const Uint32ArrayIterator<2> end(token_array + token_array_size);
184 const auto iter = std::lower_bound(
185 Uint32ArrayIterator<2>(token_array), end, key,
186 [&single_kanji_string_array](uint32 index, const string &target_key) {
187 return single_kanji_string_array[index] < target_key;
188 });
189 if (iter == end || single_kanji_string_array[iter[0]] != key) {
190 return false;
191 }
192 const StringPiece values = single_kanji_string_array[iter[1]];
193 Util::SplitStringToUtf8Chars(values, kanji_list);
194 return true;
195 }
196
197 // Generates kanji variant description from key. Does nothing if not found.
198 // The underlying token array, |variant_token_array|, has the following
199 // format:
200 //
201 // +-------------------------+
202 // | index of target 0 |
203 // +-------------------------+
204 // | index of original 0 |
205 // +-------------------------+
206 // | index of variant type 0 |
207 // +-------------------------+
208 // | index of target 1 |
209 // +-------------------------+
210 // | index of original 1 |
211 // +-------------------------+
212 // | index of variant type 1 |
213 // +-------------------------+
214 // | ... |
215 //
216 // Here, each element is of uint32 type. Actual strings of target and original
217 // are stored in |variant_string_array|, while strings of variant type are
218 // stored in |variant_type|.
GenerateDescription(StringPiece variant_token_array,const SerializedStringArray & variant_string_array,const SerializedStringArray & variant_type,const string & key,string * desc)219 void GenerateDescription(StringPiece variant_token_array,
220 const SerializedStringArray &variant_string_array,
221 const SerializedStringArray &variant_type,
222 const string &key, string *desc) {
223 DCHECK(desc);
224 const uint32 *token_array =
225 reinterpret_cast<const uint32*>(variant_token_array.data());
226 const size_t token_array_size =
227 variant_token_array.size() / sizeof(uint32);
228
229 const Uint32ArrayIterator<3> end(token_array + token_array_size);
230 const auto iter = std::lower_bound(
231 Uint32ArrayIterator<3>(token_array), end, key,
232 [&variant_string_array](uint32 index, const string &target_key) {
233 return variant_string_array[index] < target_key;
234 });
235 if (iter == end || variant_string_array[iter[0]] != key) {
236 return;
237 }
238 const StringPiece original = variant_string_array[iter[1]];
239 const uint32 type_id = iter[2];
240 DCHECK_LT(type_id, variant_type.size());
241 // Format like "XXXのYYY"
242 desc->assign(original.data(), original.size());
243 desc->append("の");
244 desc->append(variant_type[type_id].data(), variant_type[type_id].size());
245 }
246
247 // Add single kanji variants description to existing candidates,
248 // because if we have candidates with same value, the lower ranked candidate
249 // will be removed.
AddDescriptionForExsistingCandidates(StringPiece variant_token_array,const SerializedStringArray & variant_string_array,const SerializedStringArray & variant_type,Segment * segment)250 void AddDescriptionForExsistingCandidates(
251 StringPiece variant_token_array,
252 const SerializedStringArray &variant_string_array,
253 const SerializedStringArray &variant_type,
254 Segment *segment) {
255 DCHECK(segment);
256 for (size_t i = 0; i < segment->candidates_size(); ++i) {
257 Segment::Candidate *cand = segment->mutable_candidate(i);
258 if (!cand->description.empty()) {
259 continue;
260 }
261 GenerateDescription(variant_token_array, variant_string_array,
262 variant_type, cand->value, &cand->description);
263 }
264 }
265
FillCandidate(StringPiece variant_token_array,const SerializedStringArray & variant_string_array,const SerializedStringArray & variant_type,const string & key,const string & value,int cost,uint16 single_kanji_id,Segment::Candidate * cand)266 void FillCandidate(StringPiece variant_token_array,
267 const SerializedStringArray &variant_string_array,
268 const SerializedStringArray &variant_type,
269 const string &key, const string &value,
270 int cost, uint16 single_kanji_id,
271 Segment::Candidate *cand) {
272 cand->lid = single_kanji_id;
273 cand->rid = single_kanji_id;
274 cand->cost = cost;
275 cand->content_key = key;
276 cand->content_value = value;
277 cand->key = key;
278 cand->value = value;
279 cand->attributes |= Segment::Candidate::CONTEXT_SENSITIVE;
280 cand->attributes |= Segment::Candidate::NO_VARIANTS_EXPANSION;
281 GenerateDescription(variant_token_array, variant_string_array,
282 variant_type, value, &cand->description);
283 }
284
285 // Insert SingleKanji into segment.
InsertCandidate(StringPiece variant_token_array,const SerializedStringArray & variant_string_array,const SerializedStringArray & variant_type,bool is_single_segment,uint16 single_kanji_id,const std::vector<string> & kanji_list,Segment * segment)286 void InsertCandidate(StringPiece variant_token_array,
287 const SerializedStringArray &variant_string_array,
288 const SerializedStringArray &variant_type,
289 bool is_single_segment,
290 uint16 single_kanji_id,
291 const std::vector<string> &kanji_list,
292 Segment *segment) {
293 DCHECK(segment);
294 if (segment->candidates_size() == 0) {
295 LOG(WARNING) << "candidates_size is 0";
296 return;
297 }
298
299 const string &candidate_key = ((!segment->key().empty()) ?
300 segment->key() :
301 segment->candidate(0).key);
302
303 // Adding 8000 to the single kanji cost
304 // Note that this cost does not make no effect.
305 // Here we set the cost just in case.
306 const int kOffsetCost = 8000;
307
308 // Append single-kanji
309 for (size_t i = 0; i < kanji_list.size(); ++i) {
310 Segment::Candidate *c = segment->push_back_candidate();
311 FillCandidate(variant_token_array, variant_string_array,
312 variant_type, candidate_key, kanji_list[i],
313 kOffsetCost + i, single_kanji_id, c);
314 }
315 }
316
InsertNounPrefix(const POSMatcher & pos_matcher,Segment * segment,SerializedDictionary::iterator begin,SerializedDictionary::iterator end)317 void InsertNounPrefix(const POSMatcher &pos_matcher,
318 Segment *segment,
319 SerializedDictionary::iterator begin,
320 SerializedDictionary::iterator end) {
321 DCHECK(begin != end);
322
323 if (segment->candidates_size() == 0) {
324 LOG(WARNING) << "candidates_size is 0";
325 return;
326 }
327
328 if (segment->segment_type() == Segment::FIXED_VALUE) {
329 return;
330 }
331
332 const string &candidate_key = ((!segment->key().empty()) ?
333 segment->key() :
334 segment->candidate(0).key);
335 for (auto iter = begin; iter != end; ++iter) {
336 const int insert_pos = std::min(
337 static_cast<int>(segment->candidates_size()),
338 static_cast<int>(iter.cost() + (segment->candidate(0).attributes &
339 Segment::Candidate::CONTEXT_SENSITIVE)
340 ? 1
341 : 0));
342 Segment::Candidate *c = segment->insert_candidate(insert_pos);
343 c->lid = pos_matcher.GetNounPrefixId();
344 c->rid = pos_matcher.GetNounPrefixId();
345 c->cost = 5000;
346 c->content_value = string(iter.value());
347 c->key = candidate_key;
348 c->content_key = candidate_key;
349 c->value = string(iter.value());
350 c->attributes |= Segment::Candidate::CONTEXT_SENSITIVE;
351 c->attributes |= Segment::Candidate::NO_VARIANTS_EXPANSION;
352 }
353 }
354
355 } // namespace
356
SingleKanjiRewriter(const DataManagerInterface & data_manager)357 SingleKanjiRewriter::SingleKanjiRewriter(
358 const DataManagerInterface &data_manager)
359 : pos_matcher_(data_manager.GetPOSMatcherData()) {
360 StringPiece string_array_data;
361 StringPiece variant_type_array_data;
362 StringPiece variant_string_array_data;
363 StringPiece noun_prefix_token_array_data;
364 StringPiece noun_prefix_string_array_data;
365 data_manager.GetSingleKanjiRewriterData(
366 &single_kanji_token_array_,
367 &string_array_data,
368 &variant_type_array_data,
369 &variant_token_array_,
370 &variant_string_array_data,
371 &noun_prefix_token_array_data,
372 &noun_prefix_string_array_data);
373 // Single Kanji token array is an array of uint32. Its size must be multiple
374 // of 2; see the comment above LookupKanjiList.
375 DCHECK_EQ(0, single_kanji_token_array_.size() % (2 * sizeof(uint32)));
376 DCHECK(SerializedStringArray::VerifyData(string_array_data));
377 single_kanji_string_array_.Set(string_array_data);
378
379 DCHECK(SerializedStringArray::VerifyData(variant_type_array_data));
380 variant_type_array_.Set(variant_type_array_data);
381
382 // Variant token array is an array of uint32. Its size must be multiple
383 // of 3; see the comment above GenerateDescription.
384 DCHECK_EQ(0, variant_token_array_.size() % (3 * sizeof(uint32)));
385 DCHECK(SerializedStringArray::VerifyData(variant_string_array_data));
386 variant_string_array_.Set(variant_string_array_data);
387
388 DCHECK(SerializedDictionary::VerifyData(noun_prefix_token_array_data,
389 noun_prefix_string_array_data));
390 noun_prefix_dictionary_.reset(new SerializedDictionary(
391 noun_prefix_token_array_data,
392 noun_prefix_string_array_data));
393 }
394
~SingleKanjiRewriter()395 SingleKanjiRewriter::~SingleKanjiRewriter() {}
396
capability(const ConversionRequest & request) const397 int SingleKanjiRewriter::capability(const ConversionRequest &request) const {
398 if (request.request().mixed_conversion()) {
399 return RewriterInterface::ALL;
400 }
401 return RewriterInterface::CONVERSION;
402 }
403
Rewrite(const ConversionRequest & request,Segments * segments) const404 bool SingleKanjiRewriter::Rewrite(const ConversionRequest &request,
405 Segments *segments) const {
406 if (!request.config().use_single_kanji_conversion()) {
407 VLOG(2) << "no use_single_kanji_conversion";
408 return false;
409 }
410
411 bool modified = false;
412 const size_t segments_size = segments->conversion_segments_size();
413 const bool is_single_segment = (segments_size == 1);
414 for (size_t i = 0; i < segments_size; ++i) {
415 AddDescriptionForExsistingCandidates(
416 variant_token_array_,
417 variant_string_array_,
418 variant_type_array_,
419 segments->mutable_conversion_segment(i));
420
421 const string &key = segments->conversion_segment(i).key();
422 std::vector<string> kanji_list;
423 if (!LookupKanjiList(single_kanji_token_array_, single_kanji_string_array_,
424 key, &kanji_list)) {
425 continue;
426 }
427 InsertCandidate(variant_token_array_,
428 variant_string_array_,
429 variant_type_array_,
430 is_single_segment,
431 pos_matcher_.GetGeneralSymbolId(),
432 kanji_list,
433 segments->mutable_conversion_segment(i));
434
435 modified = true;
436 }
437
438 // Tweak for noun prefix.
439 // TODO(team): Ideally, this issue can be fixed via the language model
440 // and dictionary generation.
441 for (size_t i = 0; i < segments_size; ++i) {
442 if (segments->conversion_segment(i).candidates_size() == 0) {
443 continue;
444 }
445
446 if (i + 1 < segments_size) {
447 const Segment::Candidate &right_candidate =
448 segments->conversion_segment(i + 1).candidate(0);
449 // right segment must be a noun.
450 if (!pos_matcher_.IsContentNoun(right_candidate.lid)) {
451 continue;
452 }
453 } else if (segments_size != 1) { // also apply if segments_size == 1.
454 continue;
455 }
456
457 const string &key = segments->conversion_segment(i).key();
458 const auto range = noun_prefix_dictionary_->equal_range(key);
459 if (range.first == range.second) {
460 continue;
461 }
462 InsertNounPrefix(pos_matcher_,
463 segments->mutable_conversion_segment(i),
464 range.first, range.second);
465 // Ignore the next noun content word.
466 ++i;
467 modified = true;
468 }
469
470 return modified;
471 }
472 } // namespace mozc
473