1 // Copyright 2010-2018, Google Inc.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
6 // met:
7 //
8 //     * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 //     * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following disclaimer
12 // in the documentation and/or other materials provided with the
13 // distribution.
14 //     * Neither the name of Google Inc. nor the names of its
15 // contributors may be used to endorse or promote products derived from
16 // this software without specific prior written permission.
17 //
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 
30 #include "rewriter/single_kanji_rewriter.h"
31 
32 #include <algorithm>
33 #include <iterator>
34 #include <memory>
35 #include <set>
36 #include <string>
37 #include <vector>
38 
39 #include "base/logging.h"
40 #include "base/util.h"
41 #include "config/config_handler.h"
42 #include "converter/segments.h"
43 #include "dictionary/pos_matcher.h"
44 #include "protocol/commands.pb.h"
45 #include "protocol/config.pb.h"
46 #include "request/conversion_request.h"
47 #include "rewriter/rewriter_interface.h"
48 
49 using mozc::dictionary::POSMatcher;
50 
51 namespace mozc {
52 namespace {
53 
54 // A random access iterator over uint32 array that increments a pointer by N:
55 // iter     -> array[0]
56 // iter + 1 -> array[N]
57 // iter + 2 -> array[2 * N]
58 // ...
59 template <size_t N>
60 class Uint32ArrayIterator
61     : public std::iterator<std::random_access_iterator_tag, uint32> {
62  public:
Uint32ArrayIterator(const uint32 * ptr)63   explicit Uint32ArrayIterator(const uint32 *ptr) : ptr_(ptr) {}
64 
operator *() const65   uint32 operator*() const { return *ptr_; }
66 
operator [](size_t i) const67   uint32 operator[](size_t i) const {
68     DCHECK_LT(i, N);
69     return ptr_[i];
70   }
71 
swap(Uint32ArrayIterator & x)72   void swap(Uint32ArrayIterator &x) {
73     using std::swap;
74     swap(ptr_, x.ptr_);
75   }
76 
swap(Uint32ArrayIterator & x,Uint32ArrayIterator & y)77   friend void swap(Uint32ArrayIterator &x, Uint32ArrayIterator &y) {
78     x.swap(y);
79   }
80 
operator ++()81   Uint32ArrayIterator &operator++() {
82     ptr_ += N;
83     return *this;
84   }
85 
operator ++(int)86   Uint32ArrayIterator operator++(int) {
87     const uint32 *tmp = ptr_;
88     ptr_ += N;
89     return Uint32ArrayIterator(tmp);
90   }
91 
operator --()92   Uint32ArrayIterator &operator--() {
93     ptr_ -= N;
94     return *this;
95   }
96 
operator --(int)97   Uint32ArrayIterator operator--(int) {
98     const uint32 *tmp = ptr_;
99     ptr_ -= N;
100     return Uint32ArrayIterator(tmp);
101   }
102 
operator +=(ptrdiff_t n)103   Uint32ArrayIterator &operator+=(ptrdiff_t n) {
104     ptr_ += n * N;
105     return *this;
106   }
107 
operator -=(ptrdiff_t n)108   Uint32ArrayIterator &operator-=(ptrdiff_t n) {
109     ptr_ -= n * N;
110     return *this;
111   }
112 
operator +(Uint32ArrayIterator x,ptrdiff_t n)113   friend Uint32ArrayIterator operator+(Uint32ArrayIterator x, ptrdiff_t n) {
114     return Uint32ArrayIterator(x.ptr_ + n * N);
115   }
116 
operator +(ptrdiff_t n,Uint32ArrayIterator x)117   friend Uint32ArrayIterator operator+(ptrdiff_t n, Uint32ArrayIterator x) {
118     return Uint32ArrayIterator(x.ptr_ + n * N);
119   }
120 
operator -(Uint32ArrayIterator x,ptrdiff_t n)121   friend Uint32ArrayIterator operator-(Uint32ArrayIterator x, ptrdiff_t n) {
122     return Uint32ArrayIterator(x.ptr_ - n * N);
123   }
124 
operator -(Uint32ArrayIterator x,Uint32ArrayIterator y)125   friend ptrdiff_t operator-(Uint32ArrayIterator x, Uint32ArrayIterator y) {
126     return (x.ptr_ - y.ptr_) / N;
127   }
128 
operator ==(Uint32ArrayIterator x,Uint32ArrayIterator y)129   friend bool operator==(Uint32ArrayIterator x, Uint32ArrayIterator y) {
130     return x.ptr_ == y.ptr_;
131   }
132 
operator !=(Uint32ArrayIterator x,Uint32ArrayIterator y)133   friend bool operator!=(Uint32ArrayIterator x, Uint32ArrayIterator y) {
134     return x.ptr_ != y.ptr_;
135   }
136 
operator <(Uint32ArrayIterator x,Uint32ArrayIterator y)137   friend bool operator<(Uint32ArrayIterator x, Uint32ArrayIterator y) {
138     return x.ptr_ < y.ptr_;
139   }
140 
operator <=(Uint32ArrayIterator x,Uint32ArrayIterator y)141   friend bool operator<=(Uint32ArrayIterator x, Uint32ArrayIterator y) {
142     return x.ptr_ <= y.ptr_;
143   }
144 
operator >(Uint32ArrayIterator x,Uint32ArrayIterator y)145   friend bool operator>(Uint32ArrayIterator x, Uint32ArrayIterator y) {
146     return x.ptr_ > y.ptr_;
147   }
148 
operator >=(Uint32ArrayIterator x,Uint32ArrayIterator y)149   friend bool operator>=(Uint32ArrayIterator x, Uint32ArrayIterator y) {
150     return x.ptr_ >= y.ptr_;
151   }
152 
153  private:
154   const uint32 *ptr_;
155 };
156 
157 // Looks up single kanji list from key (reading).  Returns false if not found.
158 // The underlying token array, |single_kanji_token_array|, has the following
159 // format:
160 //
161 // +------------------+
162 // | index of key 0   |
163 // +------------------+
164 // | index of value 0 |
165 // +------------------+
166 // | index of key 1   |
167 // +------------------+
168 // | index of value 1 |
169 // +------------------+
170 // | ...              |
171 //
172 // Here, each element is of uint32 type.  Each of actual string values are
173 // stored in |single_kanji_string_array| at its index.
LookupKanjiList(StringPiece single_kanji_token_array,const SerializedStringArray & single_kanji_string_array,const string & key,std::vector<string> * kanji_list)174 bool LookupKanjiList(StringPiece single_kanji_token_array,
175                      const SerializedStringArray &single_kanji_string_array,
176                      const string &key, std::vector<string> *kanji_list) {
177   DCHECK(kanji_list);
178   const uint32* token_array =
179       reinterpret_cast<const uint32*>(single_kanji_token_array.data());
180   const size_t token_array_size =
181       single_kanji_token_array.size() / sizeof(uint32);
182 
183   const Uint32ArrayIterator<2> end(token_array + token_array_size);
184   const auto iter = std::lower_bound(
185       Uint32ArrayIterator<2>(token_array), end, key,
186       [&single_kanji_string_array](uint32 index, const string &target_key) {
187         return single_kanji_string_array[index] < target_key;
188       });
189   if (iter == end || single_kanji_string_array[iter[0]] != key) {
190     return false;
191   }
192   const StringPiece values = single_kanji_string_array[iter[1]];
193   Util::SplitStringToUtf8Chars(values, kanji_list);
194   return true;
195 }
196 
197 // Generates kanji variant description from key.  Does nothing if not found.
198 // The underlying token array, |variant_token_array|, has the following
199 // format:
200 //
201 // +-------------------------+
202 // | index of target 0       |
203 // +-------------------------+
204 // | index of original 0     |
205 // +-------------------------+
206 // | index of variant type 0 |
207 // +-------------------------+
208 // | index of target 1       |
209 // +-------------------------+
210 // | index of original 1     |
211 // +-------------------------+
212 // | index of variant type 1 |
213 // +-------------------------+
214 // | ...                     |
215 //
216 // Here, each element is of uint32 type.  Actual strings of target and original
217 // are stored in |variant_string_array|, while strings of variant type are
218 // stored in |variant_type|.
GenerateDescription(StringPiece variant_token_array,const SerializedStringArray & variant_string_array,const SerializedStringArray & variant_type,const string & key,string * desc)219 void GenerateDescription(StringPiece variant_token_array,
220                          const SerializedStringArray &variant_string_array,
221                          const SerializedStringArray &variant_type,
222                          const string &key, string *desc) {
223   DCHECK(desc);
224   const uint32 *token_array =
225       reinterpret_cast<const uint32*>(variant_token_array.data());
226   const size_t token_array_size =
227       variant_token_array.size() / sizeof(uint32);
228 
229   const Uint32ArrayIterator<3> end(token_array + token_array_size);
230   const auto iter = std::lower_bound(
231       Uint32ArrayIterator<3>(token_array), end, key,
232       [&variant_string_array](uint32 index, const string &target_key) {
233         return variant_string_array[index] < target_key;
234       });
235   if (iter == end || variant_string_array[iter[0]] != key) {
236     return;
237   }
238   const StringPiece original = variant_string_array[iter[1]];
239   const uint32 type_id = iter[2];
240   DCHECK_LT(type_id, variant_type.size());
241   // Format like "XXXのYYY"
242   desc->assign(original.data(), original.size());
243   desc->append("の");
244   desc->append(variant_type[type_id].data(), variant_type[type_id].size());
245 }
246 
247 // Add single kanji variants description to existing candidates,
248 // because if we have candidates with same value, the lower ranked candidate
249 // will be removed.
AddDescriptionForExsistingCandidates(StringPiece variant_token_array,const SerializedStringArray & variant_string_array,const SerializedStringArray & variant_type,Segment * segment)250 void AddDescriptionForExsistingCandidates(
251     StringPiece variant_token_array,
252     const SerializedStringArray &variant_string_array,
253     const SerializedStringArray &variant_type,
254     Segment *segment) {
255   DCHECK(segment);
256   for (size_t i = 0; i < segment->candidates_size(); ++i) {
257     Segment::Candidate *cand = segment->mutable_candidate(i);
258     if (!cand->description.empty()) {
259       continue;
260     }
261     GenerateDescription(variant_token_array, variant_string_array,
262                         variant_type, cand->value, &cand->description);
263   }
264 }
265 
FillCandidate(StringPiece variant_token_array,const SerializedStringArray & variant_string_array,const SerializedStringArray & variant_type,const string & key,const string & value,int cost,uint16 single_kanji_id,Segment::Candidate * cand)266 void FillCandidate(StringPiece variant_token_array,
267                    const SerializedStringArray &variant_string_array,
268                    const SerializedStringArray &variant_type,
269                    const string &key, const string &value,
270                    int cost, uint16 single_kanji_id,
271                    Segment::Candidate *cand) {
272   cand->lid = single_kanji_id;
273   cand->rid = single_kanji_id;
274   cand->cost = cost;
275   cand->content_key = key;
276   cand->content_value = value;
277   cand->key = key;
278   cand->value = value;
279   cand->attributes |= Segment::Candidate::CONTEXT_SENSITIVE;
280   cand->attributes |= Segment::Candidate::NO_VARIANTS_EXPANSION;
281   GenerateDescription(variant_token_array, variant_string_array,
282                       variant_type, value, &cand->description);
283 }
284 
285 // Insert SingleKanji into segment.
InsertCandidate(StringPiece variant_token_array,const SerializedStringArray & variant_string_array,const SerializedStringArray & variant_type,bool is_single_segment,uint16 single_kanji_id,const std::vector<string> & kanji_list,Segment * segment)286 void InsertCandidate(StringPiece variant_token_array,
287                      const SerializedStringArray &variant_string_array,
288                      const SerializedStringArray &variant_type,
289                      bool is_single_segment,
290                      uint16 single_kanji_id,
291                      const std::vector<string> &kanji_list,
292                      Segment *segment) {
293   DCHECK(segment);
294   if (segment->candidates_size() == 0) {
295     LOG(WARNING) << "candidates_size is 0";
296     return;
297   }
298 
299   const string &candidate_key = ((!segment->key().empty()) ?
300                                  segment->key() :
301                                  segment->candidate(0).key);
302 
303   // Adding 8000 to the single kanji cost
304   // Note that this cost does not make no effect.
305   // Here we set the cost just in case.
306   const int kOffsetCost = 8000;
307 
308   // Append single-kanji
309   for (size_t i = 0; i < kanji_list.size(); ++i) {
310     Segment::Candidate *c = segment->push_back_candidate();
311     FillCandidate(variant_token_array, variant_string_array,
312                   variant_type, candidate_key, kanji_list[i],
313                   kOffsetCost + i, single_kanji_id, c);
314   }
315 }
316 
InsertNounPrefix(const POSMatcher & pos_matcher,Segment * segment,SerializedDictionary::iterator begin,SerializedDictionary::iterator end)317 void InsertNounPrefix(const POSMatcher &pos_matcher,
318                       Segment *segment,
319                       SerializedDictionary::iterator begin,
320                       SerializedDictionary::iterator end) {
321   DCHECK(begin != end);
322 
323   if (segment->candidates_size() == 0) {
324     LOG(WARNING) << "candidates_size is 0";
325     return;
326   }
327 
328   if (segment->segment_type() == Segment::FIXED_VALUE) {
329     return;
330   }
331 
332   const string &candidate_key = ((!segment->key().empty()) ?
333                                  segment->key() :
334                                  segment->candidate(0).key);
335   for (auto iter = begin; iter != end; ++iter) {
336     const int insert_pos = std::min(
337         static_cast<int>(segment->candidates_size()),
338         static_cast<int>(iter.cost() + (segment->candidate(0).attributes &
339                                         Segment::Candidate::CONTEXT_SENSITIVE)
340                              ? 1
341                              : 0));
342     Segment::Candidate *c = segment->insert_candidate(insert_pos);
343     c->lid = pos_matcher.GetNounPrefixId();
344     c->rid = pos_matcher.GetNounPrefixId();
345     c->cost = 5000;
346     c->content_value = string(iter.value());
347     c->key = candidate_key;
348     c->content_key = candidate_key;
349     c->value = string(iter.value());
350     c->attributes |= Segment::Candidate::CONTEXT_SENSITIVE;
351     c->attributes |= Segment::Candidate::NO_VARIANTS_EXPANSION;
352   }
353 }
354 
355 }  // namespace
356 
SingleKanjiRewriter(const DataManagerInterface & data_manager)357 SingleKanjiRewriter::SingleKanjiRewriter(
358     const DataManagerInterface &data_manager)
359     : pos_matcher_(data_manager.GetPOSMatcherData()) {
360   StringPiece string_array_data;
361   StringPiece variant_type_array_data;
362   StringPiece variant_string_array_data;
363   StringPiece noun_prefix_token_array_data;
364   StringPiece noun_prefix_string_array_data;
365   data_manager.GetSingleKanjiRewriterData(
366       &single_kanji_token_array_,
367       &string_array_data,
368       &variant_type_array_data,
369       &variant_token_array_,
370       &variant_string_array_data,
371       &noun_prefix_token_array_data,
372       &noun_prefix_string_array_data);
373   // Single Kanji token array is an array of uint32.  Its size must be multiple
374   // of 2; see the comment above LookupKanjiList.
375   DCHECK_EQ(0, single_kanji_token_array_.size() % (2 * sizeof(uint32)));
376   DCHECK(SerializedStringArray::VerifyData(string_array_data));
377   single_kanji_string_array_.Set(string_array_data);
378 
379   DCHECK(SerializedStringArray::VerifyData(variant_type_array_data));
380   variant_type_array_.Set(variant_type_array_data);
381 
382   // Variant token array is an array of uint32.  Its size must be multiple
383   // of 3; see the comment above GenerateDescription.
384   DCHECK_EQ(0, variant_token_array_.size() % (3 * sizeof(uint32)));
385   DCHECK(SerializedStringArray::VerifyData(variant_string_array_data));
386   variant_string_array_.Set(variant_string_array_data);
387 
388   DCHECK(SerializedDictionary::VerifyData(noun_prefix_token_array_data,
389                                           noun_prefix_string_array_data));
390   noun_prefix_dictionary_.reset(new SerializedDictionary(
391       noun_prefix_token_array_data,
392       noun_prefix_string_array_data));
393 }
394 
~SingleKanjiRewriter()395 SingleKanjiRewriter::~SingleKanjiRewriter() {}
396 
capability(const ConversionRequest & request) const397 int SingleKanjiRewriter::capability(const ConversionRequest &request) const {
398   if (request.request().mixed_conversion()) {
399     return RewriterInterface::ALL;
400   }
401   return RewriterInterface::CONVERSION;
402 }
403 
Rewrite(const ConversionRequest & request,Segments * segments) const404 bool SingleKanjiRewriter::Rewrite(const ConversionRequest &request,
405                                   Segments *segments) const {
406   if (!request.config().use_single_kanji_conversion()) {
407     VLOG(2) << "no use_single_kanji_conversion";
408     return false;
409   }
410 
411   bool modified = false;
412   const size_t segments_size = segments->conversion_segments_size();
413   const bool is_single_segment = (segments_size == 1);
414   for (size_t i = 0; i < segments_size; ++i) {
415     AddDescriptionForExsistingCandidates(
416         variant_token_array_,
417         variant_string_array_,
418         variant_type_array_,
419         segments->mutable_conversion_segment(i));
420 
421     const string &key = segments->conversion_segment(i).key();
422     std::vector<string> kanji_list;
423     if (!LookupKanjiList(single_kanji_token_array_, single_kanji_string_array_,
424                          key, &kanji_list)) {
425       continue;
426     }
427     InsertCandidate(variant_token_array_,
428                     variant_string_array_,
429                     variant_type_array_,
430                     is_single_segment,
431                     pos_matcher_.GetGeneralSymbolId(),
432                     kanji_list,
433                     segments->mutable_conversion_segment(i));
434 
435     modified = true;
436   }
437 
438   // Tweak for noun prefix.
439   // TODO(team): Ideally, this issue can be fixed via the language model
440   // and dictionary generation.
441   for (size_t i = 0; i < segments_size; ++i) {
442     if (segments->conversion_segment(i).candidates_size() == 0) {
443       continue;
444     }
445 
446     if (i + 1 < segments_size) {
447       const Segment::Candidate &right_candidate =
448           segments->conversion_segment(i + 1).candidate(0);
449       // right segment must be a noun.
450       if (!pos_matcher_.IsContentNoun(right_candidate.lid)) {
451         continue;
452       }
453     } else if (segments_size != 1) {  // also apply if segments_size == 1.
454       continue;
455     }
456 
457     const string &key = segments->conversion_segment(i).key();
458     const auto range = noun_prefix_dictionary_->equal_range(key);
459     if (range.first == range.second) {
460       continue;
461     }
462     InsertNounPrefix(pos_matcher_,
463                      segments->mutable_conversion_segment(i),
464                      range.first, range.second);
465     // Ignore the next noun content word.
466     ++i;
467     modified = true;
468   }
469 
470   return modified;
471 }
472 }  // namespace mozc
473