1 // Copyright 2010-2018, Google Inc.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
6 // met:
7 //
8 //     * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 //     * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following disclaimer
12 // in the documentation and/or other materials provided with the
13 // distribution.
14 //     * Neither the name of Google Inc. nor the names of its
15 // contributors may be used to endorse or promote products derived from
16 // this software without specific prior written permission.
17 //
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 
30 #include "converter/key_corrector.h"
31 
32 #include <cstring>
33 
34 #include "base/logging.h"
35 #include "base/port.h"
36 #include "base/util.h"
37 
38 namespace mozc {
39 namespace {
40 
41 // maximum key length KeyCorrector can handle
42 // if key is too long, we don't do key correction
43 const size_t kMaxSize = 128;
44 
45 // invalid alignment marker
46 const size_t kInvalidPos = static_cast<size_t>(-1);
47 
48 // "ん" (few "n" pettern)
49 // "んあ" -> "んな"
50 // "んい" -> "んに"
51 // "んう" -> "んぬ"
52 // "んえ" -> "んね"
53 // "んお" -> "んの"
RewriteNN(size_t key_pos,const char * begin,const char * end,size_t * mblen,string * output)54 bool RewriteNN(size_t key_pos,
55                const char *begin, const char *end,
56                size_t *mblen, string *output) {
57   if (key_pos == 0) {
58     *mblen = 0;
59     return false;
60   }
61 
62   const char32 ucs4 = Util::UTF8ToUCS4(begin, end, mblen);
63   if (ucs4 != 0x3093) {  // "ん"
64     *mblen = 0;
65     return false;
66   }
67 
68   if (begin + *mblen >= end) {
69     *mblen = 0;
70     return false;
71   }
72 
73   size_t mblen2 = 0;
74   const uint16 next_ucs4 = Util::UTF8ToUCS4(begin + *mblen, end, &mblen2);
75   uint16 output_ucs4 = 0x0000;
76   switch (next_ucs4) {
77     case 0x3042:  // "あ"
78       output_ucs4 = 0x306A;   // "な"
79       break;
80     case 0x3044:  // "い"
81       output_ucs4 = 0x306B;   // "に"
82       break;
83     case 0x3046:  // "う"
84       output_ucs4 = 0x306C;   // "ぬ"
85       break;
86     case 0x3048:  // "え"
87       output_ucs4 = 0x306D;   // "ね"
88       break;
89     case 0x304A:  // "お"
90       output_ucs4 = 0x306E;   // "の"
91       break;
92     default:
93       break;
94   }
95 
96   if (output_ucs4 != 0x0000) {   // "ん[あいうえお]"
97     Util::UCS4ToUTF8Append(ucs4, output);
98     Util::UCS4ToUTF8Append(output_ucs4, output);
99     *mblen += mblen2;
100     return true;
101   } else {   // others
102     *mblen = 0;
103     return false;
104   }
105 
106   return false;
107 }
108 
109 // "んん" (many "n" pattern)
110 // "([^ん])んん[ん]" -> ignore
111 // "([^ん])んん[あいうえお]" ->  $1 and leave "ん[あいうえお]"
112 // "([^ん])んん[^あいうえお]" -> $1"ん" and leave "[^あいうえお]"
RewriteDoubleNN(size_t key_pos,const char * begin,const char * end,size_t * mblen,string * output)113 bool RewriteDoubleNN(size_t key_pos,
114                      const char *begin, const char *end,
115                      size_t *mblen, string *output) {
116   // 0x3093: "ん"
117   static const uint16 kPattern[] = { 0x0000, 0x3093, 0x3093 };
118 
119   *mblen = 0;
120   uint16 first_char = 0x0000;
121   size_t first_mblen = 0;
122   for (size_t i = 0; i < arraysize(kPattern); ++i) {
123     if (begin >= end) {
124       *mblen = 0;
125       return false;
126     }
127     size_t mblen2 = 0;
128     const char32 ucs4 = Util::UTF8ToUCS4(begin, end, &mblen2);
129     if ((kPattern[i] == 0x0000 && ucs4 != 0x3093 &&
130          Util::GetScriptType(ucs4) == Util::HIRAGANA) ||
131         (kPattern[i] == 0x3093 && ucs4 == 0x3093)) {
132       if (i == 0) {
133         first_mblen = mblen2;
134         first_char = ucs4;
135       }
136     } else {
137       *mblen = 0;
138       return false;
139     }
140     begin += mblen2;
141     *mblen += mblen2;
142   }
143 
144   if (begin >= end) {
145     *mblen = 0;
146     return false;
147   }
148 
149   if (first_char == 0x0000 || first_mblen == 0) {
150     *mblen = 0;
151     return false;
152   }
153 
154   size_t mblen2 = 0;
155   const char32 ucs4 = Util::UTF8ToUCS4(begin, end, &mblen2);
156   if (ucs4 == 0x3093) {   // "ん" ignore
157     *mblen = 0;
158     return false;
159   } else if (ucs4 == 0x3042 ||      // "[あいうえお]"
160              ucs4 == 0x3044 ||
161              ucs4 == 0x3046 ||
162              ucs4 == 0x3048 ||
163              ucs4 == 0x304A) {
164     // drop first "ん" and leave "ん[あいうえお]"
165     // remained part will be handled by RewriteNN(), e.g, "んあ" -> "んな"
166     Util::UCS4ToUTF8Append(first_char, output);
167     // Skip one Hiragana character in UTF-8, which is 3 bytes.
168     *mblen = first_mblen + 3;
169     return true;
170   } else {  // "[^あいうえお]"
171     Util::UCS4ToUTF8Append(first_char, output);
172     Util::UCS4ToUTF8Append(0x3093, output);   // "ん"
173     return true;
174   }
175 
176   return false;
177 }
178 
179 // "に" pattern
180 // "にゃ" -> "んや"
181 // "にゅ" -> "んゆ"
182 // "にょ" -> "んよ"
RewriteNI(size_t key_pos,const char * begin,const char * end,size_t * mblen,string * output)183 bool RewriteNI(size_t key_pos,
184                const char *begin, const char *end,
185                size_t *mblen, string *output) {
186   const char32 ucs4 = Util::UTF8ToUCS4(begin, end, mblen);
187   if (ucs4 != 0x306B) {  // "に"
188     *mblen = 0;
189     return false;
190   }
191 
192   if (begin + *mblen >= end) {
193     *mblen = 0;
194     return false;
195   }
196 
197   size_t mblen2 = 0;
198   const uint16 next_ucs4 = Util::UTF8ToUCS4(begin + *mblen, end, &mblen2);
199   uint16 output_ucs4 = 0x0000;
200   switch (next_ucs4) {
201     case 0x3083:   // "ゃ"
202       output_ucs4 = 0x3084;  // "や"
203       break;
204     case 0x3085:   // "ゅ"
205       output_ucs4 = 0x3086;  // "ゆ"
206       break;
207     case 0x3087:   // "ょ"
208       output_ucs4 = 0x3088;  // "よ"
209       break;
210     default:
211       output_ucs4 = 0x0000;
212       break;
213   }
214 
215   if (output_ucs4 != 0x0000) {
216     Util::UCS4ToUTF8Append(0x3093, output);    // "ん"
217     Util::UCS4ToUTF8Append(output_ucs4, output);
218     *mblen += mblen2;
219     return true;
220   } else {
221     *mblen = 0;
222     return false;
223   }
224 
225   return false;
226 }
227 
228 // "m" Pattern (not BOS)
229 // "m[ばびぶべぼぱぴぷぺぽ]" -> "ん[ばびぶべぼぱぴぷぺぽ]"
RewriteM(size_t key_pos,const char * begin,const char * end,size_t * mblen,string * output)230 bool RewriteM(size_t key_pos,
231               const char *begin, const char *end,
232               size_t *mblen, string *output) {
233   if (key_pos == 0) {
234     *mblen = 0;
235     return false;
236   }
237   const char32 ucs4 = Util::UTF8ToUCS4(begin, end, mblen);
238   // "m" or "m" (don't take capitial letter, as "M" might not be a misspelling)
239   if (ucs4 != 0x006D && ucs4 != 0xFF4D) {
240     *mblen = 0;
241     return false;
242   }
243 
244   if (begin + *mblen >= end) {
245     *mblen = 0;
246     return false;
247   }
248 
249   size_t mblen2 = 0;
250   const uint16 next_ucs4 = Util::UTF8ToUCS4(begin + *mblen, end, &mblen2);
251   // "[はばぱひびぴふぶぷへべぺほぼぽ]" => [0x306F .. 0X307D]
252   // Here we want to take "[は..ぽ]" except for "はひふへほ"
253   if (next_ucs4 % 3 != 0 &&   // not "はひふへほ"
254       next_ucs4 >= 0x306F && next_ucs4 <= 0x307D) {  // "は..ぽ"
255     Util::UCS4ToUTF8Append(0x3093, output);    // "ん"
256     Util::UCS4ToUTF8Append(next_ucs4, output);
257     *mblen += mblen2;
258     return true;
259   } else {
260     *mblen = 0;
261     return false;
262   }
263 
264   return true;
265 }
266 
267 // "きっって" ->" きって"
268 // replace "([^っ])っっ([^っ])" => "$1っ$2"
269 // Don't consider more that three "っっっ"
270 // e.g, "かっっった" -> "かっっった"
RewriteSmallTSU(size_t key_pos,const char * begin,const char * end,size_t * mblen,string * output)271 bool RewriteSmallTSU(size_t key_pos,
272                      const char *begin, const char *end,
273                      size_t *mblen, string *output) {
274   // 0x0000 is a place holder for "[^っ]"
275   // "っ": 0x3063
276   static const uint16 kPattern[] = { 0x0000, 0x3063, 0x3063, 0x0000 };
277 
278   uint16 first_char = 0x0000;
279   uint16 last_char = 0x0000;
280   for (size_t i = 0; i < arraysize(kPattern); ++i) {
281     if (begin >= end) {
282       *mblen = 0;
283       return false;
284     }
285     size_t mblen2 = 0;
286     const char32 ucs4 = Util::UTF8ToUCS4(begin, end, &mblen2);
287     if ((kPattern[i] == 0x0000 && ucs4 != 0x3063 &&
288          Util::GetScriptType(ucs4) == Util::HIRAGANA) ||
289         (kPattern[i] == 0x3063 && ucs4 == 0x3063)) {
290       if (i == 0) {
291         first_char = ucs4;
292       } else if (i == arraysize(kPattern) - 1) {
293         last_char = ucs4;
294       }
295     } else {
296       *mblen = 0;
297       return false;
298     }
299     begin += mblen2;
300     *mblen += mblen2;
301   }
302 
303   if (first_char == 0x0000 || last_char == 0x0000) {
304     *mblen = 0;
305     return false;
306   }
307 
308   Util::UCS4ToUTF8Append(first_char, output);
309   Util::UCS4ToUTF8Append(0x3063, output);   // "っ"
310   Util::UCS4ToUTF8Append(last_char, output);
311 
312   return true;
313 }
314 
315 // Not implemented yet, as they looks minor
316 // "[子音][ゃゅょ][^う]" Pattern
317 // "きゅ[^う] -> きゅう"
318 // "しゅ[^う] -> しゅう"
319 // "ちゅ[^う] -> ちゅう"
320 // "にゅ[^う] -> にゅう"
321 // "ひゅ[^う] -> ひゅう"
322 // "りゅ[^う] -> りゅう"
RewriteYu(size_t key_pos,const char * begin,const char * end,size_t * mblen,string * output)323 bool RewriteYu(size_t key_pos,
324                const char *begin, const char *end,
325                size_t *mblen, string *output) {
326   const char32 first_char = Util::UTF8ToUCS4(begin, end, mblen);
327   if (first_char != 0x304D && first_char != 0x3057 &&
328       first_char != 0x3061 && first_char != 0x306B &&
329       first_char != 0x3072 && first_char != 0x308A) {  // !"きしちにひり"
330     *mblen = 0;
331     return false;
332   }
333 
334   if (begin + *mblen >= end) {
335     *mblen = 0;
336     return false;
337   }
338 
339   size_t mblen2 = 0;
340   const char32 next_char = Util::UTF8ToUCS4(begin + *mblen, end, &mblen2);
341   if (next_char != 0x3085) {   // "ゅ"
342     *mblen = 0;
343     return false;
344   }
345 
346   if (begin + *mblen + mblen2 >= end) {
347     *mblen = 0;
348     return false;
349   }
350 
351   size_t mblen3 = 0;
352   const char32 last_char = Util::UTF8ToUCS4(begin + *mblen + mblen2,
353                                             end, &mblen3);
354   if (last_char == 0x3046) {   // "う"
355     *mblen = 0;
356     return false;
357   }
358 
359   // OK, rewrite
360   *mblen += mblen2;
361   Util::UCS4ToUTF8Append(first_char, output);
362   Util::UCS4ToUTF8Append(next_char, output);   // "ゅ"
363   Util::UCS4ToUTF8Append(0x3046, output);      // "う"
364 
365   return true;
366 }
367 }  // namespace
368 
KeyCorrector(const string & key,InputMode mode,size_t history_size)369 KeyCorrector::KeyCorrector(const string &key, InputMode mode,
370                            size_t history_size)
371     : available_(false), mode_(mode) {
372   CorrectKey(key, mode, history_size);
373 }
374 
KeyCorrector()375 KeyCorrector::KeyCorrector()
376     : available_(false), mode_(ROMAN) {}
377 
~KeyCorrector()378 KeyCorrector::~KeyCorrector() {}
379 
mode() const380 KeyCorrector::InputMode KeyCorrector::mode() const {
381   return mode_;
382 }
383 
IsAvailable() const384 bool KeyCorrector::IsAvailable() const {
385   return available_;
386 }
387 
388 // return corrected key;
corrected_key() const389 const string &KeyCorrector::corrected_key() const {
390   return corrected_key_;
391 }
392 
original_key() const393 const string &KeyCorrector::original_key() const {
394   return original_key_;
395 }
396 
GetCorrectedPosition(const size_t original_key_pos) const397 size_t KeyCorrector::GetCorrectedPosition(const size_t original_key_pos) const {
398   if (original_key_pos < alignment_.size()) {
399     return alignment_[original_key_pos];
400   }
401   return kInvalidPos;
402 }
403 
GetOriginalPosition(const size_t corrected_key_pos) const404 size_t KeyCorrector::GetOriginalPosition(const size_t corrected_key_pos) const {
405   if (corrected_key_pos < rev_alignment_.size()) {
406     return rev_alignment_[corrected_key_pos];
407   }
408   return kInvalidPos;
409 }
410 
Clear()411 void KeyCorrector::Clear() {
412   available_ = false;
413   original_key_.clear();
414   corrected_key_.clear();
415   alignment_.clear();
416   rev_alignment_.clear();
417 }
418 
CorrectKey(const string & key,InputMode mode,size_t history_size)419 bool KeyCorrector::CorrectKey(const string &key, InputMode mode,
420                               size_t history_size) {
421   Clear();
422 
423   // TOOD(taku)  support KANA
424   if (mode == KANA) {
425     return false;
426   }
427 
428   if (key.size() == 0 || key.size() >= kMaxSize) {
429     VLOG(1) << "invalid key length";
430     return false;
431   }
432 
433   original_key_ = key;
434 
435   const char *begin = key.data();
436   const char *end = key.data() + key.size();
437   const char *input_begin = key.data() + history_size;
438   size_t key_pos = 0;
439 
440   while (begin < end) {
441     size_t mblen = 0;
442     const size_t org_len = corrected_key_.size();
443     if (begin < input_begin ||
444         (!RewriteDoubleNN(key_pos, begin, end, &mblen, &corrected_key_) &&
445          !RewriteNN(key_pos, begin, end, &mblen, &corrected_key_) &&
446          !RewriteYu(key_pos,  begin, end, &mblen, &corrected_key_) &&
447          !RewriteNI(key_pos, begin, end, &mblen, &corrected_key_) &&
448          !RewriteSmallTSU(key_pos, begin, end, &mblen, &corrected_key_) &&
449          !RewriteM(key_pos,  begin, end, &mblen, &corrected_key_))) {
450       const char32 ucs4 = Util::UTF8ToUCS4(begin, end, &mblen);
451       Util::UCS4ToUTF8Append(ucs4, &corrected_key_);
452     }
453 
454     const size_t corrected_mblen = corrected_key_.size() - org_len;
455 
456     if (corrected_mblen <= 0 && mblen <= 0) {
457       LOG(ERROR) << "Invalid pattern: " << key;
458       Clear();
459       return false;
460     }
461 
462     // one to one mapping
463     if (mblen == corrected_mblen) {
464       const size_t len = static_cast<size_t>(begin - key.data());
465       for (size_t i = 0; i < mblen; ++i) {
466         alignment_.push_back(org_len + i);
467         rev_alignment_.push_back(len + i);
468       }
469     } else {
470     // NOT a one to one maping, we take fist/last alignment only
471       alignment_.push_back(org_len);
472       for (size_t i = 1; i < mblen; ++i) {
473         alignment_.push_back(kInvalidPos);
474       }
475       rev_alignment_.push_back(static_cast<size_t>(begin - key.data()));
476       for (size_t i = 1; i < corrected_mblen; ++i) {
477         rev_alignment_.push_back(kInvalidPos);
478       }
479     }
480 
481     begin += mblen;
482     ++key_pos;
483   }
484 
485   DCHECK_EQ(original_key_.size(), alignment_.size());
486   DCHECK_EQ(corrected_key_.size(), rev_alignment_.size());
487 
488   available_ = true;
489   return true;
490 }
491 
GetCorrectedPrefix(const size_t original_key_pos,size_t * length) const492 const char *KeyCorrector::GetCorrectedPrefix(const size_t original_key_pos,
493                                              size_t *length) const {
494   if (!IsAvailable()) {
495     *length = 0;
496     return NULL;
497   }
498 
499   if (mode_ == KANA) {
500     *length = 0;
501     return NULL;
502   }
503 
504   const size_t corrected_key_pos = GetCorrectedPosition(original_key_pos);
505   if (!IsValidPosition(corrected_key_pos)) {
506     *length = 0;
507     return NULL;
508   }
509 
510   const char *corrected_substr = corrected_key_.data() + corrected_key_pos;
511   const size_t corrected_length = corrected_key_.size() - corrected_key_pos;
512   const char *original_substr = original_key_.data() + original_key_pos;
513   const size_t original_length = original_key_.size() - original_key_pos;
514   // substrs are different
515   if (original_length != corrected_length ||
516       memcmp(original_substr, corrected_substr, original_length) != 0) {
517     *length = corrected_length;
518     return corrected_substr;
519   }
520 
521   *length = 0;
522   return NULL;
523 }
524 
GetOriginalOffset(const size_t original_key_pos,const size_t new_key_offset) const525 size_t KeyCorrector::GetOriginalOffset(const size_t original_key_pos,
526                                        const size_t new_key_offset) const {
527   if (!IsAvailable()) {
528     return kInvalidPos;
529   }
530 
531   if (mode_ == KANA) {
532     return kInvalidPos;
533   }
534 
535   const size_t corrected_key_pos =
536       GetCorrectedPosition(original_key_pos);
537   if (!IsValidPosition(corrected_key_pos)) {
538     return kInvalidPos;
539   }
540 
541   if (rev_alignment_.size() == corrected_key_pos + new_key_offset) {
542     return (alignment_.size() - GetOriginalPosition(corrected_key_pos));
543   } else {
544     // treat right edge
545     const size_t original_key_pos2 =
546         GetOriginalPosition(corrected_key_pos + new_key_offset);
547 
548     if (!IsValidPosition(original_key_pos2)) {
549       return kInvalidPos;
550     }
551 
552     // Don't allow NULL matching
553     if (original_key_pos2 >= original_key_pos) {
554       return static_cast<size_t>(original_key_pos2 - original_key_pos);
555     }
556   }
557 
558   return kInvalidPos;
559 }
560 
561 // static
IsValidPosition(size_t pos)562 bool KeyCorrector::IsValidPosition(size_t pos) {
563   return (pos != kInvalidPos);
564 }
565 
566 // static
IsInvalidPosition(size_t pos)567 bool KeyCorrector::IsInvalidPosition(size_t pos) {
568   return (pos == kInvalidPos);
569 }
570 
571 // static
InvalidPosition()572 size_t KeyCorrector::InvalidPosition() {
573   return kInvalidPos;
574 }
575 
576 // static
GetCorrectedCostPenalty(const string & key)577 int KeyCorrector::GetCorrectedCostPenalty(const string &key) {
578   // "んん" and "っっ" must be mis-spelling.
579   if (key.find("んん") != string::npos || key.find("っっ") != string::npos) {
580     return 0;
581   }
582   // add 3000 to the original word cost
583   const int kCorrectedCostPenalty = 3000;
584   return kCorrectedCostPenalty;
585 }
586 
587 }  // namespace mozc
588