1 // Copyright 2010-2018, Google Inc.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
6 // met:
7 //
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following disclaimer
12 // in the documentation and/or other materials provided with the
13 // distribution.
14 // * Neither the name of Google Inc. nor the names of its
15 // contributors may be used to endorse or promote products derived from
16 // this software without specific prior written permission.
17 //
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
30 #include "converter/key_corrector.h"
31
32 #include <cstring>
33
34 #include "base/logging.h"
35 #include "base/port.h"
36 #include "base/util.h"
37
38 namespace mozc {
39 namespace {
40
41 // maximum key length KeyCorrector can handle
42 // if key is too long, we don't do key correction
43 const size_t kMaxSize = 128;
44
45 // invalid alignment marker
46 const size_t kInvalidPos = static_cast<size_t>(-1);
47
48 // "ん" (few "n" pettern)
49 // "んあ" -> "んな"
50 // "んい" -> "んに"
51 // "んう" -> "んぬ"
52 // "んえ" -> "んね"
53 // "んお" -> "んの"
RewriteNN(size_t key_pos,const char * begin,const char * end,size_t * mblen,string * output)54 bool RewriteNN(size_t key_pos,
55 const char *begin, const char *end,
56 size_t *mblen, string *output) {
57 if (key_pos == 0) {
58 *mblen = 0;
59 return false;
60 }
61
62 const char32 ucs4 = Util::UTF8ToUCS4(begin, end, mblen);
63 if (ucs4 != 0x3093) { // "ん"
64 *mblen = 0;
65 return false;
66 }
67
68 if (begin + *mblen >= end) {
69 *mblen = 0;
70 return false;
71 }
72
73 size_t mblen2 = 0;
74 const uint16 next_ucs4 = Util::UTF8ToUCS4(begin + *mblen, end, &mblen2);
75 uint16 output_ucs4 = 0x0000;
76 switch (next_ucs4) {
77 case 0x3042: // "あ"
78 output_ucs4 = 0x306A; // "な"
79 break;
80 case 0x3044: // "い"
81 output_ucs4 = 0x306B; // "に"
82 break;
83 case 0x3046: // "う"
84 output_ucs4 = 0x306C; // "ぬ"
85 break;
86 case 0x3048: // "え"
87 output_ucs4 = 0x306D; // "ね"
88 break;
89 case 0x304A: // "お"
90 output_ucs4 = 0x306E; // "の"
91 break;
92 default:
93 break;
94 }
95
96 if (output_ucs4 != 0x0000) { // "ん[あいうえお]"
97 Util::UCS4ToUTF8Append(ucs4, output);
98 Util::UCS4ToUTF8Append(output_ucs4, output);
99 *mblen += mblen2;
100 return true;
101 } else { // others
102 *mblen = 0;
103 return false;
104 }
105
106 return false;
107 }
108
109 // "んん" (many "n" pattern)
110 // "([^ん])んん[ん]" -> ignore
111 // "([^ん])んん[あいうえお]" -> $1 and leave "ん[あいうえお]"
112 // "([^ん])んん[^あいうえお]" -> $1"ん" and leave "[^あいうえお]"
RewriteDoubleNN(size_t key_pos,const char * begin,const char * end,size_t * mblen,string * output)113 bool RewriteDoubleNN(size_t key_pos,
114 const char *begin, const char *end,
115 size_t *mblen, string *output) {
116 // 0x3093: "ん"
117 static const uint16 kPattern[] = { 0x0000, 0x3093, 0x3093 };
118
119 *mblen = 0;
120 uint16 first_char = 0x0000;
121 size_t first_mblen = 0;
122 for (size_t i = 0; i < arraysize(kPattern); ++i) {
123 if (begin >= end) {
124 *mblen = 0;
125 return false;
126 }
127 size_t mblen2 = 0;
128 const char32 ucs4 = Util::UTF8ToUCS4(begin, end, &mblen2);
129 if ((kPattern[i] == 0x0000 && ucs4 != 0x3093 &&
130 Util::GetScriptType(ucs4) == Util::HIRAGANA) ||
131 (kPattern[i] == 0x3093 && ucs4 == 0x3093)) {
132 if (i == 0) {
133 first_mblen = mblen2;
134 first_char = ucs4;
135 }
136 } else {
137 *mblen = 0;
138 return false;
139 }
140 begin += mblen2;
141 *mblen += mblen2;
142 }
143
144 if (begin >= end) {
145 *mblen = 0;
146 return false;
147 }
148
149 if (first_char == 0x0000 || first_mblen == 0) {
150 *mblen = 0;
151 return false;
152 }
153
154 size_t mblen2 = 0;
155 const char32 ucs4 = Util::UTF8ToUCS4(begin, end, &mblen2);
156 if (ucs4 == 0x3093) { // "ん" ignore
157 *mblen = 0;
158 return false;
159 } else if (ucs4 == 0x3042 || // "[あいうえお]"
160 ucs4 == 0x3044 ||
161 ucs4 == 0x3046 ||
162 ucs4 == 0x3048 ||
163 ucs4 == 0x304A) {
164 // drop first "ん" and leave "ん[あいうえお]"
165 // remained part will be handled by RewriteNN(), e.g, "んあ" -> "んな"
166 Util::UCS4ToUTF8Append(first_char, output);
167 // Skip one Hiragana character in UTF-8, which is 3 bytes.
168 *mblen = first_mblen + 3;
169 return true;
170 } else { // "[^あいうえお]"
171 Util::UCS4ToUTF8Append(first_char, output);
172 Util::UCS4ToUTF8Append(0x3093, output); // "ん"
173 return true;
174 }
175
176 return false;
177 }
178
179 // "に" pattern
180 // "にゃ" -> "んや"
181 // "にゅ" -> "んゆ"
182 // "にょ" -> "んよ"
RewriteNI(size_t key_pos,const char * begin,const char * end,size_t * mblen,string * output)183 bool RewriteNI(size_t key_pos,
184 const char *begin, const char *end,
185 size_t *mblen, string *output) {
186 const char32 ucs4 = Util::UTF8ToUCS4(begin, end, mblen);
187 if (ucs4 != 0x306B) { // "に"
188 *mblen = 0;
189 return false;
190 }
191
192 if (begin + *mblen >= end) {
193 *mblen = 0;
194 return false;
195 }
196
197 size_t mblen2 = 0;
198 const uint16 next_ucs4 = Util::UTF8ToUCS4(begin + *mblen, end, &mblen2);
199 uint16 output_ucs4 = 0x0000;
200 switch (next_ucs4) {
201 case 0x3083: // "ゃ"
202 output_ucs4 = 0x3084; // "や"
203 break;
204 case 0x3085: // "ゅ"
205 output_ucs4 = 0x3086; // "ゆ"
206 break;
207 case 0x3087: // "ょ"
208 output_ucs4 = 0x3088; // "よ"
209 break;
210 default:
211 output_ucs4 = 0x0000;
212 break;
213 }
214
215 if (output_ucs4 != 0x0000) {
216 Util::UCS4ToUTF8Append(0x3093, output); // "ん"
217 Util::UCS4ToUTF8Append(output_ucs4, output);
218 *mblen += mblen2;
219 return true;
220 } else {
221 *mblen = 0;
222 return false;
223 }
224
225 return false;
226 }
227
228 // "m" Pattern (not BOS)
229 // "m[ばびぶべぼぱぴぷぺぽ]" -> "ん[ばびぶべぼぱぴぷぺぽ]"
RewriteM(size_t key_pos,const char * begin,const char * end,size_t * mblen,string * output)230 bool RewriteM(size_t key_pos,
231 const char *begin, const char *end,
232 size_t *mblen, string *output) {
233 if (key_pos == 0) {
234 *mblen = 0;
235 return false;
236 }
237 const char32 ucs4 = Util::UTF8ToUCS4(begin, end, mblen);
238 // "m" or "m" (don't take capitial letter, as "M" might not be a misspelling)
239 if (ucs4 != 0x006D && ucs4 != 0xFF4D) {
240 *mblen = 0;
241 return false;
242 }
243
244 if (begin + *mblen >= end) {
245 *mblen = 0;
246 return false;
247 }
248
249 size_t mblen2 = 0;
250 const uint16 next_ucs4 = Util::UTF8ToUCS4(begin + *mblen, end, &mblen2);
251 // "[はばぱひびぴふぶぷへべぺほぼぽ]" => [0x306F .. 0X307D]
252 // Here we want to take "[は..ぽ]" except for "はひふへほ"
253 if (next_ucs4 % 3 != 0 && // not "はひふへほ"
254 next_ucs4 >= 0x306F && next_ucs4 <= 0x307D) { // "は..ぽ"
255 Util::UCS4ToUTF8Append(0x3093, output); // "ん"
256 Util::UCS4ToUTF8Append(next_ucs4, output);
257 *mblen += mblen2;
258 return true;
259 } else {
260 *mblen = 0;
261 return false;
262 }
263
264 return true;
265 }
266
267 // "きっって" ->" きって"
268 // replace "([^っ])っっ([^っ])" => "$1っ$2"
269 // Don't consider more that three "っっっ"
270 // e.g, "かっっった" -> "かっっった"
RewriteSmallTSU(size_t key_pos,const char * begin,const char * end,size_t * mblen,string * output)271 bool RewriteSmallTSU(size_t key_pos,
272 const char *begin, const char *end,
273 size_t *mblen, string *output) {
274 // 0x0000 is a place holder for "[^っ]"
275 // "っ": 0x3063
276 static const uint16 kPattern[] = { 0x0000, 0x3063, 0x3063, 0x0000 };
277
278 uint16 first_char = 0x0000;
279 uint16 last_char = 0x0000;
280 for (size_t i = 0; i < arraysize(kPattern); ++i) {
281 if (begin >= end) {
282 *mblen = 0;
283 return false;
284 }
285 size_t mblen2 = 0;
286 const char32 ucs4 = Util::UTF8ToUCS4(begin, end, &mblen2);
287 if ((kPattern[i] == 0x0000 && ucs4 != 0x3063 &&
288 Util::GetScriptType(ucs4) == Util::HIRAGANA) ||
289 (kPattern[i] == 0x3063 && ucs4 == 0x3063)) {
290 if (i == 0) {
291 first_char = ucs4;
292 } else if (i == arraysize(kPattern) - 1) {
293 last_char = ucs4;
294 }
295 } else {
296 *mblen = 0;
297 return false;
298 }
299 begin += mblen2;
300 *mblen += mblen2;
301 }
302
303 if (first_char == 0x0000 || last_char == 0x0000) {
304 *mblen = 0;
305 return false;
306 }
307
308 Util::UCS4ToUTF8Append(first_char, output);
309 Util::UCS4ToUTF8Append(0x3063, output); // "っ"
310 Util::UCS4ToUTF8Append(last_char, output);
311
312 return true;
313 }
314
315 // Not implemented yet, as they looks minor
316 // "[子音][ゃゅょ][^う]" Pattern
317 // "きゅ[^う] -> きゅう"
318 // "しゅ[^う] -> しゅう"
319 // "ちゅ[^う] -> ちゅう"
320 // "にゅ[^う] -> にゅう"
321 // "ひゅ[^う] -> ひゅう"
322 // "りゅ[^う] -> りゅう"
RewriteYu(size_t key_pos,const char * begin,const char * end,size_t * mblen,string * output)323 bool RewriteYu(size_t key_pos,
324 const char *begin, const char *end,
325 size_t *mblen, string *output) {
326 const char32 first_char = Util::UTF8ToUCS4(begin, end, mblen);
327 if (first_char != 0x304D && first_char != 0x3057 &&
328 first_char != 0x3061 && first_char != 0x306B &&
329 first_char != 0x3072 && first_char != 0x308A) { // !"きしちにひり"
330 *mblen = 0;
331 return false;
332 }
333
334 if (begin + *mblen >= end) {
335 *mblen = 0;
336 return false;
337 }
338
339 size_t mblen2 = 0;
340 const char32 next_char = Util::UTF8ToUCS4(begin + *mblen, end, &mblen2);
341 if (next_char != 0x3085) { // "ゅ"
342 *mblen = 0;
343 return false;
344 }
345
346 if (begin + *mblen + mblen2 >= end) {
347 *mblen = 0;
348 return false;
349 }
350
351 size_t mblen3 = 0;
352 const char32 last_char = Util::UTF8ToUCS4(begin + *mblen + mblen2,
353 end, &mblen3);
354 if (last_char == 0x3046) { // "う"
355 *mblen = 0;
356 return false;
357 }
358
359 // OK, rewrite
360 *mblen += mblen2;
361 Util::UCS4ToUTF8Append(first_char, output);
362 Util::UCS4ToUTF8Append(next_char, output); // "ゅ"
363 Util::UCS4ToUTF8Append(0x3046, output); // "う"
364
365 return true;
366 }
367 } // namespace
368
KeyCorrector(const string & key,InputMode mode,size_t history_size)369 KeyCorrector::KeyCorrector(const string &key, InputMode mode,
370 size_t history_size)
371 : available_(false), mode_(mode) {
372 CorrectKey(key, mode, history_size);
373 }
374
KeyCorrector()375 KeyCorrector::KeyCorrector()
376 : available_(false), mode_(ROMAN) {}
377
~KeyCorrector()378 KeyCorrector::~KeyCorrector() {}
379
mode() const380 KeyCorrector::InputMode KeyCorrector::mode() const {
381 return mode_;
382 }
383
IsAvailable() const384 bool KeyCorrector::IsAvailable() const {
385 return available_;
386 }
387
388 // return corrected key;
corrected_key() const389 const string &KeyCorrector::corrected_key() const {
390 return corrected_key_;
391 }
392
original_key() const393 const string &KeyCorrector::original_key() const {
394 return original_key_;
395 }
396
GetCorrectedPosition(const size_t original_key_pos) const397 size_t KeyCorrector::GetCorrectedPosition(const size_t original_key_pos) const {
398 if (original_key_pos < alignment_.size()) {
399 return alignment_[original_key_pos];
400 }
401 return kInvalidPos;
402 }
403
GetOriginalPosition(const size_t corrected_key_pos) const404 size_t KeyCorrector::GetOriginalPosition(const size_t corrected_key_pos) const {
405 if (corrected_key_pos < rev_alignment_.size()) {
406 return rev_alignment_[corrected_key_pos];
407 }
408 return kInvalidPos;
409 }
410
Clear()411 void KeyCorrector::Clear() {
412 available_ = false;
413 original_key_.clear();
414 corrected_key_.clear();
415 alignment_.clear();
416 rev_alignment_.clear();
417 }
418
CorrectKey(const string & key,InputMode mode,size_t history_size)419 bool KeyCorrector::CorrectKey(const string &key, InputMode mode,
420 size_t history_size) {
421 Clear();
422
423 // TOOD(taku) support KANA
424 if (mode == KANA) {
425 return false;
426 }
427
428 if (key.size() == 0 || key.size() >= kMaxSize) {
429 VLOG(1) << "invalid key length";
430 return false;
431 }
432
433 original_key_ = key;
434
435 const char *begin = key.data();
436 const char *end = key.data() + key.size();
437 const char *input_begin = key.data() + history_size;
438 size_t key_pos = 0;
439
440 while (begin < end) {
441 size_t mblen = 0;
442 const size_t org_len = corrected_key_.size();
443 if (begin < input_begin ||
444 (!RewriteDoubleNN(key_pos, begin, end, &mblen, &corrected_key_) &&
445 !RewriteNN(key_pos, begin, end, &mblen, &corrected_key_) &&
446 !RewriteYu(key_pos, begin, end, &mblen, &corrected_key_) &&
447 !RewriteNI(key_pos, begin, end, &mblen, &corrected_key_) &&
448 !RewriteSmallTSU(key_pos, begin, end, &mblen, &corrected_key_) &&
449 !RewriteM(key_pos, begin, end, &mblen, &corrected_key_))) {
450 const char32 ucs4 = Util::UTF8ToUCS4(begin, end, &mblen);
451 Util::UCS4ToUTF8Append(ucs4, &corrected_key_);
452 }
453
454 const size_t corrected_mblen = corrected_key_.size() - org_len;
455
456 if (corrected_mblen <= 0 && mblen <= 0) {
457 LOG(ERROR) << "Invalid pattern: " << key;
458 Clear();
459 return false;
460 }
461
462 // one to one mapping
463 if (mblen == corrected_mblen) {
464 const size_t len = static_cast<size_t>(begin - key.data());
465 for (size_t i = 0; i < mblen; ++i) {
466 alignment_.push_back(org_len + i);
467 rev_alignment_.push_back(len + i);
468 }
469 } else {
470 // NOT a one to one maping, we take fist/last alignment only
471 alignment_.push_back(org_len);
472 for (size_t i = 1; i < mblen; ++i) {
473 alignment_.push_back(kInvalidPos);
474 }
475 rev_alignment_.push_back(static_cast<size_t>(begin - key.data()));
476 for (size_t i = 1; i < corrected_mblen; ++i) {
477 rev_alignment_.push_back(kInvalidPos);
478 }
479 }
480
481 begin += mblen;
482 ++key_pos;
483 }
484
485 DCHECK_EQ(original_key_.size(), alignment_.size());
486 DCHECK_EQ(corrected_key_.size(), rev_alignment_.size());
487
488 available_ = true;
489 return true;
490 }
491
GetCorrectedPrefix(const size_t original_key_pos,size_t * length) const492 const char *KeyCorrector::GetCorrectedPrefix(const size_t original_key_pos,
493 size_t *length) const {
494 if (!IsAvailable()) {
495 *length = 0;
496 return NULL;
497 }
498
499 if (mode_ == KANA) {
500 *length = 0;
501 return NULL;
502 }
503
504 const size_t corrected_key_pos = GetCorrectedPosition(original_key_pos);
505 if (!IsValidPosition(corrected_key_pos)) {
506 *length = 0;
507 return NULL;
508 }
509
510 const char *corrected_substr = corrected_key_.data() + corrected_key_pos;
511 const size_t corrected_length = corrected_key_.size() - corrected_key_pos;
512 const char *original_substr = original_key_.data() + original_key_pos;
513 const size_t original_length = original_key_.size() - original_key_pos;
514 // substrs are different
515 if (original_length != corrected_length ||
516 memcmp(original_substr, corrected_substr, original_length) != 0) {
517 *length = corrected_length;
518 return corrected_substr;
519 }
520
521 *length = 0;
522 return NULL;
523 }
524
GetOriginalOffset(const size_t original_key_pos,const size_t new_key_offset) const525 size_t KeyCorrector::GetOriginalOffset(const size_t original_key_pos,
526 const size_t new_key_offset) const {
527 if (!IsAvailable()) {
528 return kInvalidPos;
529 }
530
531 if (mode_ == KANA) {
532 return kInvalidPos;
533 }
534
535 const size_t corrected_key_pos =
536 GetCorrectedPosition(original_key_pos);
537 if (!IsValidPosition(corrected_key_pos)) {
538 return kInvalidPos;
539 }
540
541 if (rev_alignment_.size() == corrected_key_pos + new_key_offset) {
542 return (alignment_.size() - GetOriginalPosition(corrected_key_pos));
543 } else {
544 // treat right edge
545 const size_t original_key_pos2 =
546 GetOriginalPosition(corrected_key_pos + new_key_offset);
547
548 if (!IsValidPosition(original_key_pos2)) {
549 return kInvalidPos;
550 }
551
552 // Don't allow NULL matching
553 if (original_key_pos2 >= original_key_pos) {
554 return static_cast<size_t>(original_key_pos2 - original_key_pos);
555 }
556 }
557
558 return kInvalidPos;
559 }
560
561 // static
IsValidPosition(size_t pos)562 bool KeyCorrector::IsValidPosition(size_t pos) {
563 return (pos != kInvalidPos);
564 }
565
566 // static
IsInvalidPosition(size_t pos)567 bool KeyCorrector::IsInvalidPosition(size_t pos) {
568 return (pos == kInvalidPos);
569 }
570
571 // static
InvalidPosition()572 size_t KeyCorrector::InvalidPosition() {
573 return kInvalidPos;
574 }
575
576 // static
GetCorrectedCostPenalty(const string & key)577 int KeyCorrector::GetCorrectedCostPenalty(const string &key) {
578 // "んん" and "っっ" must be mis-spelling.
579 if (key.find("んん") != string::npos || key.find("っっ") != string::npos) {
580 return 0;
581 }
582 // add 3000 to the original word cost
583 const int kCorrectedCostPenalty = 3000;
584 return kCorrectedCostPenalty;
585 }
586
587 } // namespace mozc
588