1 // Copyright 2010-2018, Google Inc.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
6 // met:
7 //
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following disclaimer
12 // in the documentation and/or other materials provided with the
13 // distribution.
14 // * Neither the name of Google Inc. nor the names of its
15 // contributors may be used to endorse or promote products derived from
16 // this software without specific prior written permission.
17 //
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
30 #include "composer/internal/char_chunk.h"
31
32 #include <set>
33 #include <string>
34 #include <vector>
35
36 #include "base/logging.h"
37 #include "base/util.h"
38 #include "composer/internal/composition_input.h"
39 #include "composer/internal/transliterators.h"
40 #include "composer/table.h"
41
42 namespace mozc {
43 namespace composer {
44
45 namespace {
46 // Max recursion count for looking up pending loop.
47 const int kMaxRecursion = 4;
48
49 // Delete "end" from "target", if "target" ends with the "end".
DeleteEnd(const string & end,string * target)50 bool DeleteEnd(const string &end, string *target) {
51 const string::size_type rindex = target->rfind(end);
52 if (rindex == string::npos) {
53 return false;
54 }
55 target->erase(rindex);
56 return true;
57 }
58
59 // Get from pending rules recursively
60 // The recursion will be stopped if recursion_count is 0.
61 // When returns false, the caller doesn't append result entries.
62 // If we have the rule,
63 // '1' -> '', 'あ'
64 // 'あ1' -> '', 'い'
65 // 'い1' -> '', 'う'
66 // 'う1' -> '', 'え'
67 // 'え1' -> '', 'お'
68 // 'お1' -> '', 'あ'
69 // 'あ*' -> '', '{*}ぁ'
70 // '{*}ぁ' -> '', '{*}あ'
71 // '{*}あ' -> '', '{*}ぁ'
72 // 'い*' -> '', '{*}ぃ'
73 // '{*}ぃ' -> '', '{*}い'
74 // '{*}い' -> '', '{*}ぃ'
75 // Here, we want to get '{*}あ' <-> '{*}ぁ' loop from the input, 'あ'
GetFromPending(const Table * table,const string & key,int recursion_count,std::set<string> * result)76 bool GetFromPending(const Table *table, const string &key,
77 int recursion_count, std::set<string> *result) {
78 DCHECK(result);
79 if (recursion_count == 0) {
80 // Don't find the loop within the |recursion_count|.
81 return false;
82 }
83 if (result->find(key) != result->end()) {
84 // Found the entry that is already looked up.
85 // Return true because we found the loop.
86 return true;
87 }
88 result->insert(key);
89
90 std::vector<const Entry *> entries;
91 table->LookUpPredictiveAll(key, &entries);
92 for (size_t i = 0; i < entries.size(); ++i) {
93 if (!entries[i]->result().empty()) {
94 // skip rules with result, because this causes too many results.
95 // for example, if we have
96 // 'k' -> 'っ', 'k'
97 // 'ka' -> 'か', ''
98 // From the input 'k', this causes 'か', 'っ', 'っか', ...
99 // So here we stop calling recursion.
100 return false;
101 }
102 if (!GetFromPending(table, entries[i]->pending(),
103 recursion_count - 1, result)) {
104 return false;
105 }
106 }
107 return true;
108 }
109 } // namespace
110
CharChunk(Transliterators::Transliterator transliterator,const Table * table)111 CharChunk::CharChunk(Transliterators::Transliterator transliterator,
112 const Table *table)
113 : transliterator_(transliterator),
114 table_(table),
115 attributes_(NO_TABLE_ATTRIBUTE) {
116 DCHECK_NE(Transliterators::LOCAL, transliterator);
117 }
118
Clear()119 void CharChunk::Clear() {
120 raw_.clear();
121 conversion_.clear();
122 pending_.clear();
123 ambiguous_.clear();
124 }
125
GetLength(Transliterators::Transliterator t12r) const126 size_t CharChunk::GetLength(Transliterators::Transliterator t12r) const {
127 const string t13n = Transliterate(
128 t12r,
129 Table::DeleteSpecialKey(raw_),
130 Table::DeleteSpecialKey(conversion_ + pending_));
131 return Util::CharsLen(t13n);
132 }
133
AppendResult(Transliterators::Transliterator t12r,string * result) const134 void CharChunk::AppendResult(Transliterators::Transliterator t12r,
135 string *result) const {
136 const string t13n = Transliterate(
137 t12r,
138 Table::DeleteSpecialKey(raw_),
139 Table::DeleteSpecialKey(conversion_ + pending_));
140 result->append(t13n);
141 }
142
AppendTrimedResult(Transliterators::Transliterator t12r,string * result) const143 void CharChunk::AppendTrimedResult(Transliterators::Transliterator t12r,
144 string *result) const {
145 // Only determined value (e.g. |conversion_| only) is added.
146 string converted = conversion_;
147 if (!pending_.empty()) {
148 size_t key_length = 0;
149 bool fixed = false;
150 const Entry *entry = table_->LookUpPrefix(pending_, &key_length, &fixed);
151 if (entry != NULL && entry->input() == entry->result()) {
152 converted.append(entry->result());
153 }
154 }
155 result->append(Transliterate(
156 t12r,
157 Table::DeleteSpecialKey(raw_),
158 Table::DeleteSpecialKey(converted)));
159 }
160
AppendFixedResult(Transliterators::Transliterator t12r,string * result) const161 void CharChunk::AppendFixedResult(Transliterators::Transliterator t12r,
162 string *result) const {
163 string converted = conversion_;
164 if (!ambiguous_.empty()) {
165 // Add the |ambiguous_| value as a fixed value. |ambiguous_|
166 // contains an undetermined result string like "ん" converted
167 // from a single 'n'.
168 converted.append(ambiguous_);
169 } else {
170 // If |pending_| exists but |ambiguous_| does not exist,
171 // |pending_| is appended. If |ambiguous_| exists, the value of
172 // |pending_| is usually equal to |ambiguous_| so it is not
173 // appended.
174 converted.append(pending_);
175 }
176 result->append(Transliterate(
177 t12r,
178 Table::DeleteSpecialKey(raw_),
179 Table::DeleteSpecialKey(converted)));
180 }
181
182 // If we have the rule (roman),
183 // 1: 'ka' -> 'か', ''
184 // 2: 'ki' -> 'き', ''
185 // 3: 'ku' -> 'く', ''
186 // 4: 'kk' -> 'っ', 'k'
187 // From the input 'k', we want to correct 'k', 'か', 'き', 'く', 'っ'
188 // We don't expand for next k for rule 4, because it causes
189 // many useless looped results, like 'っか', 'っっか', 'っっ', etc
190 //
191 // If we have the input 'kk', we will get 'か', 'き', 'く', 'っ' from the
192 // pending 'k' of rule 4.
193 // With the result of AppendTrimedResult, 'っ', we can get
194 // 'っか', 'っき', 'っく', 'っっ' from this input.
195 //
196 // If we have the rule (kana),
197 // 'は゜' -> 'ぱ', ''
198 // 'は゛' -> 'ば', ''
199 // From the input 'は', we want 'は', 'ば', 'ぱ'
200 //
201 // For mobile rules,
202 // If we have the rule,
203 // '1' -> '', 'あ'
204 // 'あ1' -> '', 'い'
205 // 'い1' -> '', 'う'
206 // 'う1' -> '', 'え'
207 // 'え1' -> '', 'お'
208 // 'お1' -> '', 'あ'
209 // 'あ*' -> '', '{*}ぁ'
210 // '{*}ぁ' -> '', '{*}あ'
211 // '{*}あ' -> '', '{*}ぁ'
212 // 'い*' -> '', '{*}ぃ'
213 // '{*}ぃ' -> '', '{*}い'
214 // '{*}い' -> '', '{*}ぃ'
215 // From the input '1', we want 'あ', 'ぁ', not including 'い', 'ぃ', 'う', etc
216 //
217 // Actually, we don't need to recursive lookup for above two patterns.
218 //
219 // What we want to append here is the 'looped rule' in |kMaxRecursion| lookup.
220 // Here, '{*}ぁ' -> '{*}あ' -> '{*}ぁ' is the loop.
GetExpandedResults(std::set<string> * results) const221 void CharChunk::GetExpandedResults(std::set<string> *results) const {
222 DCHECK(results);
223
224 if (pending_.empty()) {
225 return;
226 }
227 // Append current pending string
228 if (conversion_.empty()) {
229 results->insert(Table::DeleteSpecialKey(pending_));
230 }
231 std::vector<const Entry *> entries;
232 table_->LookUpPredictiveAll(pending_, &entries);
233 for (size_t i = 0; i < entries.size(); ++i) {
234 if (!entries[i]->result().empty()) {
235 results->insert(Table::DeleteSpecialKey(entries[i]->result()));
236 }
237 if (entries[i]->pending().empty()) {
238 continue;
239 }
240 std::set<string> loop_result;
241 if (!GetFromPending(table_, entries[i]->pending(),
242 kMaxRecursion, &loop_result)) {
243 continue;
244 }
245 for (std::set<string>::const_iterator itr = loop_result.begin();
246 itr != loop_result.end(); ++itr) {
247 results->insert(Table::DeleteSpecialKey(*itr));
248 }
249 }
250 }
251
IsFixed() const252 bool CharChunk::IsFixed() const {
253 return pending_.empty();
254 }
255
IsAppendable(Transliterators::Transliterator t12r,const Table * table) const256 bool CharChunk::IsAppendable(Transliterators::Transliterator t12r,
257 const Table *table) const {
258 return !pending_.empty() &&
259 (t12r == Transliterators::LOCAL || t12r == transliterator_) &&
260 table == table_;
261 }
262
IsConvertible(Transliterators::Transliterator t12r,const Table * table,const string & input) const263 bool CharChunk::IsConvertible(
264 Transliterators::Transliterator t12r,
265 const Table *table,
266 const string &input) const {
267 if (!IsAppendable(t12r, table)) {
268 return false;
269 }
270
271 size_t key_length = 0;
272 bool fixed = false;
273 string key = pending_ + input;
274 const Entry *entry = table->LookUpPrefix(key, &key_length, &fixed);
275
276 return entry && (key.size() == key_length) && fixed;
277 }
278
Combine(const CharChunk & left_chunk)279 void CharChunk::Combine(const CharChunk &left_chunk) {
280 conversion_ = left_chunk.conversion_ + conversion_;
281 raw_ = left_chunk.raw_ + raw_;
282 // TODO(komatsu): This is a hacky way. We should look up the
283 // conversion table with the new |raw_| value.
284 if (left_chunk.ambiguous_.empty()) {
285 ambiguous_.clear();
286 } else {
287 if (ambiguous_.empty()) {
288 ambiguous_ = left_chunk.ambiguous_ + pending_;
289 } else {
290 ambiguous_ = left_chunk.ambiguous_ + ambiguous_;
291 }
292 }
293 pending_ = left_chunk.pending_ + pending_;
294 }
295
AddInputInternal(string * input)296 bool CharChunk::AddInputInternal(string *input) {
297 const bool kNoLoop = false;
298
299 size_t key_length = 0;
300 bool fixed = false;
301 string key = pending_ + *input;
302 const Entry *entry = table_->LookUpPrefix(key, &key_length, &fixed);
303
304 if (entry == NULL) {
305 if (key_length == 0) {
306 // No prefix character is not contained in the table, fallback
307 // operation is performed.
308 if (pending_.empty()) {
309 // Conversion data was not found.
310 AddConvertedChar(input);
311 }
312 return kNoLoop;
313 }
314
315 if (key_length < pending_.size()) {
316 // Do not modify this char_chunk, all key characters will be used
317 // by the next char_chunk.
318 return kNoLoop;
319 }
320
321 DCHECK_GE(key_length, pending_.size());
322 // Some prefix character is contained in the table, but not
323 // reached any conversion result (like "t" with "ta->た").
324 key_length -= pending_.size();
325
326 // Conversion data had only pending.
327 const string new_pending_chars(*input, 0, key_length);
328 raw_.append(new_pending_chars);
329 pending_.append(new_pending_chars);
330 if (!ambiguous_.empty()) {
331 // If ambiguos_ has already a value, ambiguous_ is appended.
332 // ex. "ny" makes ambiguous_ "んy", but "sh" leaves ambiguous_ empty.
333 ambiguous_.append(new_pending_chars);
334 }
335 input->erase(0, key_length);
336 return kNoLoop;
337 }
338
339 // The prefix of key reached a conversion result, thus entry is not NULL.
340
341 if (key.size() == key_length) {
342 const bool is_following_entry = (
343 !conversion_.empty() ||
344 (!raw_.empty() && !pending_.empty() && raw_ != pending_));
345
346 raw_.append(*input);
347 input->clear();
348 if (fixed) {
349 // The whole key has been used, table lookup has reached a fixed
350 // value. It is a stable world. (like "ka->か", "q@->だ").
351 conversion_.append(entry->result());
352 pending_ = entry->pending();
353 ambiguous_.clear();
354
355 // If this entry is the first entry, the table attributes are
356 // applied to this chunk.
357 if (!is_following_entry) {
358 attributes_ = entry->attributes();
359 }
360 } else { // !fixed
361 // The whole string of key reached a conversion result, but the
362 // result is ambiguous (like "n" with "n->ん and na->な").
363 pending_ = key;
364 ambiguous_ = entry->result();
365 }
366 return kNoLoop;
367 }
368
369 // Delete pending_ from raw_ if matched.
370 DeleteEnd(pending_, &raw_);
371
372 // A result was found without any ambiguity.
373 input->assign(key, key_length, key.size() - key_length);
374 raw_.append(key, 0, key_length);
375 conversion_.append(entry->result());
376 pending_ = entry->pending();
377 ambiguous_.clear();
378
379 if (input->empty() || pending_.empty()) {
380 // If the remaining input character or pending character is empty,
381 // there is no reason to continue the looping.
382 return kNoLoop;
383 }
384
385 const bool kLoop = true;
386 return kLoop;
387 }
388
AddInput(string * input)389 void CharChunk::AddInput(string *input) {
390 while (AddInputInternal(input)) {}
391 }
392
AddConvertedChar(string * input)393 void CharChunk::AddConvertedChar(string *input) {
394 // TODO(komatsu) Nice to make "string Util::PopOneChar(string *str);".
395 string first_char = Util::SubString(*input, 0, 1);
396 conversion_.append(first_char);
397 raw_.append(first_char);
398 *input = Util::SubString(*input, 1, string::npos);
399 }
400
AddInputAndConvertedChar(string * key,string * converted_char)401 void CharChunk::AddInputAndConvertedChar(string *key,
402 string *converted_char) {
403 // If this chunk is empty, the key and converted_char are simply
404 // copied.
405 if (raw_.empty() && pending_.empty() && conversion_.empty()) {
406 raw_ = *key;
407 key->clear();
408 pending_ = *converted_char;
409 // TODO(komatsu): We should check if the |converted_char| is
410 // really ambigous or not, otherwise the last character of the
411 // preedit on Kana input is always dropped.
412 ambiguous_ = *converted_char;
413 converted_char->clear();
414
415 // If this entry is the first entry, the table attributes are
416 // applied to this chunk.
417 const Entry *entry = table_->LookUp(pending_);
418 if (entry != NULL) {
419 attributes_ = entry->attributes();
420 }
421 return;
422 }
423
424 const string input = pending_ + *converted_char;
425 size_t key_length = 0;
426 bool fixed = false;
427 const Entry *entry = table_->LookUpPrefix(input, &key_length, &fixed);
428 if (entry == NULL) {
429 // Do not modify this char_chunk, all key and converted_char
430 // values will be used by the next char_chunk.
431 return;
432 }
433
434 // The whole input string was used.
435 if (key_length == input.size()) {
436 raw_.append(*key);
437 if (fixed) {
438 conversion_.append(entry->result());
439 pending_ = entry->pending();
440 ambiguous_.clear();
441 } else { // !fixed
442 // |conversion_| remains the current value.
443 pending_ = entry->result();
444 ambiguous_ = entry->result();
445 }
446 key->clear();
447 converted_char->clear();
448 return;
449 }
450
451 // The following key_length == pending_.size() means the new key and
452 // and converted_char does not affect at all. Do not any thing here
453 // and a new char_chunk will be made for the new key and
454 // converted_char.
455 if (key_length == pending_.size()) {
456 return;
457 }
458
459 // The input is partially used.
460 raw_.append(*key);
461 conversion_.append(entry->result());
462 pending_ = entry->pending();
463 // While the whole key is used in this char_chunk, the
464 // converted_char is separated to this char_chunk and the next
465 // char_chunk. This is not a preferred behavior, but there is no
466 // better way to work around this limitation.
467 key->clear();
468 converted_char->assign(input, key_length, input.size() - key_length);
469 }
470
ShouldCommit() const471 bool CharChunk::ShouldCommit() const {
472 return (attributes_ & DIRECT_INPUT) && pending_.empty();
473 }
474
ShouldInsertNewChunk(const CompositionInput & input) const475 bool CharChunk::ShouldInsertNewChunk(const CompositionInput &input) const {
476 if (raw_.empty() && conversion_.empty() && pending_.empty()) {
477 return false;
478 }
479
480 const bool is_new_input =
481 input.is_new_input() ||
482 ((attributes_ & END_CHUNK) && pending_.empty());
483
484 if (is_new_input &&
485 (table_->HasNewChunkEntry(input.raw()) ||
486 !table_->HasSubRules(input.raw()))) {
487 // Such input which is not on the table is treated as NewChunk entry.
488 return true;
489 }
490 return false;
491 }
492
AddCompositionInput(CompositionInput * input)493 void CharChunk::AddCompositionInput(CompositionInput *input) {
494 if (input->has_conversion()) {
495 AddInputAndConvertedChar(
496 input->mutable_raw(),
497 input->mutable_conversion());
498 return;
499 }
500
501 if (ShouldInsertNewChunk(*input)) {
502 return;
503 }
504 AddInput(input->mutable_raw());
505 }
506
SetTransliterator(const Transliterators::Transliterator transliterator)507 void CharChunk::SetTransliterator(
508 const Transliterators::Transliterator transliterator) {
509 if (transliterator == Transliterators::LOCAL) {
510 // LOCAL transliterator shouldn't be set as local transliterator.
511 // Just ignore.
512 return;
513 }
514 transliterator_ = transliterator;
515 }
516
raw() const517 const string &CharChunk::raw() const {
518 return raw_;
519 }
520
set_raw(const string & raw)521 void CharChunk::set_raw(const string &raw) {
522 raw_ = raw;
523 }
524
conversion() const525 const string &CharChunk::conversion() const {
526 return conversion_;
527 }
528
set_conversion(const string & conversion)529 void CharChunk::set_conversion(const string &conversion) {
530 conversion_ = conversion;
531 }
532
pending() const533 const string &CharChunk::pending() const {
534 return pending_;
535 }
536
set_pending(const string & pending)537 void CharChunk::set_pending(const string &pending) {
538 pending_ = pending;
539 }
540
ambiguous() const541 const string &CharChunk::ambiguous() const {
542 return ambiguous_;
543 }
544
set_ambiguous(const string & ambiguous)545 void CharChunk::set_ambiguous(const string &ambiguous) {
546 ambiguous_ = ambiguous;
547 }
548
SplitChunk(Transliterators::Transliterator t12r,const size_t position,CharChunk ** left_new_chunk)549 bool CharChunk::SplitChunk(Transliterators::Transliterator t12r,
550 const size_t position,
551 CharChunk **left_new_chunk) {
552 if (position <= 0 || position >= GetLength(t12r)) {
553 LOG(WARNING) << "Invalid position: " << position;
554 return false;
555 }
556
557 string raw_lhs, raw_rhs, converted_lhs, converted_rhs;
558 Transliterators::GetTransliterator(GetTransliterator(t12r))->Split(
559 position,
560 Table::DeleteSpecialKey(raw_),
561 Table::DeleteSpecialKey(conversion_ + pending_),
562 &raw_lhs, &raw_rhs, &converted_lhs, &converted_rhs);
563
564 *left_new_chunk = new CharChunk(transliterator_, table_);
565 (*left_new_chunk)->set_raw(raw_lhs);
566 set_raw(raw_rhs);
567
568 if (converted_lhs.size() > conversion_.size()) {
569 // [ conversion | pending ] => [ conv | pend#1 ] [ pend#2 ]
570 const string pending_lhs(converted_lhs, conversion_.size());
571 (*left_new_chunk)->set_conversion(conversion_);
572 (*left_new_chunk)->set_pending(pending_lhs);
573
574 conversion_.clear();
575 pending_ = converted_rhs;
576 ambiguous_.clear();
577 } else {
578 // [ conversion | pending ] => [ conv#1 ] [ conv#2 | pending ]
579 (*left_new_chunk)->set_conversion(converted_lhs);
580 // left_new_chunk->set_pending("");
581 const size_t pending_pos = converted_rhs.size() - pending_.size();
582 conversion_.assign(converted_rhs, 0, pending_pos);
583 // pending_ = pending_;
584 }
585 return true;
586 }
587
588 Transliterators::Transliterator
GetTransliterator(Transliterators::Transliterator transliterator) const589 CharChunk::GetTransliterator(
590 Transliterators::Transliterator transliterator) const {
591 if (attributes_ && NO_TRANSLITERATION) {
592 if (transliterator == Transliterators::LOCAL ||
593 transliterator == Transliterators::HALF_ASCII ||
594 transliterator == Transliterators::FULL_ASCII) {
595 return Transliterators::CONVERSION_STRING;
596 }
597 return transliterator;
598 }
599 if (transliterator == Transliterators::LOCAL) {
600 return transliterator_;
601 }
602 return transliterator;
603 }
604
Transliterate(Transliterators::Transliterator transliterator,const string & raw,const string & converted) const605 string CharChunk::Transliterate(
606 Transliterators::Transliterator transliterator,
607 const string &raw, const string &converted) const {
608 return Transliterators::GetTransliterator(
609 GetTransliterator(transliterator))->Transliterate(raw, converted);
610 }
611
612
Clone() const613 CharChunk *CharChunk::Clone() const {
614 // TODO(hsumita): Implements TransliteratorFactory and uses it instead of
615 // copying a pointer.
616 CharChunk *new_char_chunk = new CharChunk(transliterator_, table_);
617 new_char_chunk->raw_.assign(raw_);
618 new_char_chunk->conversion_.assign(conversion_);
619 new_char_chunk->pending_.assign(pending_);
620 new_char_chunk->ambiguous_.assign(ambiguous_);
621 new_char_chunk->attributes_ = attributes_;
622 return new_char_chunk;
623 }
624
625 } // namespace composer
626 } // namespace mozc
627