1 // Copyright 2010-2018, Google Inc.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
6 // met:
7 //
8 //     * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 //     * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following disclaimer
12 // in the documentation and/or other materials provided with the
13 // distribution.
14 //     * Neither the name of Google Inc. nor the names of its
15 // contributors may be used to endorse or promote products derived from
16 // this software without specific prior written permission.
17 //
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 
30 #include "composer/internal/char_chunk.h"
31 
32 #include <set>
33 #include <string>
34 #include <vector>
35 
36 #include "base/logging.h"
37 #include "base/util.h"
38 #include "composer/internal/composition_input.h"
39 #include "composer/internal/transliterators.h"
40 #include "composer/table.h"
41 
42 namespace mozc {
43 namespace composer {
44 
45 namespace {
46 // Max recursion count for looking up pending loop.
47 const int kMaxRecursion = 4;
48 
49 // Delete "end" from "target", if "target" ends with the "end".
DeleteEnd(const string & end,string * target)50 bool DeleteEnd(const string &end, string *target) {
51   const string::size_type rindex = target->rfind(end);
52   if (rindex == string::npos) {
53     return false;
54   }
55   target->erase(rindex);
56   return true;
57 }
58 
59 // Get from pending rules recursively
60 // The recursion will be stopped if recursion_count is 0.
61 // When returns false, the caller doesn't append result entries.
62 // If we have the rule,
63 // '1' -> '', 'あ'
64 // 'あ1' -> '', 'い'
65 // 'い1' -> '', 'う'
66 // 'う1' -> '', 'え'
67 // 'え1' -> '', 'お'
68 // 'お1' -> '', 'あ'
69 // 'あ*' -> '', '{*}ぁ'
70 // '{*}ぁ' -> '', '{*}あ'
71 // '{*}あ' -> '', '{*}ぁ'
72 // 'い*' -> '', '{*}ぃ'
73 // '{*}ぃ' -> '', '{*}い'
74 // '{*}い' -> '', '{*}ぃ'
75 // Here, we want to get '{*}あ' <-> '{*}ぁ' loop from the input, 'あ'
GetFromPending(const Table * table,const string & key,int recursion_count,std::set<string> * result)76 bool GetFromPending(const Table *table, const string &key,
77                     int recursion_count, std::set<string> *result) {
78   DCHECK(result);
79   if (recursion_count == 0) {
80     // Don't find the loop within the |recursion_count|.
81     return false;
82   }
83   if (result->find(key) != result->end()) {
84     // Found the entry that is already looked up.
85     // Return true because we found the loop.
86     return true;
87   }
88   result->insert(key);
89 
90   std::vector<const Entry *> entries;
91   table->LookUpPredictiveAll(key, &entries);
92   for (size_t i = 0; i < entries.size(); ++i) {
93     if (!entries[i]->result().empty()) {
94       // skip rules with result, because this causes too many results.
95       // for example, if we have
96       //  'k' -> 'っ', 'k'
97       //  'ka' -> 'か', ''
98       // From the input 'k', this causes 'か', 'っ', 'っか', ...
99       // So here we stop calling recursion.
100       return false;
101     }
102     if (!GetFromPending(table, entries[i]->pending(),
103                         recursion_count - 1, result)) {
104       return false;
105     }
106   }
107   return true;
108 }
109 }  // namespace
110 
CharChunk(Transliterators::Transliterator transliterator,const Table * table)111 CharChunk::CharChunk(Transliterators::Transliterator transliterator,
112                      const Table *table)
113     : transliterator_(transliterator),
114       table_(table),
115       attributes_(NO_TABLE_ATTRIBUTE) {
116   DCHECK_NE(Transliterators::LOCAL, transliterator);
117 }
118 
Clear()119 void CharChunk::Clear() {
120   raw_.clear();
121   conversion_.clear();
122   pending_.clear();
123   ambiguous_.clear();
124 }
125 
GetLength(Transliterators::Transliterator t12r) const126 size_t CharChunk::GetLength(Transliterators::Transliterator t12r) const {
127   const string t13n = Transliterate(
128       t12r,
129       Table::DeleteSpecialKey(raw_),
130       Table::DeleteSpecialKey(conversion_ + pending_));
131   return Util::CharsLen(t13n);
132 }
133 
AppendResult(Transliterators::Transliterator t12r,string * result) const134 void CharChunk::AppendResult(Transliterators::Transliterator t12r,
135                              string *result) const {
136   const string t13n = Transliterate(
137       t12r,
138       Table::DeleteSpecialKey(raw_),
139       Table::DeleteSpecialKey(conversion_ + pending_));
140   result->append(t13n);
141 }
142 
AppendTrimedResult(Transliterators::Transliterator t12r,string * result) const143 void CharChunk::AppendTrimedResult(Transliterators::Transliterator t12r,
144                                    string *result) const {
145   // Only determined value (e.g. |conversion_| only) is added.
146   string converted = conversion_;
147   if (!pending_.empty()) {
148     size_t key_length = 0;
149     bool fixed = false;
150     const Entry *entry = table_->LookUpPrefix(pending_, &key_length, &fixed);
151     if (entry != NULL && entry->input() == entry->result()) {
152       converted.append(entry->result());
153     }
154   }
155   result->append(Transliterate(
156       t12r,
157       Table::DeleteSpecialKey(raw_),
158       Table::DeleteSpecialKey(converted)));
159 }
160 
AppendFixedResult(Transliterators::Transliterator t12r,string * result) const161 void CharChunk::AppendFixedResult(Transliterators::Transliterator t12r,
162                                   string *result) const {
163   string converted = conversion_;
164   if (!ambiguous_.empty()) {
165     // Add the |ambiguous_| value as a fixed value.  |ambiguous_|
166     // contains an undetermined result string like "ん" converted
167     // from a single 'n'.
168     converted.append(ambiguous_);
169   } else {
170     // If |pending_| exists but |ambiguous_| does not exist,
171     // |pending_| is appended.  If |ambiguous_| exists, the value of
172     // |pending_| is usually equal to |ambiguous_| so it is not
173     // appended.
174     converted.append(pending_);
175   }
176   result->append(Transliterate(
177       t12r,
178       Table::DeleteSpecialKey(raw_),
179       Table::DeleteSpecialKey(converted)));
180 }
181 
182 // If we have the rule (roman),
183 // 1: 'ka' -> 'か', ''
184 // 2: 'ki' -> 'き', ''
185 // 3: 'ku' -> 'く', ''
186 // 4: 'kk' -> 'っ', 'k'
187 // From the input 'k', we want to correct 'k', 'か', 'き', 'く', 'っ'
188 // We don't expand for next k for rule 4, because it causes
189 // many useless looped results, like 'っか', 'っっか', 'っっ', etc
190 //
191 // If we have the input 'kk', we will get 'か', 'き', 'く', 'っ' from the
192 // pending 'k' of rule 4.
193 // With the result of AppendTrimedResult, 'っ', we can get
194 // 'っか', 'っき', 'っく', 'っっ' from this input.
195 //
196 // If we have the rule (kana),
197 // 'は゜' -> 'ぱ', ''
198 // 'は゛' -> 'ば', ''
199 // From the input 'は', we want 'は', 'ば', 'ぱ'
200 //
201 // For mobile rules,
202 // If we have the rule,
203 // '1' -> '', 'あ'
204 // 'あ1' -> '', 'い'
205 // 'い1' -> '', 'う'
206 // 'う1' -> '', 'え'
207 // 'え1' -> '', 'お'
208 // 'お1' -> '', 'あ'
209 // 'あ*' -> '', '{*}ぁ'
210 // '{*}ぁ' -> '', '{*}あ'
211 // '{*}あ' -> '', '{*}ぁ'
212 // 'い*' -> '', '{*}ぃ'
213 // '{*}ぃ' -> '', '{*}い'
214 // '{*}い' -> '', '{*}ぃ'
215 // From the input '1', we want 'あ', 'ぁ', not including 'い', 'ぃ', 'う', etc
216 //
217 // Actually, we don't need to recursive lookup for above two patterns.
218 //
219 // What we want to append here is the 'looped rule' in |kMaxRecursion| lookup.
220 // Here, '{*}ぁ' -> '{*}あ' -> '{*}ぁ' is the loop.
GetExpandedResults(std::set<string> * results) const221 void CharChunk::GetExpandedResults(std::set<string> *results) const {
222   DCHECK(results);
223 
224   if (pending_.empty()) {
225     return;
226   }
227   // Append current pending string
228   if (conversion_.empty()) {
229     results->insert(Table::DeleteSpecialKey(pending_));
230   }
231   std::vector<const Entry *> entries;
232   table_->LookUpPredictiveAll(pending_, &entries);
233   for (size_t i = 0; i < entries.size(); ++i) {
234     if (!entries[i]->result().empty()) {
235       results->insert(Table::DeleteSpecialKey(entries[i]->result()));
236     }
237     if (entries[i]->pending().empty()) {
238       continue;
239     }
240     std::set<string> loop_result;
241     if (!GetFromPending(table_, entries[i]->pending(),
242                         kMaxRecursion, &loop_result)) {
243       continue;
244     }
245     for (std::set<string>::const_iterator itr = loop_result.begin();
246          itr != loop_result.end(); ++itr) {
247       results->insert(Table::DeleteSpecialKey(*itr));
248     }
249   }
250 }
251 
IsFixed() const252 bool CharChunk::IsFixed() const {
253   return pending_.empty();
254 }
255 
IsAppendable(Transliterators::Transliterator t12r,const Table * table) const256 bool CharChunk::IsAppendable(Transliterators::Transliterator t12r,
257                              const Table *table) const {
258   return !pending_.empty() &&
259       (t12r == Transliterators::LOCAL || t12r == transliterator_) &&
260       table == table_;
261 }
262 
IsConvertible(Transliterators::Transliterator t12r,const Table * table,const string & input) const263 bool CharChunk::IsConvertible(
264     Transliterators::Transliterator t12r,
265     const Table *table,
266     const string &input) const {
267   if (!IsAppendable(t12r, table)) {
268     return false;
269   }
270 
271   size_t key_length = 0;
272   bool fixed = false;
273   string key = pending_ + input;
274   const Entry *entry = table->LookUpPrefix(key, &key_length, &fixed);
275 
276   return entry && (key.size() == key_length) && fixed;
277 }
278 
Combine(const CharChunk & left_chunk)279 void CharChunk::Combine(const CharChunk &left_chunk) {
280   conversion_ = left_chunk.conversion_ + conversion_;
281   raw_ = left_chunk.raw_ + raw_;
282   // TODO(komatsu): This is a hacky way.  We should look up the
283   // conversion table with the new |raw_| value.
284   if (left_chunk.ambiguous_.empty()) {
285     ambiguous_.clear();
286   } else {
287     if (ambiguous_.empty()) {
288       ambiguous_ = left_chunk.ambiguous_ + pending_;
289     } else {
290       ambiguous_ = left_chunk.ambiguous_ + ambiguous_;
291     }
292   }
293   pending_ = left_chunk.pending_ + pending_;
294 }
295 
AddInputInternal(string * input)296 bool CharChunk::AddInputInternal(string *input) {
297   const bool kNoLoop = false;
298 
299   size_t key_length = 0;
300   bool fixed = false;
301   string key = pending_ + *input;
302   const Entry *entry = table_->LookUpPrefix(key, &key_length, &fixed);
303 
304   if (entry == NULL) {
305     if (key_length == 0) {
306       // No prefix character is not contained in the table, fallback
307       // operation is performed.
308       if (pending_.empty()) {
309         // Conversion data was not found.
310         AddConvertedChar(input);
311       }
312       return kNoLoop;
313     }
314 
315     if (key_length < pending_.size()) {
316       // Do not modify this char_chunk, all key characters will be used
317       // by the next char_chunk.
318       return kNoLoop;
319     }
320 
321     DCHECK_GE(key_length, pending_.size());
322     // Some prefix character is contained in the table, but not
323     // reached any conversion result (like "t" with "ta->た").
324     key_length -= pending_.size();
325 
326     // Conversion data had only pending.
327     const string new_pending_chars(*input, 0, key_length);
328     raw_.append(new_pending_chars);
329     pending_.append(new_pending_chars);
330     if (!ambiguous_.empty()) {
331       // If ambiguos_ has already a value, ambiguous_ is appended.
332       // ex. "ny" makes ambiguous_ "んy", but "sh" leaves ambiguous_ empty.
333       ambiguous_.append(new_pending_chars);
334     }
335     input->erase(0, key_length);
336     return kNoLoop;
337   }
338 
339   // The prefix of key reached a conversion result, thus entry is not NULL.
340 
341   if (key.size() == key_length) {
342     const bool is_following_entry = (
343         !conversion_.empty() ||
344         (!raw_.empty() && !pending_.empty() && raw_ != pending_));
345 
346     raw_.append(*input);
347     input->clear();
348     if (fixed) {
349       // The whole key has been used, table lookup has reached a fixed
350       // value.  It is a stable world. (like "ka->か", "q@->だ").
351       conversion_.append(entry->result());
352       pending_ = entry->pending();
353       ambiguous_.clear();
354 
355       // If this entry is the first entry, the table attributes are
356       // applied to this chunk.
357       if (!is_following_entry) {
358         attributes_ = entry->attributes();
359       }
360     } else {  // !fixed
361       // The whole string of key reached a conversion result, but the
362       // result is ambiguous (like "n" with "n->ん and na->な").
363       pending_ = key;
364       ambiguous_ = entry->result();
365     }
366     return kNoLoop;
367   }
368 
369   // Delete pending_ from raw_ if matched.
370   DeleteEnd(pending_, &raw_);
371 
372   // A result was found without any ambiguity.
373   input->assign(key, key_length, key.size() - key_length);
374   raw_.append(key, 0, key_length);
375   conversion_.append(entry->result());
376   pending_ = entry->pending();
377   ambiguous_.clear();
378 
379   if (input->empty() || pending_.empty()) {
380     // If the remaining input character or pending character is empty,
381     // there is no reason to continue the looping.
382     return kNoLoop;
383   }
384 
385   const bool kLoop = true;
386   return kLoop;
387 }
388 
AddInput(string * input)389 void CharChunk::AddInput(string *input) {
390   while (AddInputInternal(input)) {}
391 }
392 
AddConvertedChar(string * input)393 void CharChunk::AddConvertedChar(string *input) {
394   // TODO(komatsu) Nice to make "string Util::PopOneChar(string *str);".
395   string first_char = Util::SubString(*input, 0, 1);
396   conversion_.append(first_char);
397   raw_.append(first_char);
398   *input = Util::SubString(*input, 1, string::npos);
399 }
400 
AddInputAndConvertedChar(string * key,string * converted_char)401 void CharChunk::AddInputAndConvertedChar(string *key,
402                                          string *converted_char) {
403   // If this chunk is empty, the key and converted_char are simply
404   // copied.
405   if (raw_.empty() && pending_.empty() && conversion_.empty()) {
406     raw_ = *key;
407     key->clear();
408     pending_ = *converted_char;
409     // TODO(komatsu): We should check if the |converted_char| is
410     // really ambigous or not, otherwise the last character of the
411     // preedit on Kana input is always dropped.
412     ambiguous_ = *converted_char;
413     converted_char->clear();
414 
415     // If this entry is the first entry, the table attributes are
416     // applied to this chunk.
417     const Entry *entry = table_->LookUp(pending_);
418     if (entry != NULL) {
419       attributes_ = entry->attributes();
420     }
421     return;
422   }
423 
424   const string input = pending_ + *converted_char;
425   size_t key_length = 0;
426   bool fixed = false;
427   const Entry *entry = table_->LookUpPrefix(input, &key_length, &fixed);
428   if (entry == NULL) {
429     // Do not modify this char_chunk, all key and converted_char
430     // values will be used by the next char_chunk.
431     return;
432   }
433 
434   // The whole input string was used.
435   if (key_length == input.size()) {
436     raw_.append(*key);
437     if (fixed) {
438       conversion_.append(entry->result());
439       pending_ = entry->pending();
440       ambiguous_.clear();
441     } else {  // !fixed
442       // |conversion_| remains the current value.
443       pending_ = entry->result();
444       ambiguous_ = entry->result();
445     }
446     key->clear();
447     converted_char->clear();
448     return;
449   }
450 
451   // The following key_length == pending_.size() means the new key and
452   // and converted_char does not affect at all.  Do not any thing here
453   // and a new char_chunk will be made for the new key and
454   // converted_char.
455   if (key_length == pending_.size()) {
456     return;
457   }
458 
459   // The input is partially used.
460   raw_.append(*key);
461   conversion_.append(entry->result());
462   pending_ = entry->pending();
463   // While the whole key is used in this char_chunk, the
464   // converted_char is separated to this char_chunk and the next
465   // char_chunk.  This is not a preferred behavior, but there is no
466   // better way to work around this limitation.
467   key->clear();
468   converted_char->assign(input, key_length, input.size() - key_length);
469 }
470 
ShouldCommit() const471 bool CharChunk::ShouldCommit() const {
472   return (attributes_ & DIRECT_INPUT) && pending_.empty();
473 }
474 
ShouldInsertNewChunk(const CompositionInput & input) const475 bool CharChunk::ShouldInsertNewChunk(const CompositionInput &input) const {
476   if (raw_.empty() && conversion_.empty() && pending_.empty()) {
477     return false;
478   }
479 
480   const bool is_new_input =
481       input.is_new_input() ||
482       ((attributes_ & END_CHUNK) && pending_.empty());
483 
484   if (is_new_input &&
485       (table_->HasNewChunkEntry(input.raw()) ||
486           !table_->HasSubRules(input.raw()))) {
487     // Such input which is not on the table is treated as NewChunk entry.
488     return true;
489   }
490   return false;
491 }
492 
AddCompositionInput(CompositionInput * input)493 void CharChunk::AddCompositionInput(CompositionInput *input) {
494   if (input->has_conversion()) {
495     AddInputAndConvertedChar(
496                              input->mutable_raw(),
497                              input->mutable_conversion());
498     return;
499   }
500 
501   if (ShouldInsertNewChunk(*input)) {
502     return;
503   }
504   AddInput(input->mutable_raw());
505 }
506 
SetTransliterator(const Transliterators::Transliterator transliterator)507 void CharChunk::SetTransliterator(
508     const Transliterators::Transliterator transliterator) {
509   if (transliterator == Transliterators::LOCAL) {
510     // LOCAL transliterator shouldn't be set as local transliterator.
511     // Just ignore.
512     return;
513   }
514   transliterator_ = transliterator;
515 }
516 
raw() const517 const string &CharChunk::raw() const {
518   return raw_;
519 }
520 
set_raw(const string & raw)521 void CharChunk::set_raw(const string &raw) {
522   raw_ = raw;
523 }
524 
conversion() const525 const string &CharChunk::conversion() const {
526   return conversion_;
527 }
528 
set_conversion(const string & conversion)529 void CharChunk::set_conversion(const string &conversion) {
530   conversion_ = conversion;
531 }
532 
pending() const533 const string &CharChunk::pending() const {
534   return pending_;
535 }
536 
set_pending(const string & pending)537 void CharChunk::set_pending(const string &pending) {
538   pending_ = pending;
539 }
540 
ambiguous() const541 const string &CharChunk::ambiguous() const {
542   return ambiguous_;
543 }
544 
set_ambiguous(const string & ambiguous)545 void CharChunk::set_ambiguous(const string &ambiguous) {
546   ambiguous_ = ambiguous;
547 }
548 
SplitChunk(Transliterators::Transliterator t12r,const size_t position,CharChunk ** left_new_chunk)549 bool CharChunk::SplitChunk(Transliterators::Transliterator t12r,
550                            const size_t position,
551                            CharChunk **left_new_chunk) {
552   if (position <= 0 || position >= GetLength(t12r)) {
553     LOG(WARNING) << "Invalid position: " << position;
554     return false;
555   }
556 
557   string raw_lhs, raw_rhs, converted_lhs, converted_rhs;
558   Transliterators::GetTransliterator(GetTransliterator(t12r))->Split(
559       position,
560       Table::DeleteSpecialKey(raw_),
561       Table::DeleteSpecialKey(conversion_ + pending_),
562       &raw_lhs, &raw_rhs, &converted_lhs, &converted_rhs);
563 
564   *left_new_chunk = new CharChunk(transliterator_, table_);
565   (*left_new_chunk)->set_raw(raw_lhs);
566   set_raw(raw_rhs);
567 
568   if (converted_lhs.size() > conversion_.size()) {
569     // [ conversion | pending ] => [ conv | pend#1 ] [ pend#2 ]
570     const string pending_lhs(converted_lhs, conversion_.size());
571     (*left_new_chunk)->set_conversion(conversion_);
572     (*left_new_chunk)->set_pending(pending_lhs);
573 
574     conversion_.clear();
575     pending_ = converted_rhs;
576     ambiguous_.clear();
577   } else {
578     // [ conversion | pending ] => [ conv#1 ] [ conv#2 | pending ]
579     (*left_new_chunk)->set_conversion(converted_lhs);
580     // left_new_chunk->set_pending("");
581     const size_t pending_pos = converted_rhs.size() - pending_.size();
582     conversion_.assign(converted_rhs, 0, pending_pos);
583     // pending_ = pending_;
584   }
585   return true;
586 }
587 
588 Transliterators::Transliterator
GetTransliterator(Transliterators::Transliterator transliterator) const589 CharChunk::GetTransliterator(
590     Transliterators::Transliterator transliterator) const {
591   if (attributes_ && NO_TRANSLITERATION) {
592     if (transliterator == Transliterators::LOCAL ||
593         transliterator == Transliterators::HALF_ASCII ||
594         transliterator == Transliterators::FULL_ASCII) {
595       return Transliterators::CONVERSION_STRING;
596     }
597     return transliterator;
598   }
599   if (transliterator == Transliterators::LOCAL) {
600       return transliterator_;
601   }
602   return transliterator;
603 }
604 
Transliterate(Transliterators::Transliterator transliterator,const string & raw,const string & converted) const605 string CharChunk::Transliterate(
606     Transliterators::Transliterator transliterator,
607     const string &raw, const string &converted) const {
608   return Transliterators::GetTransliterator(
609       GetTransliterator(transliterator))->Transliterate(raw, converted);
610 }
611 
612 
Clone() const613 CharChunk *CharChunk::Clone() const {
614   // TODO(hsumita): Implements TransliteratorFactory and uses it instead of
615   // copying a pointer.
616   CharChunk *new_char_chunk = new CharChunk(transliterator_, table_);
617   new_char_chunk->raw_.assign(raw_);
618   new_char_chunk->conversion_.assign(conversion_);
619   new_char_chunk->pending_.assign(pending_);
620   new_char_chunk->ambiguous_.assign(ambiguous_);
621   new_char_chunk->attributes_ = attributes_;
622   return new_char_chunk;
623 }
624 
625 }  // namespace composer
626 }  // namespace mozc
627