1 // Copyright 2010-2018, Google Inc.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
6 // met:
7 //
8 //     * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 //     * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following disclaimer
12 // in the documentation and/or other materials provided with the
13 // distribution.
14 //     * Neither the name of Google Inc. nor the names of its
15 // contributors may be used to endorse or promote products derived from
16 // this software without specific prior written permission.
17 //
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 
30 #include "rewriter/symbol_rewriter.h"
31 
32 #include <algorithm>
33 #include <cstring>
34 #include <set>
35 #include <string>
36 #include <vector>
37 
38 #include "base/logging.h"
39 #include "base/singleton.h"
40 #include "base/util.h"
41 #include "config/config_handler.h"
42 #include "converter/converter_interface.h"
43 #include "converter/segments.h"
44 #include "data_manager/data_manager_interface.h"
45 #include "protocol/commands.pb.h"
46 #include "protocol/config.pb.h"
47 #include "request/conversion_request.h"
48 #include "rewriter/rewriter_interface.h"
49 
50 // SymbolRewriter:
51 // When updating the rule
52 // 1. Export the spreadsheet into TEXT (TSV)
53 // 2. Copy the TSV to mozc/data/symbol/symbol.tsv
54 // 3. Run symbol_rewriter_dictionary_generator_main in this directory
55 // 4. Make sure symbol_rewriter_data.h is correct
56 
57 namespace mozc {
58 
namespace {
// Lowest candidate index at which symbols may be inserted; the actual
// position can move further down when leading candidates are skipped.
constexpr size_t kOffsetSize = 3;
// Once at least this many symbols have been inserted at the preferred
// position, the remaining symbols may be moved to the end of the list.
constexpr size_t kMaxInsertToMedium = 15;
}  // namespace
65 
66 // Some characters may have different description for full/half width forms.
67 // Here we just change the description in this function.
68 // If the symbol has description and additional description,
69 // Return merged description.
70 // TODO(taku): allow us to define two descriptions in *.tsv file
71 // static function
GetDescription(const string & value,StringPiece description,StringPiece additional_description)72 const string SymbolRewriter::GetDescription(
73     const string &value,
74     StringPiece description,
75     StringPiece additional_description) {
76   if (description.empty()) {
77     return "";
78   }
79   string result = description.as_string();
80   // Merge description
81   if (!additional_description.empty()) {
82     result.append(1, '(');
83     result.append(additional_description.data(), additional_description.size());
84     result.append(1, ')');
85   }
86   return result;
87 }
88 
89 // return true key has no-hiragana
90 // static function
IsSymbol(const string & key)91 bool SymbolRewriter::IsSymbol(const string &key) {
92   for (ConstChar32Iterator iter(key); !iter.Done(); iter.Next()) {
93     const char32 ucs4 = iter.Get();
94     if (ucs4 >= 0x3041 && ucs4 <= 0x309F) {  // hiragana
95       return false;
96     }
97   }
98   return true;
99 }
100 
101 // static function
ExpandSpace(Segment * segment)102 void SymbolRewriter::ExpandSpace(Segment *segment) {
103   for (size_t i = 0; i < segment->candidates_size(); ++i) {
104     if (segment->candidate(i).value == " ") {
105       Segment::Candidate *c = segment->insert_candidate(i + 1);
106       *c = segment->candidate(i);
107       c->value = " ";  // Full-width space
108       c->content_value = " ";  // Full-width space
109       // Boundary is invalidated and unnecessary for space.
110       c->inner_segment_boundary.clear();
111       return;
112     } else if (segment->candidate(i).value == " ") {  // Full-width space
113       Segment::Candidate *c = segment->insert_candidate(i + 1);
114       *c = segment->candidate(i);
115       c->value = " ";
116       c->content_value = " ";
117       // Boundary is invalidated and unnecessary for space.
118       c->inner_segment_boundary.clear();
119       return;
120     }
121   }
122 }
123 
124 // TODO(toshiyuki): Should we move this under Util module?
IsPlatformDependent(SerializedDictionary::const_iterator iter)125 bool SymbolRewriter::IsPlatformDependent(
126     SerializedDictionary::const_iterator iter) {
127   if (iter.value().empty()) {
128     return false;
129   }
130   const Util::CharacterSet cset = Util::GetCharacterSet(iter.value());
131   return (cset >= Util::JISX0212);
132 }
133 
134 // Return true if two symbols are in same group
135 // static function
InSameSymbolGroup(SerializedDictionary::const_iterator lhs,SerializedDictionary::const_iterator rhs)136 bool SymbolRewriter::InSameSymbolGroup(
137     SerializedDictionary::const_iterator lhs,
138     SerializedDictionary::const_iterator rhs) {
139   // "矢印記号", "矢印記号"
140   // "ギリシャ(大文字)", "ギリシャ(小文字)"
141   if (lhs.description().empty() || rhs.description().empty()) {
142     return false;
143   }
144   const size_t cmp_len =
145       std::max(lhs.description().size(), rhs.description().size());
146   return std::strncmp(lhs.description().data(),
147                       rhs.description().data(), cmp_len) == 0;
148 }
149 
150 // Insert Symbol into segment.
151 // static function
InsertCandidates(const SerializedDictionary::IterRange & range,bool context_sensitive,Segment * segment)152 void SymbolRewriter::InsertCandidates(
153     const SerializedDictionary::IterRange &range,
154     bool context_sensitive,
155     Segment *segment) {
156   if (segment->candidates_size() == 0) {
157     LOG(WARNING) << "candiadtes_size is 0";
158     return;
159   }
160 
161   // work around for space.
162   // space is not expanded in ExpandAlternative because it is not registered in
163   // CharacterFormManager.
164   // We do not want to make the form of spaces configurable, so we do not
165   // register space to CharacterFormManager.
166   ExpandSpace(segment);
167 
168   // If the original candidates given by ImmutableConveter already
169   // include the target symbols, do assign description to these candidates.
170   AddDescForCurrentCandidates(range, segment);
171 
172   const string &candidate_key = ((!segment->key().empty()) ?
173                                  segment->key() :
174                                  segment->candidate(0).key);
175   size_t offset = 0;
176 
177   // If the key is "かおもじ", set the insert position at the bottom,
178   // giving priority to emoticons inserted by EmoticonRewriter.
179   if (candidate_key == "かおもじ") {
180     offset = segment->candidates_size();
181   } else {
182     // Find the position wehere we start to insert the symbols
183     // We want to skip the single-kanji we inserted by single-kanji rewriter.
184     // We also skip transliterated key candidates.
185     offset = std::min(kOffsetSize, segment->candidates_size());
186     for (size_t i = offset; i < segment->candidates_size(); ++i) {
187       const string &target_value = segment->candidate(i).value;
188       if ((Util::CharsLen(target_value) == 1 &&
189            Util::IsScriptType(target_value, Util::KANJI)) ||
190           Util::IsScriptType(target_value, Util::HIRAGANA) ||
191           Util::IsScriptType(target_value, Util::KATAKANA)) {
192         ++offset;
193       } else {
194         break;
195       }
196     }
197   }
198 
199   const size_t range_size = range.second - range.first;
200   size_t inserted_count = 0;
201   bool finish_first_part = false;
202   const Segment::Candidate &base_candidate = segment->candidate(0);
203   for (auto iter = range.first; iter != range.second; ++iter) {
204     Segment::Candidate *candidate = segment->insert_candidate(offset);
205     DCHECK(candidate);
206 
207     candidate->Init();
208     candidate->lid = iter.lid();
209     candidate->rid = iter.rid();
210     candidate->cost = base_candidate.cost;
211     candidate->structure_cost = base_candidate.structure_cost;
212     candidate->value.assign(iter.value().data(), iter.value().size());
213     candidate->content_value.assign(iter.value().data(), iter.value().size());
214     candidate->key = candidate_key;
215     candidate->content_key = candidate_key;
216 
217     if (context_sensitive) {
218       candidate->attributes |= Segment::Candidate::CONTEXT_SENSITIVE;
219     }
220 
221     // The first two consist of two characters but the one of characters doesn't
222     // have alternative character.
223     if (candidate->value == "“”" || candidate->value == "‘’" ||
224         candidate->value == "w" || candidate->value == "www") {
225       candidate->attributes |= Segment::Candidate::NO_VARIANTS_EXPANSION;
226     }
227 
228     candidate->description = GetDescription(candidate->value,
229                                             iter.description(),
230                                             iter.additional_description());
231     ++offset;
232     ++inserted_count;
233 
234     // Insert to latter position
235     // If number of rest symbols is small, insert current position.
236     const auto next = iter + 1;
237     if (next != range.second &&
238         !finish_first_part &&
239         inserted_count >= kMaxInsertToMedium &&
240         range_size - inserted_count >= 5 &&
241         // Do not divide symbols which seem to be in the same group
242         // providing that they are not platform dependent characters.
243         (!InSameSymbolGroup(iter, next) || IsPlatformDependent(next))) {
244       offset = segment->candidates_size();
245       finish_first_part = true;
246     }
247   }
248 }
249 
250 // static
AddDescForCurrentCandidates(const SerializedDictionary::IterRange & range,Segment * segment)251 void SymbolRewriter::AddDescForCurrentCandidates(
252     const SerializedDictionary::IterRange &range, Segment *segment) {
253   for (size_t i = 0; i < segment->candidates_size(); ++i) {
254     Segment::Candidate *candidate = segment->mutable_candidate(i);
255     string full_width_value, half_width_value;
256     Util::HalfWidthToFullWidth(candidate->value, &full_width_value);
257     Util::FullWidthToHalfWidth(candidate->value, &half_width_value);
258 
259     for (auto iter = range.first; iter != range.second; ++iter) {
260       if (candidate->value == iter.value() ||
261           full_width_value == iter.value() ||
262           half_width_value == iter.value()) {
263         candidate->description =
264             GetDescription(candidate->value,
265                            iter.description(),
266                            iter.additional_description());
267         break;
268       }
269     }
270   }
271 }
272 
RewriteEachCandidate(Segments * segments) const273 bool SymbolRewriter::RewriteEachCandidate(Segments *segments) const {
274   bool modified = false;
275   for (size_t i = 0; i < segments->conversion_segments_size(); ++i) {
276     const string &key = segments->conversion_segment(i).key();
277     const SerializedDictionary::IterRange range = dictionary_->equal_range(key);
278     if (range.first == range.second) {
279       continue;
280     }
281 
282     // if key is symbol, no need to see the context
283     const bool context_sensitive = !IsSymbol(key);
284 
285     InsertCandidates(range, context_sensitive,
286                      segments->mutable_conversion_segment(i));
287 
288     modified = true;
289   }
290 
291   return modified;
292 }
293 
RewriteEntireCandidate(const ConversionRequest & request,Segments * segments) const294 bool SymbolRewriter::RewriteEntireCandidate(const ConversionRequest &request,
295                                             Segments *segments) const {
296   string key;
297   for (size_t i = 0; i < segments->conversion_segments_size(); ++i) {
298     key += segments->conversion_segment(i).key();
299   }
300 
301   const SerializedDictionary::IterRange range = dictionary_->equal_range(key);
302   if (range.first == range.second) {
303     return false;
304   }
305 
306   if (segments->conversion_segments_size() > 1) {
307     if (segments->resized()) {
308       // the given segments are resized by user
309       // so don't modify anymore
310       return false;
311     }
312     // need to resize
313     const size_t all_length = Util::CharsLen(key);
314     const size_t first_length =
315         Util::CharsLen(segments->conversion_segment(0).key());
316     const int diff = static_cast<int>(all_length - first_length);
317     if (diff > 0) {
318       parent_converter_->ResizeSegment(segments, request, 0, diff);
319     }
320   } else {
321     InsertCandidates(range,
322                      false,   // not context sensitive
323                      segments->mutable_conversion_segment(0));
324   }
325 
326   return true;
327 }
328 
SymbolRewriter(const ConverterInterface * parent_converter,const DataManagerInterface * data_manager)329 SymbolRewriter::SymbolRewriter(const ConverterInterface *parent_converter,
330                                const DataManagerInterface *data_manager)
331     : parent_converter_(parent_converter) {
332   DCHECK(parent_converter_);
333   StringPiece token_array_data, string_array_data;
334   data_manager->GetSymbolRewriterData(&token_array_data, &string_array_data);
335   DCHECK(SerializedDictionary::VerifyData(token_array_data, string_array_data));
336   dictionary_.reset(new SerializedDictionary(token_array_data,
337                                              string_array_data));
338 }
339 
~SymbolRewriter()340 SymbolRewriter::~SymbolRewriter() {}
341 
capability(const ConversionRequest & request) const342 int SymbolRewriter::capability(const ConversionRequest &request) const {
343   if (request.request().mixed_conversion()) {
344     return RewriterInterface::ALL;
345   }
346   return RewriterInterface::CONVERSION;
347 }
348 
Rewrite(const ConversionRequest & request,Segments * segments) const349 bool SymbolRewriter::Rewrite(const ConversionRequest &request,
350                              Segments *segments) const {
351   if (!request.config().use_symbol_conversion()) {
352     VLOG(2) << "no use_symbol_conversion";
353     return false;
354   }
355 
356   // apply entire candidate first, as we want to
357   // find character combinations first, e.g.,
358   // "->" -> "→"
359   return (RewriteEntireCandidate(request, segments) ||
360           RewriteEachCandidate(segments));
361 }
362 
363 }  // namespace mozc
364