1 // Copyright 2010-2018, Google Inc.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
6 // met:
7 //
8 //     * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 //     * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following disclaimer
12 // in the documentation and/or other materials provided with the
13 // distribution.
14 //     * Neither the name of Google Inc. nor the names of its
15 // contributors may be used to endorse or promote products derived from
16 // this software without specific prior written permission.
17 //
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 
30 #include "rewriter/language_aware_rewriter.h"
31 
32 #include <string>
33 #include <utility>
34 
35 #include "base/logging.h"
36 #include "base/util.h"
37 #include "composer/composer.h"
38 #include "config/config_handler.h"
39 #include "converter/segments.h"
40 #include "dictionary/dictionary_interface.h"
41 #include "dictionary/pos_matcher.h"
42 #include "protocol/commands.pb.h"
43 #include "protocol/config.pb.h"
44 #include "request/conversion_request.h"
45 #include "usage_stats/usage_stats.h"
46 
47 namespace mozc {
48 
49 using dictionary::DictionaryInterface;
50 using dictionary::POSMatcher;
51 
LanguageAwareRewriter(const POSMatcher & pos_matcher,const DictionaryInterface * dictionary)52 LanguageAwareRewriter::LanguageAwareRewriter(
53     const POSMatcher &pos_matcher,
54     const DictionaryInterface *dictionary)
55     : unknown_id_(pos_matcher.GetUnknownId()),
56       dictionary_(dictionary) {}
57 
58 LanguageAwareRewriter::~LanguageAwareRewriter() = default;
59 
60 namespace {
61 
IsRomanHiraganaInput(const ConversionRequest & request)62 bool IsRomanHiraganaInput(const ConversionRequest &request) {
63   const auto table = request.request().special_romanji_table();
64   switch (table) {
65     case commands::Request::DEFAULT_TABLE:
66       return (request.config().preedit_method() == config::Config::ROMAN);
67     case commands::Request::QWERTY_MOBILE_TO_HIRAGANA:
68       return true;
69     default:
70       return false;
71   }
72 }
73 
IsEnabled(const ConversionRequest & request)74 bool IsEnabled(const ConversionRequest &request) {
75   const auto mode = request.request().language_aware_input();
76   if (mode == commands::Request::NO_LANGUAGE_AWARE_INPUT) {
77     return false;
78   }
79   if (mode == commands::Request::LANGUAGE_AWARE_SUGGESTION) {
80     return IsRomanHiraganaInput(request);
81   }
82   DCHECK_EQ(commands::Request::DEFAULT_LANGUAGE_AWARE_BEHAVIOR, mode);
83   return request.config().use_spelling_correction();
84 }
85 
86 }  // namespace
87 
capability(const ConversionRequest & request) const88 int LanguageAwareRewriter::capability(
89     const ConversionRequest &request) const {
90   // Language aware input is performed only on suggestion or prediction.
91   if (!IsEnabled(request)) {
92     return RewriterInterface::NOT_AVAILABLE;
93   }
94 
95   return (RewriterInterface::SUGGESTION | RewriterInterface::PREDICTION);
96 }
97 
98 namespace {
IsRawQuery(const composer::Composer & composer,const DictionaryInterface * dictionary,int * rank)99 bool IsRawQuery(const composer::Composer &composer,
100                 const DictionaryInterface *dictionary,
101                 int *rank) {
102   string raw_text;
103   composer.GetRawString(&raw_text);
104 
105   // Check if the length of text is less than or equal to three.
106   // For example, "cat" is not treated as a raw query so far to avoid
107   // false negative cases.
108   if (raw_text.size() <= 3) {
109     return false;
110   }
111 
112   // If the composition string is same with the raw_text, there is no
113   // need to add the candidate to suggestions.
114   string composition;
115   composer.GetStringForPreedit(&composition);
116   if (composition == raw_text) {
117     return false;
118   }
119 
120   // If the composition string is the full width form of the raw_text,
121   // there is no need to add the candidate to suggestions.
122   string composition_in_half_width_ascii;
123   Util::FullWidthAsciiToHalfWidthAscii(composition,
124                                        &composition_in_half_width_ascii);
125   if (composition_in_half_width_ascii == raw_text) {
126     return false;
127   }
128 
129   // If alphabet characters are in the middle of the composition, it is
130   // probably a raw query.  For example, "えぁmpぇ" (example) contains
131   // "m" and "p" in the middle.  So it is treated as a raw query.  On the
132   // other hand, "くえry" (query) contains alphabet characters, but they
133   // are at the end of the string, so it cannot be determined here.
134   //
135   // Note, GetQueryForPrediction omits the trailing alphabet characters of
136   // the composition string and returns it.
137   string key;
138   composer.GetQueryForPrediction(&key);
139   if (Util::ContainsScriptType(key, Util::ALPHABET)) {
140     *rank = 0;
141     return true;
142   }
143 
144   // If the composition is storead as a key in the dictionary like
145   // "はな" (hana), "たけ" (take), the query is not handled as a raw query.
146   // It is a little conservative, but a safer way.
147   if (dictionary->HasKey(key)) {
148     return false;
149   }
150 
151   // If the input text is stored in the dictionary, it is perhaps a raw query.
152   // For example, the input characters of "れもヴぇ" (remove) is in the
153   // dictionary, so it is treated as a raw text.
154   if (dictionary->HasValue(raw_text)) {
155     *rank = 2;
156     return true;
157   }
158 
159   return false;
160 }
161 
162 // Get T13n candidate ids from existing candidates.
GetAlphabetIds(const Segment & segment,uint16 * lid,uint16 * rid)163 void GetAlphabetIds(const Segment &segment, uint16 *lid, uint16 *rid) {
164   DCHECK(lid);
165   DCHECK(rid);
166 
167   for (int i = 0; i < segment.candidates_size(); ++i) {
168     const Segment::Candidate &candidate = segment.candidate(i);
169     const Util::ScriptType type = Util::GetScriptType(candidate.value);
170     if (type == Util::ALPHABET) {
171       *lid = candidate.lid;
172       *rid = candidate.rid;
173       return;
174     }
175   }
176 }
177 }  // namespace
178 
179 // Note: This function seemed slow, but the benchmark tests
180 // resulted that it was only less than 0.1% point penalty.
181 // = session_handler_benchmark_test
182 // BM_PerformanceForRandomKeyEvents: 891944807 -> 892740748 (1.00089)
183 // = converter_benchmark_test
184 // BM_DesktopAnthyCorpusConversion 25062440090 -> 25101542382 (1.002)
185 // BM_DesktopStationPredictionCorpusPrediction 8695341697 -> 8672187681 (0.997)
186 // BM_DesktopStationPredictionCorpusSuggestion 6149502840 -> 6152393270 (1.000)
FillRawText(const ConversionRequest & request,Segments * segments) const187 bool LanguageAwareRewriter::FillRawText(
188     const ConversionRequest &request, Segments *segments) const {
189   if (segments->conversion_segments_size() != 1 || !request.has_composer()) {
190     return false;
191   }
192 
193   int rank = 0;
194   if (!IsRawQuery(request.composer(), dictionary_, &rank)) {
195     return false;
196   }
197 
198   Segment *segment = segments->mutable_conversion_segment(0);
199 
200   string raw_string;
201   request.composer().GetRawString(&raw_string);
202 
203   uint16 lid = unknown_id_;
204   uint16 rid = unknown_id_;
205   GetAlphabetIds(*segment, &lid, &rid);
206 
207   // Create a candidate.
208   if (rank > segment->candidates_size()) {
209     rank = segment->candidates_size();
210   }
211   Segment::Candidate *candidate = segment->insert_candidate(rank);
212   candidate->Init();
213   candidate->value = raw_string;
214   candidate->key = raw_string;
215   candidate->content_value = raw_string;
216   // raw_string is no longer used, so move it.
217   candidate->content_key = std::move(raw_string);
218   candidate->lid = lid;
219   candidate->rid = rid;
220 
221   candidate->attributes |= (Segment::Candidate::NO_VARIANTS_EXPANSION |
222                             Segment::Candidate::NO_EXTRA_DESCRIPTION);
223   candidate->prefix = "→ ";
224   candidate->description = "もしかして";
225 
226   // Set usage stats
227   usage_stats::UsageStats::IncrementCount("LanguageAwareSuggestionTriggered");
228 
229   return true;
230 }
231 
Rewrite(const ConversionRequest & request,Segments * segments) const232 bool LanguageAwareRewriter::Rewrite(
233     const ConversionRequest &request, Segments *segments) const {
234   if (!IsEnabled(request)) {
235     return false;
236   }
237   return FillRawText(request, segments);
238 }
239 
240 namespace {
IsLanguageAwareInputCandidate(const composer::Composer & composer,const Segment::Candidate & candidate)241 bool IsLanguageAwareInputCandidate(const composer::Composer &composer,
242                                    const Segment::Candidate &candidate) {
243   // Check candidate.prefix to filter if the candidate is probably generated
244   // from LanguangeAwareInput or not.
245   if (candidate.prefix != "→ ") {
246     return false;
247   }
248 
249   string raw_string;
250   composer.GetRawString(&raw_string);
251   if (raw_string != candidate.value) {
252     return false;
253   }
254   return true;
255 }
256 }  // namespace
257 
Finish(const ConversionRequest & request,Segments * segments)258 void LanguageAwareRewriter::Finish(const ConversionRequest &request,
259                                    Segments *segments) {
260   if (!IsEnabled(request)) {
261     return;
262   }
263 
264   if (segments->conversion_segments_size() != 1 || !request.has_composer()) {
265     return;
266   }
267 
268   // Update usage stats
269   const Segment &segment = segments->conversion_segment(0);
270   // Ignores segments which are not converted or not committed.
271   if (segment.candidates_size() == 0 ||
272       segment.segment_type() != Segment::FIXED_VALUE) {
273     return;
274   }
275 
276   if (IsLanguageAwareInputCandidate(request.composer(),
277                                     segment.candidate(0))) {
278     usage_stats::UsageStats::IncrementCount("LanguageAwareSuggestionCommitted");
279   }
280 }
281 
282 }  // namespace mozc
283