1 // Copyright 2010-2018, Google Inc.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
6 // met:
7 //
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following disclaimer
12 // in the documentation and/or other materials provided with the
13 // distribution.
14 // * Neither the name of Google Inc. nor the names of its
15 // contributors may be used to endorse or promote products derived from
16 // this software without specific prior written permission.
17 //
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
30 #include "rewriter/language_aware_rewriter.h"
31
32 #include <string>
33 #include <utility>
34
35 #include "base/logging.h"
36 #include "base/util.h"
37 #include "composer/composer.h"
38 #include "config/config_handler.h"
39 #include "converter/segments.h"
40 #include "dictionary/dictionary_interface.h"
41 #include "dictionary/pos_matcher.h"
42 #include "protocol/commands.pb.h"
43 #include "protocol/config.pb.h"
44 #include "request/conversion_request.h"
45 #include "usage_stats/usage_stats.h"
46
47 namespace mozc {
48
49 using dictionary::DictionaryInterface;
50 using dictionary::POSMatcher;
51
LanguageAwareRewriter(const POSMatcher & pos_matcher,const DictionaryInterface * dictionary)52 LanguageAwareRewriter::LanguageAwareRewriter(
53 const POSMatcher &pos_matcher,
54 const DictionaryInterface *dictionary)
55 : unknown_id_(pos_matcher.GetUnknownId()),
56 dictionary_(dictionary) {}
57
58 LanguageAwareRewriter::~LanguageAwareRewriter() = default;
59
60 namespace {
61
IsRomanHiraganaInput(const ConversionRequest & request)62 bool IsRomanHiraganaInput(const ConversionRequest &request) {
63 const auto table = request.request().special_romanji_table();
64 switch (table) {
65 case commands::Request::DEFAULT_TABLE:
66 return (request.config().preedit_method() == config::Config::ROMAN);
67 case commands::Request::QWERTY_MOBILE_TO_HIRAGANA:
68 return true;
69 default:
70 return false;
71 }
72 }
73
IsEnabled(const ConversionRequest & request)74 bool IsEnabled(const ConversionRequest &request) {
75 const auto mode = request.request().language_aware_input();
76 if (mode == commands::Request::NO_LANGUAGE_AWARE_INPUT) {
77 return false;
78 }
79 if (mode == commands::Request::LANGUAGE_AWARE_SUGGESTION) {
80 return IsRomanHiraganaInput(request);
81 }
82 DCHECK_EQ(commands::Request::DEFAULT_LANGUAGE_AWARE_BEHAVIOR, mode);
83 return request.config().use_spelling_correction();
84 }
85
86 } // namespace
87
capability(const ConversionRequest & request) const88 int LanguageAwareRewriter::capability(
89 const ConversionRequest &request) const {
90 // Language aware input is performed only on suggestion or prediction.
91 if (!IsEnabled(request)) {
92 return RewriterInterface::NOT_AVAILABLE;
93 }
94
95 return (RewriterInterface::SUGGESTION | RewriterInterface::PREDICTION);
96 }
97
98 namespace {
IsRawQuery(const composer::Composer & composer,const DictionaryInterface * dictionary,int * rank)99 bool IsRawQuery(const composer::Composer &composer,
100 const DictionaryInterface *dictionary,
101 int *rank) {
102 string raw_text;
103 composer.GetRawString(&raw_text);
104
105 // Check if the length of text is less than or equal to three.
106 // For example, "cat" is not treated as a raw query so far to avoid
107 // false negative cases.
108 if (raw_text.size() <= 3) {
109 return false;
110 }
111
112 // If the composition string is same with the raw_text, there is no
113 // need to add the candidate to suggestions.
114 string composition;
115 composer.GetStringForPreedit(&composition);
116 if (composition == raw_text) {
117 return false;
118 }
119
120 // If the composition string is the full width form of the raw_text,
121 // there is no need to add the candidate to suggestions.
122 string composition_in_half_width_ascii;
123 Util::FullWidthAsciiToHalfWidthAscii(composition,
124 &composition_in_half_width_ascii);
125 if (composition_in_half_width_ascii == raw_text) {
126 return false;
127 }
128
129 // If alphabet characters are in the middle of the composition, it is
130 // probably a raw query. For example, "えぁmpぇ" (example) contains
131 // "m" and "p" in the middle. So it is treated as a raw query. On the
132 // other hand, "くえry" (query) contains alphabet characters, but they
133 // are at the end of the string, so it cannot be determined here.
134 //
135 // Note, GetQueryForPrediction omits the trailing alphabet characters of
136 // the composition string and returns it.
137 string key;
138 composer.GetQueryForPrediction(&key);
139 if (Util::ContainsScriptType(key, Util::ALPHABET)) {
140 *rank = 0;
141 return true;
142 }
143
144 // If the composition is storead as a key in the dictionary like
145 // "はな" (hana), "たけ" (take), the query is not handled as a raw query.
146 // It is a little conservative, but a safer way.
147 if (dictionary->HasKey(key)) {
148 return false;
149 }
150
151 // If the input text is stored in the dictionary, it is perhaps a raw query.
152 // For example, the input characters of "れもヴぇ" (remove) is in the
153 // dictionary, so it is treated as a raw text.
154 if (dictionary->HasValue(raw_text)) {
155 *rank = 2;
156 return true;
157 }
158
159 return false;
160 }
161
162 // Get T13n candidate ids from existing candidates.
GetAlphabetIds(const Segment & segment,uint16 * lid,uint16 * rid)163 void GetAlphabetIds(const Segment &segment, uint16 *lid, uint16 *rid) {
164 DCHECK(lid);
165 DCHECK(rid);
166
167 for (int i = 0; i < segment.candidates_size(); ++i) {
168 const Segment::Candidate &candidate = segment.candidate(i);
169 const Util::ScriptType type = Util::GetScriptType(candidate.value);
170 if (type == Util::ALPHABET) {
171 *lid = candidate.lid;
172 *rid = candidate.rid;
173 return;
174 }
175 }
176 }
177 } // namespace
178
179 // Note: This function seemed slow, but the benchmark tests
180 // resulted that it was only less than 0.1% point penalty.
181 // = session_handler_benchmark_test
182 // BM_PerformanceForRandomKeyEvents: 891944807 -> 892740748 (1.00089)
183 // = converter_benchmark_test
184 // BM_DesktopAnthyCorpusConversion 25062440090 -> 25101542382 (1.002)
185 // BM_DesktopStationPredictionCorpusPrediction 8695341697 -> 8672187681 (0.997)
186 // BM_DesktopStationPredictionCorpusSuggestion 6149502840 -> 6152393270 (1.000)
FillRawText(const ConversionRequest & request,Segments * segments) const187 bool LanguageAwareRewriter::FillRawText(
188 const ConversionRequest &request, Segments *segments) const {
189 if (segments->conversion_segments_size() != 1 || !request.has_composer()) {
190 return false;
191 }
192
193 int rank = 0;
194 if (!IsRawQuery(request.composer(), dictionary_, &rank)) {
195 return false;
196 }
197
198 Segment *segment = segments->mutable_conversion_segment(0);
199
200 string raw_string;
201 request.composer().GetRawString(&raw_string);
202
203 uint16 lid = unknown_id_;
204 uint16 rid = unknown_id_;
205 GetAlphabetIds(*segment, &lid, &rid);
206
207 // Create a candidate.
208 if (rank > segment->candidates_size()) {
209 rank = segment->candidates_size();
210 }
211 Segment::Candidate *candidate = segment->insert_candidate(rank);
212 candidate->Init();
213 candidate->value = raw_string;
214 candidate->key = raw_string;
215 candidate->content_value = raw_string;
216 // raw_string is no longer used, so move it.
217 candidate->content_key = std::move(raw_string);
218 candidate->lid = lid;
219 candidate->rid = rid;
220
221 candidate->attributes |= (Segment::Candidate::NO_VARIANTS_EXPANSION |
222 Segment::Candidate::NO_EXTRA_DESCRIPTION);
223 candidate->prefix = "→ ";
224 candidate->description = "もしかして";
225
226 // Set usage stats
227 usage_stats::UsageStats::IncrementCount("LanguageAwareSuggestionTriggered");
228
229 return true;
230 }
231
Rewrite(const ConversionRequest & request,Segments * segments) const232 bool LanguageAwareRewriter::Rewrite(
233 const ConversionRequest &request, Segments *segments) const {
234 if (!IsEnabled(request)) {
235 return false;
236 }
237 return FillRawText(request, segments);
238 }
239
240 namespace {
IsLanguageAwareInputCandidate(const composer::Composer & composer,const Segment::Candidate & candidate)241 bool IsLanguageAwareInputCandidate(const composer::Composer &composer,
242 const Segment::Candidate &candidate) {
243 // Check candidate.prefix to filter if the candidate is probably generated
244 // from LanguangeAwareInput or not.
245 if (candidate.prefix != "→ ") {
246 return false;
247 }
248
249 string raw_string;
250 composer.GetRawString(&raw_string);
251 if (raw_string != candidate.value) {
252 return false;
253 }
254 return true;
255 }
256 } // namespace
257
Finish(const ConversionRequest & request,Segments * segments)258 void LanguageAwareRewriter::Finish(const ConversionRequest &request,
259 Segments *segments) {
260 if (!IsEnabled(request)) {
261 return;
262 }
263
264 if (segments->conversion_segments_size() != 1 || !request.has_composer()) {
265 return;
266 }
267
268 // Update usage stats
269 const Segment &segment = segments->conversion_segment(0);
270 // Ignores segments which are not converted or not committed.
271 if (segment.candidates_size() == 0 ||
272 segment.segment_type() != Segment::FIXED_VALUE) {
273 return;
274 }
275
276 if (IsLanguageAwareInputCandidate(request.composer(),
277 segment.candidate(0))) {
278 usage_stats::UsageStats::IncrementCount("LanguageAwareSuggestionCommitted");
279 }
280 }
281
282 } // namespace mozc
283