1 // Copyright 2010-2018, Google Inc.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
6 // met:
7 //
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following disclaimer
12 // in the documentation and/or other materials provided with the
13 // distribution.
14 // * Neither the name of Google Inc. nor the names of its
15 // contributors may be used to endorse or promote products derived from
16 // this software without specific prior written permission.
17 //
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
30 #include "rewriter/focus_candidate_rewriter.h"
31
32 #include <algorithm>
33 #include <string>
34
35 #include "base/logging.h"
36 #include "base/number_util.h"
37 #include "base/serialized_string_array.h"
38 #include "base/util.h"
39 #include "converter/segments.h"
40 #include "data_manager/data_manager_interface.h"
41 #include "rewriter/number_compound_util.h"
42
43 namespace mozc {
44 namespace {
45
46 enum {
47 NUMBER = 1,
48 SUFFIX = 2,
49 CONNECTOR = 4
50 };
51
52 // TODO(taku): See POS and increase the coverage.
IsConnectorSegment(const Segment & segment)53 bool IsConnectorSegment(const Segment &segment) {
54 return (segment.key() == "と" || segment.key() == "や");
55 }
56
57 // Finds value from the candidates list and move the canidate to the top.
RewriteCandidate(Segment * segment,const string & value)58 bool RewriteCandidate(Segment *segment, const string &value) {
59 for (int i = 0; i < segment->candidates_size(); ++i) {
60 if (segment->candidate(i).content_value == value) {
61 segment->move_candidate(i, 0); // move to top
62 return true;
63 }
64 }
65 // Find value from meta candidates.
66 for (int i = 0; i < segment->meta_candidates_size(); ++i) {
67 if (segment->meta_candidate(i).content_value == value) {
68 segment->move_candidate(-i - 1, 0); // copy to top
69 return true;
70 }
71 }
72 return false;
73 }
74
75 // Returns true if the segment is valid.
IsValidSegment(const Segment & segment)76 bool IsValidSegment(const Segment &segment) {
77 return (segment.segment_type() == Segment::FREE ||
78 segment.segment_type() == Segment::FIXED_BOUNDARY ||
79 segment.segment_type() == Segment::FIXED_VALUE);
80 }
81
IsNumberCandidate(const Segment::Candidate & candidate)82 bool IsNumberCandidate(const Segment::Candidate &candidate) {
83 return (candidate.style != NumberUtil::NumberString::DEFAULT_STYLE ||
84 Util::GetScriptType(candidate.value) == Util::NUMBER);
85 }
86
IsNumberSegment(const Segment & segment)87 bool IsNumberSegment(const Segment &segment) {
88 return (segment.candidates_size() > 0 &&
89 IsNumberCandidate(segment.candidate(0)));
90 }
91
92 // Returns true if two candidates have the same number form.
IsSameNumberType(const Segment::Candidate & candidate1,const Segment::Candidate & candidate2)93 bool IsSameNumberType(const Segment::Candidate &candidate1,
94 const Segment::Candidate &candidate2) {
95 if (candidate1.style == candidate2.style) {
96 if (candidate1.style == NumberUtil::NumberString::DEFAULT_STYLE) {
97 if (IsNumberCandidate(candidate1) && IsNumberCandidate(candidate2) &&
98 Util::GetFormType(candidate1.value) ==
99 Util::GetFormType(candidate2.value)) {
100 return true;
101 }
102 } else {
103 return true;
104 }
105 }
106 return false;
107 }
108
RewriteNumber(Segment * segment,const Segment::Candidate & candidate)109 bool RewriteNumber(Segment *segment, const Segment::Candidate &candidate) {
110 for (int i = 0; i < segment->candidates_size(); ++i) {
111 if (IsSameNumberType(candidate, segment->candidate(i))) {
112 segment->move_candidate(i, 0); // move to top
113 return true;
114 }
115 }
116
117 // Find value from meta candidates.
118 for (int i = 0; i < segment->meta_candidates_size(); ++i) {
119 if (IsSameNumberType(candidate, segment->meta_candidate(i))) {
120 segment->move_candidate(-i - 1, 0); // copy to top
121 return true;
122 }
123 }
124
125 return false;
126 }
127
128 } // namespace
129
FocusCandidateRewriter(const DataManagerInterface * data_manager)130 FocusCandidateRewriter::FocusCandidateRewriter(
131 const DataManagerInterface *data_manager)
132 : pos_matcher_(data_manager->GetPOSMatcherData()) {
133 const char *array = nullptr;
134 size_t size = 0;
135 data_manager->GetCounterSuffixSortedArray(&array, &size);
136 const StringPiece data(array, size);
137 // Data manager is responsible for providing a valid data. Just verify data
138 // in debug build.
139 DCHECK(SerializedStringArray::VerifyData(data));
140 suffix_array_.Set(data);
141 }
142
~FocusCandidateRewriter()143 FocusCandidateRewriter::~FocusCandidateRewriter() {}
144
Focus(Segments * segments,size_t segment_index,int candidate_index) const145 bool FocusCandidateRewriter::Focus(Segments *segments,
146 size_t segment_index,
147 int candidate_index) const {
148 if (segments == NULL) {
149 LOG(ERROR) << "Segments is NULL";
150 return false;
151 }
152
153 if (segment_index >= segments->segments_size()) {
154 LOG(WARNING) << "Segment index out of range";
155 return false;
156 }
157
158 const Segment &seg = segments->segment(segment_index);
159
160 // segment_type must be FREE or FIXED_BOUNDARY
161 if (!IsValidSegment(seg)) {
162 LOG(WARNING) << "Segment is not valid";
163 return false;
164 }
165
166 // invalid candidate index
167 if ((candidate_index < 0 &&
168 -candidate_index - 1 >= static_cast<int>(seg.meta_candidates_size())) ||
169 candidate_index >= static_cast<int>(seg.candidates_size())) {
170 LOG(WARNING) << "Candidate index out of range: "
171 << candidate_index << " " << seg.candidates_size();
172 return false;
173 }
174
175 // left to right
176 {
177 const string &left_value = seg.candidate(candidate_index).content_value;
178 string right_value;
179
180 if (Util::IsOpenBracket(left_value, &right_value)) {
181 int num_nest = 1;
182 for (size_t i = segment_index + 1; i < segments->segments_size(); ++i) {
183 Segment *target_right_seg = segments->mutable_segment(i);
184 if (target_right_seg == NULL ||
185 target_right_seg->candidates_size() <= 0) {
186 LOG(WARNING) << "target right seg is not valid";
187 return false;
188 }
189 if (!IsValidSegment(*target_right_seg)) {
190 continue;
191 }
192 const string &target_right_value =
193 target_right_seg->candidate(0).content_value;
194 string tmp;
195 if (Util::IsOpenBracket(target_right_value, &tmp)) {
196 ++num_nest;
197 } else if (Util::IsCloseBracket(target_right_value, &tmp)) {
198 --num_nest;
199 }
200
201 if (num_nest == 0 && RewriteCandidate(target_right_seg, right_value)) {
202 return true;
203 }
204 }
205 VLOG(1) << "could not find close bracket";
206 return false;
207 }
208 }
209
210 // right to left
211 {
212 const string &right_value = seg.candidate(candidate_index).content_value;
213 string left_value;
214
215 if (Util::IsCloseBracket(right_value, &left_value)) {
216 int num_nest = 1;
217 for (int i = segment_index - 1; i >= 0; --i) {
218 Segment *target_left_seg = segments->mutable_segment(i);
219 if (target_left_seg == NULL ||
220 target_left_seg->candidates_size() <= 0) {
221 LOG(WARNING) << "target left seg is not valid";
222 return false;
223 }
224 if (!IsValidSegment(*target_left_seg)) {
225 continue;
226 }
227 const string &target_left_value =
228 target_left_seg->candidate(0).content_value;
229 string tmp;
230 if (Util::IsCloseBracket(target_left_value, &tmp)) {
231 ++num_nest;
232 } else if (Util::IsOpenBracket(target_left_value, &tmp)) {
233 --num_nest;
234 }
235 if (num_nest == 0 && RewriteCandidate(target_left_seg, left_value)) {
236 return true;
237 }
238 }
239 VLOG(1) << "could not find open bracket";
240 return false;
241 }
242 }
243
244 {
245 if (IsNumberCandidate(seg.candidate(candidate_index))) {
246 bool modified = 0;
247 int distance = 0;
248 for (size_t i = segment_index + 1; i < segments->segments_size(); ++i) {
249 Segment *target_right_seg = segments->mutable_segment(i);
250 if (target_right_seg == NULL ||
251 target_right_seg->candidates_size() <= 0) {
252 LOG(WARNING) << "target right seg is not valid";
253 return false;
254 }
255 if (!IsValidSegment(*target_right_seg)) {
256 continue;
257 }
258
259 // Make sure the first candidate of the segment is number.
260 if (IsNumberSegment(*target_right_seg) &&
261 RewriteNumber(target_right_seg,
262 seg.candidate(candidate_index))) {
263 modified = true;
264 distance = 0;
265 } else {
266 ++distance;
267 }
268 // more than two segments between the target numbers
269 if (distance >= 2) {
270 break;
271 }
272 }
273 return modified;
274 }
275 }
276
277 {
278 // <Number><Suffix><Connector>?<Number><Suffix><Connector>?
279 if (segment_index > 0 &&
280 IsNumberSegment(segments->segment(segment_index - 1)) &&
281 seg.candidates_size() > 0 &&
282 seg.candidate(0).content_key ==
283 seg.candidate(candidate_index).content_key) {
284 int next_stat = CONNECTOR | NUMBER;
285 bool modified = false;
286 for (size_t i = segment_index + 1; i < segments->segments_size(); ++i) {
287 if (next_stat == (CONNECTOR | NUMBER)) {
288 if (IsConnectorSegment(segments->segment(i))) {
289 next_stat = NUMBER;
290 } else if (IsNumberSegment(segments->segment(i))) {
291 next_stat = SUFFIX;
292 } else {
293 break;
294 }
295 } else if (next_stat == NUMBER &&
296 IsNumberSegment(segments->segment(i))) {
297 next_stat = SUFFIX;
298 } else if (next_stat == SUFFIX &&
299 segments->segment(i).candidates_size() > 0 &&
300 segments->segment(i).candidate(0).content_key ==
301 seg.candidate(0).content_key) {
302 if (!IsValidSegment(segments->segment(i))) {
303 continue;
304 }
305 modified |= RewriteCandidate(
306 segments->mutable_segment(i),
307 seg.candidate(candidate_index).content_value);
308 next_stat = CONNECTOR | NUMBER;
309 } else {
310 break;
311 }
312 }
313 return modified;
314 }
315 }
316 return RerankNumberCandidates(segments, segment_index, candidate_index);
317 }
318
RerankNumberCandidates(Segments * segments,size_t segment_index,int candidate_index) const319 bool FocusCandidateRewriter::RerankNumberCandidates(Segments *segments,
320 size_t segment_index,
321 int candidate_index) const {
322 // Check if the focused candidate is a number compound.
323 StringPiece number, suffix;
324 uint32 number_script_type = 0;
325 const Segment &seg = segments->segment(segment_index);
326 if (!ParseNumberCandidate(seg.candidate(candidate_index), &number, &suffix,
327 &number_script_type)) {
328 return false;
329 }
330 if (number.empty()) {
331 return false;
332 }
333
334 // Try reranking top candidates of subsequent segments using the number
335 // compound style of the focused candidate.
336 bool modified = false;
337 int distance = 0;
338 for (size_t i = segment_index + 1; i < segments->segments_size(); ++i) {
339 Segment *seg = segments->mutable_segment(i);
340 const int index = FindMatchingCandidates(*seg, number_script_type, suffix);
341 if (index == -1) {
342 // If there's no appropriate candidate having the same style, we increment
343 // the distance not to modify segments far from the focused one.
344 if (++distance > 2) {
345 break;
346 }
347 continue;
348 }
349 // Move the target candidate to the top. We don't need to move if the
350 // target is already at top (i.g., the case where index == 0).
351 if (index > 0) {
352 seg->move_candidate(index, 0);
353 modified = true;
354 distance = 0;
355 }
356 }
357 return modified;
358 }
359
FindMatchingCandidates(const Segment & seg,uint32 ref_script_type,StringPiece ref_suffix) const360 int FocusCandidateRewriter::FindMatchingCandidates(
361 const Segment &seg, uint32 ref_script_type, StringPiece ref_suffix) const {
362 // Only segments whose top candidate is a number compound are target of
363 // reranking.
364 const Segment::Candidate &cand = seg.candidate(0);
365 StringPiece number, suffix;
366 uint32 script_type = 0;
367 if (!ParseNumberCandidate(cand, &number, &suffix, &script_type)) {
368 return -1;
369 }
370
371 // Top candidate matches the style.
372 if (script_type == ref_script_type && suffix == ref_suffix) {
373 return 0;
374 }
375
376 // Check only top 10 candidates because, when the top candidate is a number
377 // candidate, other number compounds likely to appear near the top candidate.
378 const size_t max_size =
379 std::min(seg.candidates_size(), static_cast<size_t>(10));
380 for (size_t i = 1; i < max_size; ++i) {
381 if (!ParseNumberCandidate(seg.candidate(i), &number, &suffix,
382 &script_type)) {
383 continue;
384 }
385 if (script_type == ref_script_type && suffix == ref_suffix) {
386 return i;
387 }
388 }
389 return -1;
390 }
391
ParseNumberCandidate(const Segment::Candidate & cand,StringPiece * number,StringPiece * suffix,uint32 * script_type) const392 bool FocusCandidateRewriter::ParseNumberCandidate(
393 const Segment::Candidate &cand, StringPiece* number,
394 StringPiece* suffix, uint32 *script_type) const {
395 // If the lengths of content value and value are different, particles may be
396 // appended to value. In such cases, we only accept parallel markers.
397 // Otherwise, the following wrong rewrite will occur.
398 // Example: "一階へは | 二回 | 行った -> 一階へは | 二階 | 行った"
399 if (cand.content_value.size() != cand.value.size()) {
400 if (!pos_matcher_.IsParallelMarker(cand.rid)) {
401 return false;
402 }
403 }
404 return number_compound_util::SplitStringIntoNumberAndCounterSuffix(
405 suffix_array_, cand.content_value, number, suffix, script_type);
406 }
407
408 } // namespace mozc
409