1 // Copyright 2010-2018, Google Inc.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
6 // met:
7 //
8 //     * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 //     * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following disclaimer
12 // in the documentation and/or other materials provided with the
13 // distribution.
14 //     * Neither the name of Google Inc. nor the names of its
15 // contributors may be used to endorse or promote products derived from
16 // this software without specific prior written permission.
17 //
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 
30 #include "rewriter/focus_candidate_rewriter.h"
31 
32 #include <algorithm>
33 #include <string>
34 
35 #include "base/logging.h"
36 #include "base/number_util.h"
37 #include "base/serialized_string_array.h"
38 #include "base/util.h"
39 #include "converter/segments.h"
40 #include "data_manager/data_manager_interface.h"
41 #include "rewriter/number_compound_util.h"
42 
43 namespace mozc {
44 namespace {
45 
46 enum {
47   NUMBER = 1,
48   SUFFIX = 2,
49   CONNECTOR = 4
50 };
51 
52 // TODO(taku): See POS and increase the coverage.
IsConnectorSegment(const Segment & segment)53 bool IsConnectorSegment(const Segment &segment) {
54   return (segment.key() == "と" || segment.key() == "や");
55 }
56 
57 // Finds value from the candidates list and move the canidate to the top.
RewriteCandidate(Segment * segment,const string & value)58 bool RewriteCandidate(Segment *segment, const string &value) {
59   for (int i = 0; i < segment->candidates_size(); ++i) {
60     if (segment->candidate(i).content_value == value) {
61       segment->move_candidate(i, 0);   // move to top
62       return true;
63     }
64   }
65   // Find value from meta candidates.
66   for (int i = 0; i < segment->meta_candidates_size(); ++i) {
67     if (segment->meta_candidate(i).content_value == value) {
68       segment->move_candidate(-i - 1, 0);   // copy to top
69       return true;
70     }
71   }
72   return false;
73 }
74 
75 // Returns true if the segment is valid.
IsValidSegment(const Segment & segment)76 bool IsValidSegment(const Segment &segment) {
77   return (segment.segment_type() == Segment::FREE ||
78           segment.segment_type() == Segment::FIXED_BOUNDARY ||
79           segment.segment_type() == Segment::FIXED_VALUE);
80 }
81 
IsNumberCandidate(const Segment::Candidate & candidate)82 bool IsNumberCandidate(const Segment::Candidate &candidate) {
83   return (candidate.style != NumberUtil::NumberString::DEFAULT_STYLE ||
84           Util::GetScriptType(candidate.value) == Util::NUMBER);
85 }
86 
IsNumberSegment(const Segment & segment)87 bool IsNumberSegment(const Segment &segment) {
88   return (segment.candidates_size() > 0 &&
89           IsNumberCandidate(segment.candidate(0)));
90 }
91 
92 // Returns true if two candidates have the same number form.
IsSameNumberType(const Segment::Candidate & candidate1,const Segment::Candidate & candidate2)93 bool IsSameNumberType(const Segment::Candidate &candidate1,
94                       const Segment::Candidate &candidate2) {
95   if (candidate1.style == candidate2.style) {
96     if (candidate1.style == NumberUtil::NumberString::DEFAULT_STYLE) {
97       if (IsNumberCandidate(candidate1) && IsNumberCandidate(candidate2) &&
98           Util::GetFormType(candidate1.value) ==
99           Util::GetFormType(candidate2.value)) {
100         return true;
101       }
102     } else {
103       return true;
104     }
105   }
106   return false;
107 }
108 
RewriteNumber(Segment * segment,const Segment::Candidate & candidate)109 bool RewriteNumber(Segment *segment, const Segment::Candidate &candidate) {
110   for (int i = 0; i < segment->candidates_size(); ++i) {
111     if (IsSameNumberType(candidate, segment->candidate(i))) {
112       segment->move_candidate(i, 0);   // move to top
113       return true;
114     }
115   }
116 
117   // Find value from meta candidates.
118   for (int i = 0; i < segment->meta_candidates_size(); ++i) {
119     if (IsSameNumberType(candidate, segment->meta_candidate(i))) {
120       segment->move_candidate(-i - 1, 0);   // copy to top
121       return true;
122     }
123   }
124 
125   return false;
126 }
127 
128 }  // namespace
129 
FocusCandidateRewriter(const DataManagerInterface * data_manager)130 FocusCandidateRewriter::FocusCandidateRewriter(
131     const DataManagerInterface *data_manager)
132     : pos_matcher_(data_manager->GetPOSMatcherData()) {
133   const char *array = nullptr;
134   size_t size = 0;
135   data_manager->GetCounterSuffixSortedArray(&array, &size);
136   const StringPiece data(array, size);
137   // Data manager is responsible for providing a valid data.  Just verify data
138   // in debug build.
139   DCHECK(SerializedStringArray::VerifyData(data));
140   suffix_array_.Set(data);
141 }
142 
~FocusCandidateRewriter()143 FocusCandidateRewriter::~FocusCandidateRewriter() {}
144 
Focus(Segments * segments,size_t segment_index,int candidate_index) const145 bool FocusCandidateRewriter::Focus(Segments *segments,
146                                    size_t segment_index,
147                                    int candidate_index) const {
148   if (segments == NULL) {
149     LOG(ERROR) << "Segments is NULL";
150     return false;
151   }
152 
153   if (segment_index >= segments->segments_size()) {
154     LOG(WARNING) << "Segment index out of range";
155     return false;
156   }
157 
158   const Segment &seg = segments->segment(segment_index);
159 
160   // segment_type must be FREE or FIXED_BOUNDARY
161   if (!IsValidSegment(seg)) {
162     LOG(WARNING) << "Segment is not valid";
163     return false;
164   }
165 
166   // invalid candidate index
167   if ((candidate_index < 0 &&
168        -candidate_index - 1 >= static_cast<int>(seg.meta_candidates_size())) ||
169       candidate_index >= static_cast<int>(seg.candidates_size())) {
170     LOG(WARNING) << "Candidate index out of range: "
171                  << candidate_index << " " << seg.candidates_size();
172     return false;
173   }
174 
175   // left to right
176   {
177     const string &left_value = seg.candidate(candidate_index).content_value;
178     string right_value;
179 
180     if (Util::IsOpenBracket(left_value, &right_value)) {
181       int num_nest = 1;
182       for (size_t i = segment_index + 1; i < segments->segments_size(); ++i) {
183         Segment *target_right_seg = segments->mutable_segment(i);
184         if (target_right_seg == NULL ||
185             target_right_seg->candidates_size() <= 0) {
186           LOG(WARNING) << "target right seg is not valid";
187           return false;
188         }
189         if (!IsValidSegment(*target_right_seg)) {
190           continue;
191         }
192         const string &target_right_value =
193             target_right_seg->candidate(0).content_value;
194         string tmp;
195         if (Util::IsOpenBracket(target_right_value, &tmp)) {
196           ++num_nest;
197         } else if (Util::IsCloseBracket(target_right_value, &tmp)) {
198           --num_nest;
199         }
200 
201         if (num_nest == 0 && RewriteCandidate(target_right_seg, right_value)) {
202           return true;
203         }
204       }
205       VLOG(1) << "could not find close bracket";
206       return false;
207     }
208   }
209 
210   // right to left
211   {
212     const string &right_value = seg.candidate(candidate_index).content_value;
213     string left_value;
214 
215     if (Util::IsCloseBracket(right_value, &left_value)) {
216       int num_nest = 1;
217       for (int i = segment_index - 1; i >= 0; --i) {
218         Segment *target_left_seg = segments->mutable_segment(i);
219         if (target_left_seg == NULL ||
220             target_left_seg->candidates_size() <= 0) {
221           LOG(WARNING) << "target left seg is not valid";
222           return false;
223         }
224         if (!IsValidSegment(*target_left_seg)) {
225           continue;
226         }
227         const string &target_left_value =
228             target_left_seg->candidate(0).content_value;
229         string tmp;
230         if (Util::IsCloseBracket(target_left_value, &tmp)) {
231           ++num_nest;
232         } else if (Util::IsOpenBracket(target_left_value, &tmp)) {
233           --num_nest;
234         }
235         if (num_nest == 0 && RewriteCandidate(target_left_seg, left_value)) {
236           return true;
237         }
238       }
239       VLOG(1) << "could not find open bracket";
240       return false;
241     }
242   }
243 
244   {
245     if (IsNumberCandidate(seg.candidate(candidate_index))) {
246       bool modified = 0;
247       int distance = 0;
248       for (size_t i = segment_index + 1; i < segments->segments_size(); ++i) {
249         Segment *target_right_seg = segments->mutable_segment(i);
250         if (target_right_seg == NULL ||
251             target_right_seg->candidates_size() <= 0) {
252           LOG(WARNING) << "target right seg is not valid";
253           return false;
254         }
255         if (!IsValidSegment(*target_right_seg)) {
256           continue;
257         }
258 
259         // Make sure the first candidate of the segment is number.
260         if (IsNumberSegment(*target_right_seg) &&
261             RewriteNumber(target_right_seg,
262                           seg.candidate(candidate_index))) {
263           modified = true;
264           distance = 0;
265         } else {
266           ++distance;
267         }
268         // more than two segments between the target numbers
269         if (distance >= 2) {
270           break;
271         }
272       }
273       return modified;
274     }
275   }
276 
277   {
278     // <Number><Suffix><Connector>?<Number><Suffix><Connector>?
279     if (segment_index > 0 &&
280         IsNumberSegment(segments->segment(segment_index - 1)) &&
281         seg.candidates_size() > 0 &&
282         seg.candidate(0).content_key ==
283         seg.candidate(candidate_index).content_key) {
284       int next_stat = CONNECTOR | NUMBER;
285       bool modified = false;
286       for (size_t i = segment_index + 1; i < segments->segments_size(); ++i) {
287         if (next_stat == (CONNECTOR | NUMBER)) {
288           if (IsConnectorSegment(segments->segment(i))) {
289             next_stat = NUMBER;
290           } else if (IsNumberSegment(segments->segment(i))) {
291             next_stat = SUFFIX;
292           } else {
293             break;
294           }
295         } else if (next_stat == NUMBER &&
296                    IsNumberSegment(segments->segment(i))) {
297           next_stat = SUFFIX;
298         } else if (next_stat == SUFFIX &&
299                    segments->segment(i).candidates_size() > 0 &&
300                    segments->segment(i).candidate(0).content_key ==
301                    seg.candidate(0).content_key) {
302           if (!IsValidSegment(segments->segment(i))) {
303             continue;
304           }
305           modified |= RewriteCandidate(
306               segments->mutable_segment(i),
307               seg.candidate(candidate_index).content_value);
308           next_stat = CONNECTOR | NUMBER;
309         } else {
310           break;
311         }
312       }
313       return modified;
314     }
315   }
316   return RerankNumberCandidates(segments, segment_index, candidate_index);
317 }
318 
RerankNumberCandidates(Segments * segments,size_t segment_index,int candidate_index) const319 bool FocusCandidateRewriter::RerankNumberCandidates(Segments *segments,
320                                                     size_t segment_index,
321                                                     int candidate_index) const {
322   // Check if the focused candidate is a number compound.
323   StringPiece number, suffix;
324   uint32 number_script_type = 0;
325   const Segment &seg = segments->segment(segment_index);
326   if (!ParseNumberCandidate(seg.candidate(candidate_index), &number, &suffix,
327                             &number_script_type)) {
328     return false;
329   }
330   if (number.empty()) {
331     return false;
332   }
333 
334   // Try reranking top candidates of subsequent segments using the number
335   // compound style of the focused candidate.
336   bool modified = false;
337   int distance = 0;
338   for (size_t i = segment_index + 1; i < segments->segments_size(); ++i) {
339     Segment *seg = segments->mutable_segment(i);
340     const int index = FindMatchingCandidates(*seg, number_script_type, suffix);
341     if (index == -1) {
342       // If there's no appropriate candidate having the same style, we increment
343       // the distance not to modify segments far from the focused one.
344       if (++distance > 2) {
345         break;
346       }
347       continue;
348     }
349     // Move the target candidate to the top.  We don't need to move if the
350     // target is already at top (i.g., the case where index == 0).
351     if (index > 0) {
352       seg->move_candidate(index, 0);
353       modified = true;
354       distance = 0;
355     }
356   }
357   return modified;
358 }
359 
FindMatchingCandidates(const Segment & seg,uint32 ref_script_type,StringPiece ref_suffix) const360 int FocusCandidateRewriter::FindMatchingCandidates(
361     const Segment &seg, uint32 ref_script_type, StringPiece ref_suffix) const {
362   // Only segments whose top candidate is a number compound are target of
363   // reranking.
364   const Segment::Candidate &cand = seg.candidate(0);
365   StringPiece number, suffix;
366   uint32 script_type = 0;
367   if (!ParseNumberCandidate(cand, &number, &suffix, &script_type)) {
368     return -1;
369   }
370 
371   // Top candidate matches the style.
372   if (script_type == ref_script_type && suffix == ref_suffix) {
373     return 0;
374   }
375 
376   // Check only top 10 candidates because, when the top candidate is a number
377   // candidate, other number compounds likely to appear near the top candidate.
378   const size_t max_size =
379       std::min(seg.candidates_size(), static_cast<size_t>(10));
380   for (size_t i = 1; i < max_size; ++i) {
381     if (!ParseNumberCandidate(seg.candidate(i), &number, &suffix,
382                               &script_type)) {
383       continue;
384     }
385     if (script_type == ref_script_type && suffix == ref_suffix) {
386       return i;
387     }
388   }
389   return -1;
390 }
391 
ParseNumberCandidate(const Segment::Candidate & cand,StringPiece * number,StringPiece * suffix,uint32 * script_type) const392 bool FocusCandidateRewriter::ParseNumberCandidate(
393     const Segment::Candidate &cand, StringPiece* number,
394     StringPiece* suffix, uint32 *script_type) const {
395   // If the lengths of content value and value are different, particles may be
396   // appended to value.  In such cases, we only accept parallel markers.
397   // Otherwise, the following wrong rewrite will occur.
398   // Example: "一階へは | 二回 | 行った -> 一階へは | 二階 | 行った"
399   if (cand.content_value.size() != cand.value.size()) {
400     if (!pos_matcher_.IsParallelMarker(cand.rid)) {
401       return false;
402     }
403   }
404   return number_compound_util::SplitStringIntoNumberAndCounterSuffix(
405       suffix_array_, cand.content_value, number, suffix, script_type);
406 }
407 
408 }  // namespace mozc
409