1 // Copyright 2010-2018, Google Inc.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
6 // met:
7 //
8 //     * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 //     * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following disclaimer
12 // in the documentation and/or other materials provided with the
13 // distribution.
14 //     * Neither the name of Google Inc. nor the names of its
15 // contributors may be used to endorse or promote products derived from
16 // this software without specific prior written permission.
17 //
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 
30 #include "rewriter/collocation_rewriter.h"
31 
32 #include <algorithm>
33 #include <memory>
34 #include <string>
35 #include <vector>
36 
37 #include "base/flags.h"
38 #include "base/hash.h"
39 #include "base/logging.h"
40 #include "base/string_piece.h"
41 #include "base/util.h"
42 #include "converter/segments.h"
43 #include "data_manager/data_manager_interface.h"
44 #include "dictionary/pos_matcher.h"
45 #include "request/conversion_request.h"
46 #include "rewriter/collocation_util.h"
47 #include "storage/existence_filter.h"
48 
// Runtime switch for this rewriter: CollocationRewriter::Rewrite() returns
// false without touching the segments when this flag is false.
DEFINE_bool(use_collocation, true, "use collocation rewrite");
50 
51 namespace mozc {
52 
53 using mozc::storage::ExistenceFilter;
54 
55 namespace {
// Upper bound on how many candidates of one segment are examined; the actual
// count is min(candidates_size(), kCandidateSize).
constexpr size_t kCandidateSize = 12;

// A candidate is skipped when its cost exceeds the cost of the segment's top
// candidate by more than this amount.  The value is -500*log(1/1000).
constexpr int kMaxCostDiff = 3453;

// A collocation lookup always involves two segments.  This tells
// IsNaturalContent() which side of the pair the candidate under test is on.
enum SegmentLookupType {
  LEFT,   // The candidate belongs to the first segment of the pair.
  RIGHT,  // The candidate belongs to the second segment of the pair.
};
64 
65 // returns true if the given string contains number including Kanji.
ContainsNumber(const string & str)66 bool ContainsNumber(const string &str) {
67   for (ConstChar32Iterator iter(str); !iter.Done(); iter.Next()) {
68     if (CollocationUtil::IsNumber(iter.Get())) {
69       return true;
70     }
71   }
72   return false;
73 }
74 
75 // Returns true if value matches the pattern XXXPPPYYY, where XXX is a Kanji
76 // sequence, PPP is the given pattern, and YYY is a sequence containing at least
77 // one Kanji character. In the value matches the pattern, XXX and YYY are
78 // substituted to |first_content| and |second|, respectively. Returns false if
79 // the value isn't of the form XXXPPPYYY.
ParseCompound(const StringPiece value,const StringPiece pattern,StringPiece * first_content,StringPiece * second)80 bool ParseCompound(const StringPiece value, const StringPiece pattern,
81                    StringPiece *first_content, StringPiece *second) {
82   DCHECK(!value.empty());
83   DCHECK(!pattern.empty());
84 
85   // Find the |first_content| candidate and check if it consists of Kanji only.
86   StringPiece::const_iterator pattern_begin =
87       std::find(value.begin(), value.end(), pattern[0]);
88   if (pattern_begin == value.end()) {
89     return false;
90   }
91   *first_content =
92       StringPiece(value.data(), std::distance(value.begin(), pattern_begin));
93   if (!Util::IsScriptType(*first_content, Util::KANJI)) {
94     return false;
95   }
96 
97   // Check if the middle part matches |pattern|.
98   const StringPiece remaining_value =
99       ClippedSubstr(value, first_content->size());
100   if (!Util::StartsWith(remaining_value, pattern)) {
101     return false;
102   }
103 
104   // Check if the last substring is eligible for |second|.
105   *second = ClippedSubstr(remaining_value, pattern.size());
106   if (second->empty() || !Util::ContainsScriptType(*second, Util::KANJI)) {
107     return false;
108   }
109 
110   // Just verify that |value| = |first_content| + |pattern| + |second|.
111   DCHECK_EQ(
112       value,
113       first_content->as_string() + pattern.as_string() + second->as_string());
114   return true;
115 }
116 
117 // Fast way of pushing back a string piece to a vector.
PushBackStringPiece(const StringPiece s,std::vector<string> * v)118 inline void PushBackStringPiece(const StringPiece s, std::vector<string> *v) {
119   v->push_back(string());
120   v->back().assign(s.data(), s.size());
121 }
122 
123 // Fast way of pushing back the concatenated string of two string pieces to a
124 // vector.
PushBackJoinedStringPieces(const StringPiece s1,const StringPiece s2,std::vector<string> * v)125 inline void PushBackJoinedStringPieces(
126     const StringPiece s1, const StringPiece s2, std::vector<string> *v) {
127   v->push_back(string());
128   v->back().reserve(s1.size() + s2.size());
129   v->back().assign(s1.data(), s1.size()).append(s2.data(), s2.size());
130 }
131 
132 // Handles compound such as "本を読む"(one segment)
133 // we want to rewrite using it as if it was "<本|を><読む>"
134 // so that we can use collocation data like "厚い本"
ResolveCompoundSegment(const string & top_value,const string & value,SegmentLookupType type,std::vector<string> * output)135 void ResolveCompoundSegment(const string &top_value, const string &value,
136                             SegmentLookupType type,
137                             std::vector<string> *output) {
138   // see "http://ja.wikipedia.org/wiki/助詞"
139   static const char kPat1[] = "が";
140   // "の" was not good...
141   // static const char kPat2[] = "の";
142   static const char kPat3[] = "を";
143   static const char kPat4[] = "に";
144   static const char kPat5[] = "へ";
145   static const char kPat6[] = "と";
146   static const char kPat7[] = "から";
147   static const char kPat8[] = "より";
148   static const char kPat9[] = "で";
149 
150   static const struct {
151     const char *pat;
152     size_t len;
153   } kParticles[] = {
154     {kPat1, arraysize(kPat1) - 1},
155     //    {kPat2, arraysize(kPat2) - 1},
156     {kPat3, arraysize(kPat3) - 1},
157     {kPat4, arraysize(kPat4) - 1},
158     {kPat5, arraysize(kPat5) - 1},
159     {kPat6, arraysize(kPat6) - 1},
160     {kPat7, arraysize(kPat7) - 1},
161     {kPat8, arraysize(kPat8) - 1},
162     {kPat9, arraysize(kPat9) - 1},
163     {NULL, 0}
164   };
165 
166   for (size_t i = 0; kParticles[i].pat != NULL; ++i) {
167     const StringPiece particle(kParticles[i].pat, kParticles[i].len);
168     StringPiece first_content, second;
169     if (!ParseCompound(top_value, particle, &first_content, &second)) {
170       continue;
171     }
172     if (ParseCompound(value, particle, &first_content, &second)) {
173       if (type == LEFT) {
174         PushBackStringPiece(second, output);
175         PushBackJoinedStringPieces(first_content, particle, output);
176       } else {
177         PushBackStringPiece(first_content, output);
178       }
179       return;
180     }
181   }
182 }
183 
IsNaturalContent(const Segment::Candidate & cand,const Segment::Candidate & top_cand,SegmentLookupType type,std::vector<string> * output)184 bool IsNaturalContent(const Segment::Candidate &cand,
185                       const Segment::Candidate &top_cand,
186                       SegmentLookupType type,
187                       std::vector<string> *output) {
188   const string &content = cand.content_value;
189   const string &value = cand.value;
190   const string &top_content = top_cand.content_value;
191   const string &top_value = top_cand.value;
192 
193   const size_t top_content_len = Util::CharsLen(top_content);
194   const size_t content_len = Util::CharsLen(content);
195 
196   if (type == RIGHT &&
197       value != top_value &&
198       top_content_len >= 2 &&
199       content_len == 1) {
200     return false;
201   }
202 
203   if (type == LEFT) {
204     output->push_back(value);
205   } else {
206     output->push_back(content);
207     // "舞って" workaround
208     // V+"て" is often treated as one compound.
209     static const char kPat[] = "て";
210     if (Util::EndsWith(content, StringPiece(kPat, arraysize(kPat) - 1))) {
211       PushBackStringPiece(
212           Util::SubStringPiece(content, 0, content_len - 1), output);
213     }
214   }
215 
216   // we don't rewrite NUMBER to others and vice versa
217   if (ContainsNumber(value) != ContainsNumber(top_value)) {
218     return false;
219   }
220 
221   const StringPiece top_aux_value =
222       Util::SubStringPiece(top_value, top_content_len, string::npos);
223   const size_t top_aux_value_len = Util::CharsLen(top_aux_value);
224   const Util::ScriptType top_value_script_type = Util::GetScriptType(top_value);
225 
226   // we don't rewrite KATAKANA segment
227   // for example, we don't rewrite "コーヒー飲みます" to "珈琲飲みます"
228   if (type == LEFT &&
229       top_aux_value_len == 0 &&
230       top_value != value &&
231       top_value_script_type == Util::KATAKANA) {
232     return false;
233   }
234 
235   // special cases
236   if (top_content_len == 1) {
237     const char *begin = top_content.data();
238     const char *end = top_content.data() + top_content.size();
239     size_t mblen = 0;
240     const char32 wchar = Util::UTF8ToUCS4(begin, end, &mblen);
241 
242     switch (wchar) {
243       case 0x304a:  // "お"
244       case 0x5fa1:  // "御"
245       case 0x3054:  // "ご"
246         return true;
247       default:
248         break;
249     }
250   }
251 
252   const StringPiece aux_value =
253       Util::SubStringPiece(value, content_len, string::npos);
254 
255   // Remove number in normalization for the left segment.
256   string aux_normalized, top_aux_normalized;
257   CollocationUtil::GetNormalizedScript(
258       aux_value, (type == LEFT), &aux_normalized);
259   CollocationUtil::GetNormalizedScript(
260       top_aux_value, (type == LEFT), &top_aux_normalized);
261   if (!aux_normalized.empty() &&
262       !Util::IsScriptType(aux_normalized, Util::HIRAGANA)) {
263     if (type == RIGHT) {
264       return false;
265     }
266     if (aux_normalized != top_aux_normalized) {
267       return false;
268     }
269   }
270 
271   ResolveCompoundSegment(top_value, value, type, output);
272 
273   const size_t aux_value_len = Util::CharsLen(aux_value);
274   const size_t value_len = Util::CharsLen(value);
275 
276   // "<XXいる|>" can be rewrited to "<YY|いる>" and vice versa
277   {
278     static const char kPat[] = "いる";  // "いる"
279     const StringPiece kSuffix(kPat, arraysize(kPat) - 1);
280     if (top_aux_value_len == 0 &&
281         aux_value_len == 2 &&
282         Util::EndsWith(top_value, kSuffix) &&
283         Util::EndsWith(aux_value, kSuffix)) {
284       if (type == RIGHT) {
285         // "YYいる" in addition to "YY"
286         output->push_back(value);
287       }
288       return true;
289     }
290     if (aux_value_len == 0 &&
291         top_aux_value_len == 2 &&
292         Util::EndsWith(value, kSuffix) &&
293         Util::EndsWith(top_aux_value, kSuffix)) {
294       if (type == RIGHT) {
295         // "YY" in addition to "YYいる"
296         PushBackStringPiece(
297             Util::SubStringPiece(value, 0, value_len - 2), output);
298       }
299       return true;
300     }
301   }
302 
303   // "<XXせる|>" can be rewrited to "<YY|せる>" and vice versa
304   {
305     const char kPat[] = "せる";
306     const StringPiece kSuffix(kPat, arraysize(kPat) - 1);
307     if (top_aux_value_len == 0 &&
308         aux_value_len == 2 &&
309         Util::EndsWith(top_value, kSuffix) &&
310         Util::EndsWith(aux_value, kSuffix)) {
311       if (type == RIGHT) {
312         // "YYせる" in addition to "YY"
313         output->push_back(value);
314       }
315       return true;
316     }
317     if (aux_value_len == 0 &&
318         top_aux_value_len == 2 &&
319         Util::EndsWith(value, kSuffix) &&
320         Util::EndsWith(top_aux_value, kSuffix)) {
321       if (type == RIGHT) {
322         // "YY" in addition to "YYせる"
323         PushBackStringPiece(
324             Util::SubStringPiece(value, 0, value_len - 2), output);
325       }
326       return true;
327     }
328   }
329 
330   const Util::ScriptType content_script_type = Util::GetScriptType(content);
331 
332   // "<XX|する>" can be rewrited using "<XXす|る>" and "<XX|する>"
333   // in "<XX|する>", XX must be single script type
334   {
335     static const char kPat[] = "する";
336     const StringPiece kSuffix(kPat, arraysize(kPat) - 1);
337     if (aux_value_len == 2 &&
338         Util::EndsWith(aux_value, kSuffix)) {
339       if (content_script_type != Util::KATAKANA &&
340           content_script_type != Util::HIRAGANA &&
341           content_script_type != Util::KANJI &&
342           content_script_type != Util::ALPHABET) {
343         return false;
344       }
345       if (type == RIGHT) {
346         // "YYす" in addition to "YY"
347         PushBackStringPiece(
348             Util::SubStringPiece(value, 0, value_len - 1), output);
349       }
350       return true;
351     }
352   }
353 
354   // "<XXる>" can be rewrited using "<XX|る>"
355   // "まとめる", "衰える"
356   {
357     static const char kPat[] = "る";
358     const StringPiece kSuffix(kPat, arraysize(kPat) - 1);
359     if (aux_value_len == 0 &&
360         Util::EndsWith(value, kSuffix)) {
361       if (type == RIGHT) {
362         // "YY" in addition to "YYる"
363         PushBackStringPiece(
364             Util::SubStringPiece(value, 0, value_len - 1), output);
365       }
366       return true;
367     }
368   }
369 
370   // "<XXす>" can be rewrited using "XXする"
371   {
372     static const char kPat[] = "す";
373     const StringPiece kSuffix(kPat, arraysize(kPat) - 1);
374     if (Util::EndsWith(value, kSuffix) &&
375         Util::IsScriptType(
376             Util::SubStringPiece(value, 0, value_len - 1),
377             Util::KANJI)) {
378       if (type == RIGHT) {
379         const char kRu[] = "る";
380         // "YYする" in addition to "YY"
381         PushBackJoinedStringPieces(
382             value, StringPiece(kRu, arraysize(kRu) - 1), output);
383       }
384       return true;
385     }
386   }
387 
388   // "<XXし|た>" can be rewrited using "<XX|した>"
389   {
390     static const char kPat[] = "した";
391     const StringPiece kShi(kPat, 3), kTa(kPat + 3, 3);
392     if (Util::EndsWith(content, kShi) &&
393         aux_value == kTa &&
394         Util::EndsWith(top_content, kShi) &&
395         top_aux_value == kTa) {
396       if (type == RIGHT) {
397         const StringPiece val =
398             Util::SubStringPiece(content, 0, content_len - 1);
399         // XX must be KANJI
400         if (Util::IsScriptType(val, Util::KANJI)) {
401           PushBackStringPiece(val, output);
402         }
403       }
404       return true;
405     }
406   }
407 
408   const int aux_len = value_len - content_len;
409   const int top_aux_len = Util::CharsLen(top_value) - top_content_len;
410   if (aux_len != top_aux_len) {
411     return false;
412   }
413 
414   const Util::ScriptType top_content_script_type =
415       Util::GetScriptType(top_content);
416 
417   // we don't rewrite HIRAGANA to KATAKANA
418   if (top_content_script_type == Util::HIRAGANA &&
419       content_script_type == Util::KATAKANA) {
420     return false;
421   }
422 
423   // we don't rewrite second KATAKANA
424   // for example, we don't rewrite "このコーヒー" to "この珈琲"
425   if (type == RIGHT &&
426       top_content_script_type == Util::KATAKANA &&
427       value != top_value) {
428     return false;
429   }
430 
431   if (top_content_len == 1 &&
432       top_content_script_type == Util::HIRAGANA) {
433     return false;
434   }
435 
436   // suppress "<身|ています>" etc.
437   if (top_content_len == 1 &&
438       content_len == 1 &&
439       top_aux_value_len >= 2 &&
440       aux_value_len >= 2 &&
441       top_content_script_type == Util::KANJI &&
442       content_script_type == Util::KANJI &&
443       top_content != content) {
444     return false;
445   }
446 
447   return true;
448 }
449 
450 // Just a wrapper of IsNaturalContent for debug.
VerifyNaturalContent(const Segment::Candidate & cand,const Segment::Candidate & top_cand,SegmentLookupType type)451 bool VerifyNaturalContent(const Segment::Candidate &cand,
452                           const Segment::Candidate &top_cand,
453                           SegmentLookupType type) {
454   std::vector<string> nexts;
455   return IsNaturalContent(cand, top_cand, RIGHT, &nexts);
456 }
457 
IsKeyUnknown(const Segment & seg)458 inline bool IsKeyUnknown(const Segment &seg) {
459   return Util::IsScriptType(seg.key(), Util::UNKNOWN_SCRIPT);
460 }
461 
462 }  // namespace
463 
RewriteCollocation(Segments * segments) const464 bool CollocationRewriter::RewriteCollocation(Segments *segments) const {
465   // return false if at least one segment is fixed.
466   for (size_t i = segments->history_segments_size();
467        i < segments->segments_size(); ++i) {
468     if (segments->segment(i).segment_type() == Segment::FIXED_VALUE) {
469       return false;
470     }
471   }
472 
473   std::vector<bool> segs_changed(segments->segments_size(), false);
474   bool changed = false;
475 
476   for (size_t i = segments->history_segments_size();
477        i < segments->segments_size(); ++i) {
478     bool rewrited_next = false;
479 
480     if (IsKeyUnknown(segments->segment(i))) {
481       continue;
482     }
483 
484     if (i + 1 < segments->segments_size() &&
485         RewriteUsingNextSegment(segments->mutable_segment(i + 1),
486                                 segments->mutable_segment(i))) {
487       changed = true;
488       rewrited_next = true;
489       segs_changed[i] = true;
490       segs_changed[i + 1] = true;
491     }
492 
493     if (!segs_changed[i] &&
494         !rewrited_next &&
495         i > 0 &&
496         RewriteFromPrevSegment(segments->segment(i - 1).candidate(0),
497                                segments->mutable_segment(i))) {
498       changed = true;
499       segs_changed[i - 1] = true;
500       segs_changed[i] = true;
501     }
502 
503     const Segment::Candidate &cand = segments->segment(i).candidate(0);
504     if (i >= 2 &&
505         // Cross over only adverbs
506         // Segment is adverb if;
507         //  1) lid and rid is adverb.
508         //  2) or rid is adverb suffix.
509         ((pos_matcher_.IsAdverb(segments->segment(i - 1).candidate(0).lid) &&
510           pos_matcher_.IsAdverb(segments->segment(i - 1).candidate(0).rid)) ||
511          pos_matcher_.IsAdverbSegmentSuffix(
512              segments->segment(i - 1).candidate(0).rid)) &&
513         (cand.content_value != cand.value ||
514          cand.value != "・")) {  // "・" workaround
515       if (!segs_changed[i - 2] &&
516           !segs_changed[i] &&
517           RewriteUsingNextSegment(segments->mutable_segment(i),
518                                   segments->mutable_segment(i - 2))) {
519         changed = true;
520         segs_changed[i] = true;
521         segs_changed[i - 2] = true;
522       } else if (!segs_changed[i] &&
523                  RewriteFromPrevSegment(
524                      segments->segment(i - 2).candidate(0),
525                      segments->mutable_segment(i))) {
526         changed = true;
527         segs_changed[i] = true;
528         segs_changed[i - 2] = true;
529       }
530     }
531   }
532 
533   return changed;
534 }
535 
536 class CollocationRewriter::CollocationFilter {
537  public:
CollocationFilter(const char * existence_data,size_t size)538   CollocationFilter(const char *existence_data, size_t size)
539       : filter_(ExistenceFilter::Read(existence_data, size)) {
540   }
~CollocationFilter()541   ~CollocationFilter() {
542   }
543 
Exists(const string & left,const string & right) const544   bool Exists(const string &left, const string &right) const {
545     if (left.empty() || right.empty()) {
546       return false;
547     }
548     string key;
549     key.reserve(left.size() + right.size());
550     key.assign(left).append(right);
551     const uint64 id = Hash::Fingerprint(key);
552     return filter_->Exists(id);
553   }
554 
555  private:
556   std::unique_ptr<ExistenceFilter> filter_;
557 
558   DISALLOW_COPY_AND_ASSIGN(CollocationFilter);
559 };
560 
561 class CollocationRewriter::SuppressionFilter {
562  public:
SuppressionFilter(const char * suppression_data,size_t size)563   SuppressionFilter(const char *suppression_data, size_t size)
564       : filter_(ExistenceFilter::Read(suppression_data, size)) {
565   }
~SuppressionFilter()566   ~SuppressionFilter() {
567   }
568 
Exists(const Segment::Candidate & cand) const569   bool Exists(const Segment::Candidate &cand) const {
570     // TODO(noriyukit): We should share key generation rule with
571     // gen_collocation_suppression_data_main.cc.
572     string key;
573     key.reserve(cand.content_value.size() + 1 + cand.content_key.size());
574     key.assign(cand.content_value).append("\t").append(cand.content_key);
575     const uint64 id = Hash::Fingerprint(key);
576     return filter_->Exists(id);
577   }
578 
579  private:
580   std::unique_ptr<ExistenceFilter> filter_;
581 
582   DISALLOW_COPY_AND_ASSIGN(SuppressionFilter);
583 };
584 
CollocationRewriter(const DataManagerInterface * data_manager)585 CollocationRewriter::CollocationRewriter(
586     const DataManagerInterface *data_manager)
587     : pos_matcher_(data_manager->GetPOSMatcherData()),
588       first_name_id_(pos_matcher_.GetFirstNameId()),
589       last_name_id_(pos_matcher_.GetLastNameId()) {
590   const char *data = NULL;
591   size_t size = 0;
592 
593   data_manager->GetCollocationData(&data, &size);
594   collocation_filter_.reset(new CollocationFilter(data, size));
595 
596   data_manager->GetCollocationSuppressionData(&data, &size);
597   suppression_filter_.reset(new SuppressionFilter(data, size));
598 }
599 
~CollocationRewriter()600 CollocationRewriter::~CollocationRewriter() {}
601 
Rewrite(const ConversionRequest & request,Segments * segments) const602 bool CollocationRewriter::Rewrite(const ConversionRequest &request,
603                                   Segments *segments) const {
604   if (!FLAGS_use_collocation) {
605     return false;
606   }
607   return RewriteCollocation(segments);
608 }
609 
IsName(const Segment::Candidate & cand) const610 bool CollocationRewriter::IsName(const Segment::Candidate &cand) const {
611   const bool ret = (cand.lid == last_name_id_ || cand.lid == first_name_id_);
612   VLOG_IF(3, ret) << cand.value << " is name sagment";
613   return ret;
614 }
615 
RewriteFromPrevSegment(const Segment::Candidate & prev_cand,Segment * seg) const616 bool CollocationRewriter::RewriteFromPrevSegment(
617     const Segment::Candidate &prev_cand,
618     Segment *seg) const {
619   string prev;
620   CollocationUtil::GetNormalizedScript(prev_cand.value, true, &prev);
621 
622   const size_t i_max = std::min(seg->candidates_size(), kCandidateSize);
623 
624   // Reuse |curs| and |cur| in the loop as this method is performance critical.
625   std::vector<string> curs;
626   string cur;
627   for (size_t i = 0; i < i_max; ++i) {
628     if (seg->candidate(i).cost > seg->candidate(0).cost + kMaxCostDiff) {
629       continue;
630     }
631     if (IsName(seg->candidate(i))) {
632       continue;
633     }
634     if (suppression_filter_->Exists(seg->candidate(i))) {
635       continue;
636     }
637     curs.clear();
638     if (!IsNaturalContent(seg->candidate(i), seg->candidate(0), RIGHT, &curs)) {
639       continue;
640     }
641 
642     for (int j = 0; j < curs.size(); ++j) {
643       cur.clear();
644       CollocationUtil::GetNormalizedScript(curs[j], false, &cur);
645       if (collocation_filter_->Exists(prev, cur)) {
646         VLOG_IF(3, i != 0) << prev << cur << " "
647                            << seg->candidate(0).value << "->"
648                            << seg->candidate(i).value;
649         seg->move_candidate(i, 0);
650         seg->mutable_candidate(0)->attributes
651             |= Segment::Candidate::CONTEXT_SENSITIVE;
652         return true;
653       }
654     }
655   }
656   return false;
657 }
658 
RewriteUsingNextSegment(Segment * next_seg,Segment * seg) const659 bool CollocationRewriter::RewriteUsingNextSegment(Segment *next_seg,
660                                                   Segment *seg) const {
661   const size_t i_max = std::min(seg->candidates_size(), kCandidateSize);
662   const size_t j_max = std::min(next_seg->candidates_size(), kCandidateSize);
663 
664   // Cache the results for the next segment
665   std::vector<int> next_seg_ok(j_max);  // Avoiding std::vector<bool>
666   std::vector<std::vector<string> > normalized_string(j_max);
667 
668   // Reuse |nexts| in the loop as this method is performance critical.
669   std::vector<string> nexts;
670   for (size_t j = 0; j < j_max; ++j) {
671     next_seg_ok[j] = 0;
672 
673     if (IsName(next_seg->candidate(j))) {
674       continue;
675     }
676     if (suppression_filter_->Exists(next_seg->candidate(j))) {
677       continue;
678     }
679     nexts.clear();
680     if (!IsNaturalContent(next_seg->candidate(j),
681                           next_seg->candidate(0), RIGHT, &nexts)) {
682       continue;
683     }
684 
685     next_seg_ok[j] = 1;
686     for (std::vector<string>::const_iterator it = nexts.begin();
687          it != nexts.end(); ++it) {
688       normalized_string[j].push_back(string());
689       CollocationUtil::GetNormalizedScript(
690           *it, false, &normalized_string[j].back());
691     }
692   }
693 
694   // Reuse |curs| and |cur| in the loop as this method is performance critical.
695   std::vector<string> curs;
696   string cur;
697   for (size_t i = 0; i < i_max; ++i) {
698     if (seg->candidate(i).cost > seg->candidate(0).cost + kMaxCostDiff) {
699       continue;
700     }
701     if (IsName(seg->candidate(i))) {
702       continue;
703     }
704     if (suppression_filter_->Exists(seg->candidate(i))) {
705       continue;
706     }
707     curs.clear();
708     if (!IsNaturalContent(seg->candidate(i), seg->candidate(0), LEFT, &curs)) {
709       continue;
710     }
711 
712     for (int k = 0; k < curs.size(); ++k) {
713       cur.clear();
714       CollocationUtil::GetNormalizedScript(curs[k], true, &cur);
715       for (size_t j = 0; j < j_max; ++j) {
716         if (next_seg->candidate(j).cost >
717             next_seg->candidate(0).cost + kMaxCostDiff) {
718           continue;
719         }
720         if (!next_seg_ok[j]) {
721           continue;
722         }
723 
724         for (int l = 0; l < normalized_string[j].size(); ++l) {
725           const string &next = normalized_string[j][l];
726           if (collocation_filter_->Exists(cur, next)) {
727             DCHECK(VerifyNaturalContent(
728                 next_seg->candidate(j), next_seg->candidate(0), RIGHT))
729                 << "IsNaturalContent() should not fail here.";
730             seg->move_candidate(i, 0);
731             seg->mutable_candidate(0)->attributes
732                 |= Segment::Candidate::CONTEXT_SENSITIVE;
733             next_seg->move_candidate(j, 0);
734             next_seg->mutable_candidate(0)->attributes
735                 |= Segment::Candidate::CONTEXT_SENSITIVE;
736             return true;
737           }
738         }
739       }
740     }
741   }
742   return false;
743 }
744 
745 }  // namespace mozc
746