1 // Copyright 2010-2018, Google Inc.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
6 // met:
7 //
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following disclaimer
12 // in the documentation and/or other materials provided with the
13 // distribution.
14 // * Neither the name of Google Inc. nor the names of its
15 // contributors may be used to endorse or promote products derived from
16 // this software without specific prior written permission.
17 //
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
30 #include "rewriter/collocation_rewriter.h"
31
32 #include <algorithm>
33 #include <memory>
34 #include <string>
35 #include <vector>
36
37 #include "base/flags.h"
38 #include "base/hash.h"
39 #include "base/logging.h"
40 #include "base/string_piece.h"
41 #include "base/util.h"
42 #include "converter/segments.h"
43 #include "data_manager/data_manager_interface.h"
44 #include "dictionary/pos_matcher.h"
45 #include "request/conversion_request.h"
46 #include "rewriter/collocation_util.h"
47 #include "storage/existence_filter.h"
48
// Master switch for this rewriter: when the flag is false,
// CollocationRewriter::Rewrite() returns false without examining the segments.
49 DEFINE_bool(use_collocation, true, "use collocation rewrite");
50
51 namespace mozc {
52
53 using mozc::storage::ExistenceFilter;
54
55 namespace {
// Upper bound on how many candidates (counted from the top) of a segment are
// examined by RewriteFromPrevSegment() and RewriteUsingNextSegment().
56 const size_t kCandidateSize = 12;
// A candidate is skipped when its cost exceeds the cost of the segment's top
// candidate by more than this value.
57 const int kMaxCostDiff = 3453; // -500*log(1/1000)
58
59 // For collocation, we use two segments.
// This enum tells IsNaturalContent() and ResolveCompoundSegment() which side
// of the two-segment pair the segment being examined is on.
60 enum SegmentLookupType {
61 LEFT,
62 RIGHT,
63 };
64
65 // returns true if the given string contains number including Kanji.
ContainsNumber(const string & str)66 bool ContainsNumber(const string &str) {
67 for (ConstChar32Iterator iter(str); !iter.Done(); iter.Next()) {
68 if (CollocationUtil::IsNumber(iter.Get())) {
69 return true;
70 }
71 }
72 return false;
73 }
74
75 // Returns true if value matches the pattern XXXPPPYYY, where XXX is a Kanji
76 // sequence, PPP is the given pattern, and YYY is a sequence containing at least
77 // one Kanji character. In the value matches the pattern, XXX and YYY are
78 // substituted to |first_content| and |second|, respectively. Returns false if
79 // the value isn't of the form XXXPPPYYY.
ParseCompound(const StringPiece value,const StringPiece pattern,StringPiece * first_content,StringPiece * second)80 bool ParseCompound(const StringPiece value, const StringPiece pattern,
81 StringPiece *first_content, StringPiece *second) {
82 DCHECK(!value.empty());
83 DCHECK(!pattern.empty());
84
85 // Find the |first_content| candidate and check if it consists of Kanji only.
86 StringPiece::const_iterator pattern_begin =
87 std::find(value.begin(), value.end(), pattern[0]);
88 if (pattern_begin == value.end()) {
89 return false;
90 }
91 *first_content =
92 StringPiece(value.data(), std::distance(value.begin(), pattern_begin));
93 if (!Util::IsScriptType(*first_content, Util::KANJI)) {
94 return false;
95 }
96
97 // Check if the middle part matches |pattern|.
98 const StringPiece remaining_value =
99 ClippedSubstr(value, first_content->size());
100 if (!Util::StartsWith(remaining_value, pattern)) {
101 return false;
102 }
103
104 // Check if the last substring is eligible for |second|.
105 *second = ClippedSubstr(remaining_value, pattern.size());
106 if (second->empty() || !Util::ContainsScriptType(*second, Util::KANJI)) {
107 return false;
108 }
109
110 // Just verify that |value| = |first_content| + |pattern| + |second|.
111 DCHECK_EQ(
112 value,
113 first_content->as_string() + pattern.as_string() + second->as_string());
114 return true;
115 }
116
117 // Fast way of pushing back a string piece to a vector.
PushBackStringPiece(const StringPiece s,std::vector<string> * v)118 inline void PushBackStringPiece(const StringPiece s, std::vector<string> *v) {
119 v->push_back(string());
120 v->back().assign(s.data(), s.size());
121 }
122
123 // Fast way of pushing back the concatenated string of two string pieces to a
124 // vector.
PushBackJoinedStringPieces(const StringPiece s1,const StringPiece s2,std::vector<string> * v)125 inline void PushBackJoinedStringPieces(
126 const StringPiece s1, const StringPiece s2, std::vector<string> *v) {
127 v->push_back(string());
128 v->back().reserve(s1.size() + s2.size());
129 v->back().assign(s1.data(), s1.size()).append(s2.data(), s2.size());
130 }
131
132 // Handles compound such as "本を読む"(one segment)
133 // we want to rewrite using it as if it was "<本|を><読む>"
134 // so that we can use collocation data like "厚い本"
ResolveCompoundSegment(const string & top_value,const string & value,SegmentLookupType type,std::vector<string> * output)135 void ResolveCompoundSegment(const string &top_value, const string &value,
136 SegmentLookupType type,
137 std::vector<string> *output) {
138 // see "http://ja.wikipedia.org/wiki/助詞"
139 static const char kPat1[] = "が";
140 // "の" was not good...
141 // static const char kPat2[] = "の";
142 static const char kPat3[] = "を";
143 static const char kPat4[] = "に";
144 static const char kPat5[] = "へ";
145 static const char kPat6[] = "と";
146 static const char kPat7[] = "から";
147 static const char kPat8[] = "より";
148 static const char kPat9[] = "で";
149
150 static const struct {
151 const char *pat;
152 size_t len;
153 } kParticles[] = {
154 {kPat1, arraysize(kPat1) - 1},
155 // {kPat2, arraysize(kPat2) - 1},
156 {kPat3, arraysize(kPat3) - 1},
157 {kPat4, arraysize(kPat4) - 1},
158 {kPat5, arraysize(kPat5) - 1},
159 {kPat6, arraysize(kPat6) - 1},
160 {kPat7, arraysize(kPat7) - 1},
161 {kPat8, arraysize(kPat8) - 1},
162 {kPat9, arraysize(kPat9) - 1},
163 {NULL, 0}
164 };
165
166 for (size_t i = 0; kParticles[i].pat != NULL; ++i) {
167 const StringPiece particle(kParticles[i].pat, kParticles[i].len);
168 StringPiece first_content, second;
169 if (!ParseCompound(top_value, particle, &first_content, &second)) {
170 continue;
171 }
172 if (ParseCompound(value, particle, &first_content, &second)) {
173 if (type == LEFT) {
174 PushBackStringPiece(second, output);
175 PushBackJoinedStringPieces(first_content, particle, output);
176 } else {
177 PushBackStringPiece(first_content, output);
178 }
179 return;
180 }
181 }
182 }
183
// Decides whether |cand| may replace |top_cand| (the current top candidate of
// the same segment) during collocation rewriting, and appends to |output| the
// strings the caller should look up in the collocation filter for |cand|.
//
// Arguments:
//   cand:     the candidate under consideration.
//   top_cand: the candidate currently at the top of the segment.
//   type:     LEFT when the segment is the left element of the pair being
//             checked, RIGHT when it is the right element.
//   output:   receives the lookup strings.  For LEFT the first entry is
//             |cand.value|; for RIGHT it is |cand.content_value|.  The
//             special cases below may append further variants.
//
// Returns true if the rewrite is considered natural.  |output| may already
// contain entries when false is returned; the callers in this file skip the
// candidate in that case and clear the vector before reusing it.
IsNaturalContent(const Segment::Candidate & cand,const Segment::Candidate & top_cand,SegmentLookupType type,std::vector<string> * output)184 bool IsNaturalContent(const Segment::Candidate &cand,
185 const Segment::Candidate &top_cand,
186 SegmentLookupType type,
187 std::vector<string> *output) {
188 const string &content = cand.content_value;
189 const string &value = cand.value;
190 const string &top_content = top_cand.content_value;
191 const string &top_value = top_cand.value;
192
// NOTE(review): Util::CharsLen presumably counts characters rather than
// bytes -- confirm against base/util.h.
193 const size_t top_content_len = Util::CharsLen(top_content);
194 const size_t content_len = Util::CharsLen(content);
195
// For the right segment, never replace a top candidate whose content has two
// or more characters with a different candidate whose content is a single
// character.
196 if (type == RIGHT &&
197 value != top_value &&
198 top_content_len >= 2 &&
199 content_len == 1) {
200 return false;
201 }
202
// Base lookup string: the whole value for LEFT, the content part for RIGHT.
203 if (type == LEFT) {
204 output->push_back(value);
205 } else {
206 output->push_back(content);
207 // "舞って" workaround
208 // V+"て" is often treated as one compound.
// So the content without its final "て" is offered as a lookup string too.
209 static const char kPat[] = "て";
210 if (Util::EndsWith(content, StringPiece(kPat, arraysize(kPat) - 1))) {
211 PushBackStringPiece(
212 Util::SubStringPiece(content, 0, content_len - 1), output);
213 }
214 }
215
216 // we don't rewrite NUMBER to others and vice versa
217 if (ContainsNumber(value) != ContainsNumber(top_value)) {
218 return false;
219 }
220
// The "aux" part is whatever follows the content part inside the value.
221 const StringPiece top_aux_value =
222 Util::SubStringPiece(top_value, top_content_len, string::npos);
223 const size_t top_aux_value_len = Util::CharsLen(top_aux_value);
224 const Util::ScriptType top_value_script_type = Util::GetScriptType(top_value);
225
226 // we don't rewrite KATAKANA segment
227 // for example, we don't rewrite "コーヒー飲みます" to "珈琲飲みます"
228 if (type == LEFT &&
229 top_aux_value_len == 0 &&
230 top_value != value &&
231 top_value_script_type == Util::KATAKANA) {
232 return false;
233 }
234
235 // special cases
// If the top content is the single character "お", "御" or "ご", the
// candidate is accepted without any of the checks below.
236 if (top_content_len == 1) {
237 const char *begin = top_content.data();
238 const char *end = top_content.data() + top_content.size();
239 size_t mblen = 0;
240 const char32 wchar = Util::UTF8ToUCS4(begin, end, &mblen);
241
242 switch (wchar) {
243 case 0x304a: // "お"
244 case 0x5fa1: // "御"
245 case 0x3054: // "ご"
246 return true;
247 default:
248 break;
249 }
250 }
251
252 const StringPiece aux_value =
253 Util::SubStringPiece(value, content_len, string::npos);
254
255 // Remove number in normalization for the left segment.
256 string aux_normalized, top_aux_normalized;
257 CollocationUtil::GetNormalizedScript(
258 aux_value, (type == LEFT), &aux_normalized);
259 CollocationUtil::GetNormalizedScript(
260 top_aux_value, (type == LEFT), &top_aux_normalized);
// A normalized aux part that is non-empty and not pure Hiragana is rejected
// for RIGHT; for LEFT it must equal the top candidate's normalized aux part.
261 if (!aux_normalized.empty() &&
262 !Util::IsScriptType(aux_normalized, Util::HIRAGANA)) {
263 if (type == RIGHT) {
264 return false;
265 }
266 if (aux_normalized != top_aux_normalized) {
267 return false;
268 }
269 }
270
// Add lookup strings for compounds that were converted as a single segment.
271 ResolveCompoundSegment(top_value, value, type, output);
272
273 const size_t aux_value_len = Util::CharsLen(aux_value);
274 const size_t value_len = Util::CharsLen(value);
275
276 // "<XXいる|>" can be rewritten to "<YY|いる>" and vice versa
277 {
278 static const char kPat[] = "いる"; // "いる"
279 const StringPiece kSuffix(kPat, arraysize(kPat) - 1);
280 if (top_aux_value_len == 0 &&
281 aux_value_len == 2 &&
282 Util::EndsWith(top_value, kSuffix) &&
283 Util::EndsWith(aux_value, kSuffix)) {
284 if (type == RIGHT) {
285 // "YYいる" in addition to "YY"
286 output->push_back(value);
287 }
288 return true;
289 }
290 if (aux_value_len == 0 &&
291 top_aux_value_len == 2 &&
292 Util::EndsWith(value, kSuffix) &&
293 Util::EndsWith(top_aux_value, kSuffix)) {
294 if (type == RIGHT) {
295 // "YY" in addition to "YYいる"
296 PushBackStringPiece(
297 Util::SubStringPiece(value, 0, value_len - 2), output);
298 }
299 return true;
300 }
301 }
302
303 // "<XXせる|>" can be rewritten to "<YY|せる>" and vice versa
// Same two checks as the "いる" case above, with the suffix "せる".
304 {
305 const char kPat[] = "せる";
306 const StringPiece kSuffix(kPat, arraysize(kPat) - 1);
307 if (top_aux_value_len == 0 &&
308 aux_value_len == 2 &&
309 Util::EndsWith(top_value, kSuffix) &&
310 Util::EndsWith(aux_value, kSuffix)) {
311 if (type == RIGHT) {
312 // "YYせる" in addition to "YY"
313 output->push_back(value);
314 }
315 return true;
316 }
317 if (aux_value_len == 0 &&
318 top_aux_value_len == 2 &&
319 Util::EndsWith(value, kSuffix) &&
320 Util::EndsWith(top_aux_value, kSuffix)) {
321 if (type == RIGHT) {
322 // "YY" in addition to "YYせる"
323 PushBackStringPiece(
324 Util::SubStringPiece(value, 0, value_len - 2), output);
325 }
326 return true;
327 }
328 }
329
330 const Util::ScriptType content_script_type = Util::GetScriptType(content);
331
332 // "<XX|する>" can be rewritten using "<XXす|る>" and "<XX|する>"
333 // in "<XX|する>", XX must be single script type
334 {
335 static const char kPat[] = "する";
336 const StringPiece kSuffix(kPat, arraysize(kPat) - 1);
337 if (aux_value_len == 2 &&
338 Util::EndsWith(aux_value, kSuffix)) {
339 if (content_script_type != Util::KATAKANA &&
340 content_script_type != Util::HIRAGANA &&
341 content_script_type != Util::KANJI &&
342 content_script_type != Util::ALPHABET) {
343 return false;
344 }
345 if (type == RIGHT) {
346 // "YYす" in addition to "YY"
347 PushBackStringPiece(
348 Util::SubStringPiece(value, 0, value_len - 1), output);
349 }
350 return true;
351 }
352 }
353
354 // "<XXる>" can be rewritten using "<XX|る>"
355 // "まとめる", "衰える"
356 {
357 static const char kPat[] = "る";
358 const StringPiece kSuffix(kPat, arraysize(kPat) - 1);
359 if (aux_value_len == 0 &&
360 Util::EndsWith(value, kSuffix)) {
361 if (type == RIGHT) {
362 // "YY" in addition to "YYる"
363 PushBackStringPiece(
364 Util::SubStringPiece(value, 0, value_len - 1), output);
365 }
366 return true;
367 }
368 }
369
370 // "<XXす>" can be rewritten using "XXする"
// Applies only when everything before the final "す" passes the Kanji check.
371 {
372 static const char kPat[] = "す";
373 const StringPiece kSuffix(kPat, arraysize(kPat) - 1);
374 if (Util::EndsWith(value, kSuffix) &&
375 Util::IsScriptType(
376 Util::SubStringPiece(value, 0, value_len - 1),
377 Util::KANJI)) {
378 if (type == RIGHT) {
379 const char kRu[] = "る";
380 // "YYする" in addition to "YY"
381 PushBackJoinedStringPieces(
382 value, StringPiece(kRu, arraysize(kRu) - 1), output);
383 }
384 return true;
385 }
386 }
387
388 // "<XXし|た>" can be rewritten using "<XX|した>"
389 {
390 static const char kPat[] = "した";
// kShi and kTa take the first and second 3 bytes of kPat.
// NOTE(review): this assumes the literal is UTF-8 encoded, where each of
// these Hiragana occupies 3 bytes -- confirm the build's source charset.
391 const StringPiece kShi(kPat, 3), kTa(kPat + 3, 3);
392 if (Util::EndsWith(content, kShi) &&
393 aux_value == kTa &&
394 Util::EndsWith(top_content, kShi) &&
395 top_aux_value == kTa) {
396 if (type == RIGHT) {
397 const StringPiece val =
398 Util::SubStringPiece(content, 0, content_len - 1);
399 // XX must be KANJI
400 if (Util::IsScriptType(val, Util::KANJI)) {
401 PushBackStringPiece(val, output);
402 }
403 }
404 return true;
405 }
406 }
407
// From here on, only candidates whose aux part has the same length (in
// characters) as the top candidate's aux part can be accepted.
408 const int aux_len = value_len - content_len;
409 const int top_aux_len = Util::CharsLen(top_value) - top_content_len;
410 if (aux_len != top_aux_len) {
411 return false;
412 }
413
414 const Util::ScriptType top_content_script_type =
415 Util::GetScriptType(top_content);
416
417 // we don't rewrite HIRAGANA to KATAKANA
418 if (top_content_script_type == Util::HIRAGANA &&
419 content_script_type == Util::KATAKANA) {
420 return false;
421 }
422
423 // we don't rewrite second KATAKANA
424 // for example, we don't rewrite "このコーヒー" to "この珈琲"
425 if (type == RIGHT &&
426 top_content_script_type == Util::KATAKANA &&
427 value != top_value) {
428 return false;
429 }
430
// A top candidate whose content is a single Hiragana character is rejected
// here for every candidate.
431 if (top_content_len == 1 &&
432 top_content_script_type == Util::HIRAGANA) {
433 return false;
434 }
435
436 // suppress "<身|ています>" etc.
// i.e. do not swap one single-Kanji content for a different single-Kanji
// content when both aux parts have two or more characters.
437 if (top_content_len == 1 &&
438 content_len == 1 &&
439 top_aux_value_len >= 2 &&
440 aux_value_len >= 2 &&
441 top_content_script_type == Util::KANJI &&
442 content_script_type == Util::KANJI &&
443 top_content != content) {
444 return false;
445 }
446
447 return true;
448 }
449
450 // Just a wrapper of IsNaturalContent for debug.
VerifyNaturalContent(const Segment::Candidate & cand,const Segment::Candidate & top_cand,SegmentLookupType type)451 bool VerifyNaturalContent(const Segment::Candidate &cand,
452 const Segment::Candidate &top_cand,
453 SegmentLookupType type) {
454 std::vector<string> nexts;
455 return IsNaturalContent(cand, top_cand, RIGHT, &nexts);
456 }
457
IsKeyUnknown(const Segment & seg)458 inline bool IsKeyUnknown(const Segment &seg) {
459 return Util::IsScriptType(seg.key(), Util::UNKNOWN_SCRIPT);
460 }
461
462 } // namespace
463
RewriteCollocation(Segments * segments) const464 bool CollocationRewriter::RewriteCollocation(Segments *segments) const {
465 // return false if at least one segment is fixed.
466 for (size_t i = segments->history_segments_size();
467 i < segments->segments_size(); ++i) {
468 if (segments->segment(i).segment_type() == Segment::FIXED_VALUE) {
469 return false;
470 }
471 }
472
473 std::vector<bool> segs_changed(segments->segments_size(), false);
474 bool changed = false;
475
476 for (size_t i = segments->history_segments_size();
477 i < segments->segments_size(); ++i) {
478 bool rewrited_next = false;
479
480 if (IsKeyUnknown(segments->segment(i))) {
481 continue;
482 }
483
484 if (i + 1 < segments->segments_size() &&
485 RewriteUsingNextSegment(segments->mutable_segment(i + 1),
486 segments->mutable_segment(i))) {
487 changed = true;
488 rewrited_next = true;
489 segs_changed[i] = true;
490 segs_changed[i + 1] = true;
491 }
492
493 if (!segs_changed[i] &&
494 !rewrited_next &&
495 i > 0 &&
496 RewriteFromPrevSegment(segments->segment(i - 1).candidate(0),
497 segments->mutable_segment(i))) {
498 changed = true;
499 segs_changed[i - 1] = true;
500 segs_changed[i] = true;
501 }
502
503 const Segment::Candidate &cand = segments->segment(i).candidate(0);
504 if (i >= 2 &&
505 // Cross over only adverbs
506 // Segment is adverb if;
507 // 1) lid and rid is adverb.
508 // 2) or rid is adverb suffix.
509 ((pos_matcher_.IsAdverb(segments->segment(i - 1).candidate(0).lid) &&
510 pos_matcher_.IsAdverb(segments->segment(i - 1).candidate(0).rid)) ||
511 pos_matcher_.IsAdverbSegmentSuffix(
512 segments->segment(i - 1).candidate(0).rid)) &&
513 (cand.content_value != cand.value ||
514 cand.value != "・")) { // "・" workaround
515 if (!segs_changed[i - 2] &&
516 !segs_changed[i] &&
517 RewriteUsingNextSegment(segments->mutable_segment(i),
518 segments->mutable_segment(i - 2))) {
519 changed = true;
520 segs_changed[i] = true;
521 segs_changed[i - 2] = true;
522 } else if (!segs_changed[i] &&
523 RewriteFromPrevSegment(
524 segments->segment(i - 2).candidate(0),
525 segments->mutable_segment(i))) {
526 changed = true;
527 segs_changed[i] = true;
528 segs_changed[i - 2] = true;
529 }
530 }
531 }
532
533 return changed;
534 }
535
536 class CollocationRewriter::CollocationFilter {
537 public:
CollocationFilter(const char * existence_data,size_t size)538 CollocationFilter(const char *existence_data, size_t size)
539 : filter_(ExistenceFilter::Read(existence_data, size)) {
540 }
~CollocationFilter()541 ~CollocationFilter() {
542 }
543
Exists(const string & left,const string & right) const544 bool Exists(const string &left, const string &right) const {
545 if (left.empty() || right.empty()) {
546 return false;
547 }
548 string key;
549 key.reserve(left.size() + right.size());
550 key.assign(left).append(right);
551 const uint64 id = Hash::Fingerprint(key);
552 return filter_->Exists(id);
553 }
554
555 private:
556 std::unique_ptr<ExistenceFilter> filter_;
557
558 DISALLOW_COPY_AND_ASSIGN(CollocationFilter);
559 };
560
561 class CollocationRewriter::SuppressionFilter {
562 public:
SuppressionFilter(const char * suppression_data,size_t size)563 SuppressionFilter(const char *suppression_data, size_t size)
564 : filter_(ExistenceFilter::Read(suppression_data, size)) {
565 }
~SuppressionFilter()566 ~SuppressionFilter() {
567 }
568
Exists(const Segment::Candidate & cand) const569 bool Exists(const Segment::Candidate &cand) const {
570 // TODO(noriyukit): We should share key generation rule with
571 // gen_collocation_suppression_data_main.cc.
572 string key;
573 key.reserve(cand.content_value.size() + 1 + cand.content_key.size());
574 key.assign(cand.content_value).append("\t").append(cand.content_key);
575 const uint64 id = Hash::Fingerprint(key);
576 return filter_->Exists(id);
577 }
578
579 private:
580 std::unique_ptr<ExistenceFilter> filter_;
581
582 DISALLOW_COPY_AND_ASSIGN(SuppressionFilter);
583 };
584
CollocationRewriter(const DataManagerInterface * data_manager)585 CollocationRewriter::CollocationRewriter(
586 const DataManagerInterface *data_manager)
587 : pos_matcher_(data_manager->GetPOSMatcherData()),
588 first_name_id_(pos_matcher_.GetFirstNameId()),
589 last_name_id_(pos_matcher_.GetLastNameId()) {
590 const char *data = NULL;
591 size_t size = 0;
592
593 data_manager->GetCollocationData(&data, &size);
594 collocation_filter_.reset(new CollocationFilter(data, size));
595
596 data_manager->GetCollocationSuppressionData(&data, &size);
597 suppression_filter_.reset(new SuppressionFilter(data, size));
598 }
599
~CollocationRewriter()600 CollocationRewriter::~CollocationRewriter() {}
601
Rewrite(const ConversionRequest & request,Segments * segments) const602 bool CollocationRewriter::Rewrite(const ConversionRequest &request,
603 Segments *segments) const {
604 if (!FLAGS_use_collocation) {
605 return false;
606 }
607 return RewriteCollocation(segments);
608 }
609
IsName(const Segment::Candidate & cand) const610 bool CollocationRewriter::IsName(const Segment::Candidate &cand) const {
611 const bool ret = (cand.lid == last_name_id_ || cand.lid == first_name_id_);
612 VLOG_IF(3, ret) << cand.value << " is name sagment";
613 return ret;
614 }
615
RewriteFromPrevSegment(const Segment::Candidate & prev_cand,Segment * seg) const616 bool CollocationRewriter::RewriteFromPrevSegment(
617 const Segment::Candidate &prev_cand,
618 Segment *seg) const {
619 string prev;
620 CollocationUtil::GetNormalizedScript(prev_cand.value, true, &prev);
621
622 const size_t i_max = std::min(seg->candidates_size(), kCandidateSize);
623
624 // Reuse |curs| and |cur| in the loop as this method is performance critical.
625 std::vector<string> curs;
626 string cur;
627 for (size_t i = 0; i < i_max; ++i) {
628 if (seg->candidate(i).cost > seg->candidate(0).cost + kMaxCostDiff) {
629 continue;
630 }
631 if (IsName(seg->candidate(i))) {
632 continue;
633 }
634 if (suppression_filter_->Exists(seg->candidate(i))) {
635 continue;
636 }
637 curs.clear();
638 if (!IsNaturalContent(seg->candidate(i), seg->candidate(0), RIGHT, &curs)) {
639 continue;
640 }
641
642 for (int j = 0; j < curs.size(); ++j) {
643 cur.clear();
644 CollocationUtil::GetNormalizedScript(curs[j], false, &cur);
645 if (collocation_filter_->Exists(prev, cur)) {
646 VLOG_IF(3, i != 0) << prev << cur << " "
647 << seg->candidate(0).value << "->"
648 << seg->candidate(i).value;
649 seg->move_candidate(i, 0);
650 seg->mutable_candidate(0)->attributes
651 |= Segment::Candidate::CONTEXT_SENSITIVE;
652 return true;
653 }
654 }
655 }
656 return false;
657 }
658
RewriteUsingNextSegment(Segment * next_seg,Segment * seg) const659 bool CollocationRewriter::RewriteUsingNextSegment(Segment *next_seg,
660 Segment *seg) const {
661 const size_t i_max = std::min(seg->candidates_size(), kCandidateSize);
662 const size_t j_max = std::min(next_seg->candidates_size(), kCandidateSize);
663
664 // Cache the results for the next segment
665 std::vector<int> next_seg_ok(j_max); // Avoiding std::vector<bool>
666 std::vector<std::vector<string> > normalized_string(j_max);
667
668 // Reuse |nexts| in the loop as this method is performance critical.
669 std::vector<string> nexts;
670 for (size_t j = 0; j < j_max; ++j) {
671 next_seg_ok[j] = 0;
672
673 if (IsName(next_seg->candidate(j))) {
674 continue;
675 }
676 if (suppression_filter_->Exists(next_seg->candidate(j))) {
677 continue;
678 }
679 nexts.clear();
680 if (!IsNaturalContent(next_seg->candidate(j),
681 next_seg->candidate(0), RIGHT, &nexts)) {
682 continue;
683 }
684
685 next_seg_ok[j] = 1;
686 for (std::vector<string>::const_iterator it = nexts.begin();
687 it != nexts.end(); ++it) {
688 normalized_string[j].push_back(string());
689 CollocationUtil::GetNormalizedScript(
690 *it, false, &normalized_string[j].back());
691 }
692 }
693
694 // Reuse |curs| and |cur| in the loop as this method is performance critical.
695 std::vector<string> curs;
696 string cur;
697 for (size_t i = 0; i < i_max; ++i) {
698 if (seg->candidate(i).cost > seg->candidate(0).cost + kMaxCostDiff) {
699 continue;
700 }
701 if (IsName(seg->candidate(i))) {
702 continue;
703 }
704 if (suppression_filter_->Exists(seg->candidate(i))) {
705 continue;
706 }
707 curs.clear();
708 if (!IsNaturalContent(seg->candidate(i), seg->candidate(0), LEFT, &curs)) {
709 continue;
710 }
711
712 for (int k = 0; k < curs.size(); ++k) {
713 cur.clear();
714 CollocationUtil::GetNormalizedScript(curs[k], true, &cur);
715 for (size_t j = 0; j < j_max; ++j) {
716 if (next_seg->candidate(j).cost >
717 next_seg->candidate(0).cost + kMaxCostDiff) {
718 continue;
719 }
720 if (!next_seg_ok[j]) {
721 continue;
722 }
723
724 for (int l = 0; l < normalized_string[j].size(); ++l) {
725 const string &next = normalized_string[j][l];
726 if (collocation_filter_->Exists(cur, next)) {
727 DCHECK(VerifyNaturalContent(
728 next_seg->candidate(j), next_seg->candidate(0), RIGHT))
729 << "IsNaturalContent() should not fail here.";
730 seg->move_candidate(i, 0);
731 seg->mutable_candidate(0)->attributes
732 |= Segment::Candidate::CONTEXT_SENSITIVE;
733 next_seg->move_candidate(j, 0);
734 next_seg->mutable_candidate(0)->attributes
735 |= Segment::Candidate::CONTEXT_SENSITIVE;
736 return true;
737 }
738 }
739 }
740 }
741 }
742 return false;
743 }
744
745 } // namespace mozc
746