1 // Copyright 2010-2018, Google Inc.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
6 // met:
7 //
8 //     * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 //     * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following disclaimer
12 // in the documentation and/or other materials provided with the
13 // distribution.
14 //     * Neither the name of Google Inc. nor the names of its
15 // contributors may be used to endorse or promote products derived from
16 // this software without specific prior written permission.
17 //
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 
30 #include "converter/lattice.h"
31 
32 #include <algorithm>
33 #include <sstream>
34 #include <string>
35 #include <vector>
36 
37 #include "base/logging.h"
38 #include "base/port.h"
39 #include "base/singleton.h"
40 #include "base/util.h"
41 #include "converter/node.h"
42 #include "converter/node_allocator.h"
43 
44 namespace mozc {
45 namespace {
46 
InitBOSNode(Lattice * lattice,uint16 length)47 Node *InitBOSNode(Lattice *lattice, uint16 length) {
48   Node *bos_node = lattice->NewNode();
49   DCHECK(bos_node);
50   bos_node->rid = 0;  // 0 is reserved for EOS/BOS
51   bos_node->lid = 0;
52   bos_node->key.clear();
53   bos_node->value = "BOS";
54   bos_node->node_type = Node::BOS_NODE;
55   bos_node->wcost = 0;
56   bos_node->cost = 0;
57   bos_node->begin_pos = length;
58   bos_node->end_pos = length;
59   bos_node->enext = NULL;
60   return bos_node;
61 }
62 
InitEOSNode(Lattice * lattice,uint16 length)63 Node *InitEOSNode(Lattice *lattice, uint16 length) {
64   Node *eos_node = lattice->NewNode();
65   DCHECK(eos_node);
66   eos_node->rid = 0;  // 0 is reserved for EOS/BOS
67   eos_node->lid = 0;
68   eos_node->key.clear();
69   eos_node->value = "EOS";
70   eos_node->node_type = Node::EOS_NODE;
71   eos_node->wcost = 0;
72   eos_node->cost = 0;
73   eos_node->begin_pos = length;
74   eos_node->end_pos = length;
75   eos_node->bnext = NULL;
76   return eos_node;
77 }
78 
PathContainsString(const Node * node,size_t begin_pos,size_t end_pos,const string & str)79 bool PathContainsString(const Node *node, size_t begin_pos, size_t end_pos,
80                         const string &str) {
81   CHECK(node);
82   for (; node->prev != NULL; node = node->prev) {
83     if (node->begin_pos == begin_pos && node->end_pos == end_pos &&
84         node->value == str) {
85       return true;
86     }
87   }
88   return false;
89 }
90 
GetDebugStringForNode(const Node * node,const Node * prev_node)91 string GetDebugStringForNode(const Node *node, const Node *prev_node) {
92   CHECK(node);
93   std::stringstream os;
94   os << "[con:" << node->cost - (prev_node ? prev_node->cost : 0) -
95       node->wcost << "]";
96   os << "[lid:" << node->lid << "]";
97   os << "\"" << node->value << "\"";
98   os << "[wcost:" << node->wcost << "]";
99   os << "[cost:" << node->cost << "]";
100   os << "[rid:" << node->rid << "]";
101   return os.str();
102 }
103 
GetDebugStringForPath(const Node * end_node)104 string GetDebugStringForPath(const Node *end_node) {
105   CHECK(end_node);
106   std::stringstream os;
107   std::vector<const Node *> node_vector;
108 
109   for (const Node *node = end_node; node; node = node->prev) {
110     node_vector.push_back(node);
111   }
112   const Node *prev_node = NULL;
113 
114   for (int i = static_cast<int>(node_vector.size()) - 1; i >= 0; --i) {
115     const Node *node = node_vector[i];
116     os << GetDebugStringForNode(node, prev_node);
117     prev_node = node;
118   }
119   return os.str();
120 }
121 
GetCommonPrefix(const string & str1,const string & str2)122 string GetCommonPrefix(const string &str1, const string &str2) {
123   std::vector<string> split1, split2;
124   Util::SplitStringToUtf8Chars(str1, &split1);
125   Util::SplitStringToUtf8Chars(str2, &split2);
126   string common_prefix = "";
127   for (int i = 0; i < std::min(split1.size(), split2.size()); ++i) {
128     if (split1[i] == split2[i]) {
129       common_prefix += split1[i];
130     } else {
131       break;
132     }
133   }
134   return common_prefix;
135 }
136 
137 }  // namespace
138 
139 struct LatticeDisplayNodeInfo {
140   size_t display_node_begin_pos_;
141   size_t display_node_end_pos_;
142   string display_node_str_;
143 };
144 
Lattice()145 Lattice::Lattice() : history_end_pos_(0), node_allocator_(new NodeAllocator) {}
146 
~Lattice()147 Lattice::~Lattice() {}
148 
node_allocator() const149 NodeAllocator *Lattice::node_allocator() const {
150   return node_allocator_.get();
151 }
152 
NewNode()153 Node *Lattice::NewNode() {
154   return node_allocator_->NewNode();
155 }
156 
begin_nodes(size_t pos) const157 Node *Lattice::begin_nodes(size_t pos) const {
158   return begin_nodes_[pos];
159 }
160 
end_nodes(size_t pos) const161 Node *Lattice::end_nodes(size_t pos) const {
162   return end_nodes_[pos];
163 }
164 
SetKey(StringPiece key)165 void Lattice::SetKey(StringPiece key) {
166   Clear();
167   key_.assign(key.data(), key.size());
168   const size_t size = key.size();
169   begin_nodes_.resize(size + 4);
170   end_nodes_.resize(size + 4);
171   cache_info_.resize(size + 4);
172 
173   std::fill(begin_nodes_.begin(), begin_nodes_.end(),
174             static_cast<Node *>(NULL));
175   std::fill(end_nodes_.begin(), end_nodes_.end(), static_cast<Node *>(NULL));
176   std::fill(cache_info_.begin(), cache_info_.end(), 0);
177 
178   end_nodes_[0] = InitBOSNode(this,
179                               static_cast<uint16>(0));
180   begin_nodes_[key_.size()] =
181       InitEOSNode(this, static_cast<uint16>(key_.size()));
182 }
183 
bos_nodes() const184 Node *Lattice::bos_nodes() const {
185   return end_nodes_[0];
186 }
187 
eos_nodes() const188 Node *Lattice::eos_nodes() const {
189   return begin_nodes_[key_.size()];
190 }
191 
Insert(size_t pos,Node * node)192 void Lattice::Insert(size_t pos, Node *node) {
193   for (Node *rnode = node; rnode != NULL; rnode = rnode->bnext) {
194     const size_t end_pos = std::min(rnode->key.size() + pos, key_.size());
195     rnode->begin_pos = static_cast<uint16>(pos);
196     rnode->end_pos = static_cast<uint16>(end_pos);
197     rnode->prev = NULL;
198     rnode->next = NULL;
199     rnode->cost = 0;
200     rnode->enext = end_nodes_[end_pos];
201     end_nodes_[end_pos] = rnode;
202   }
203 
204   if (begin_nodes_[pos] == NULL) {
205     begin_nodes_[pos] = node;
206   } else {
207     for (Node *rnode = node; rnode != NULL; rnode = rnode->bnext) {
208       if (rnode->bnext == NULL) {
209         rnode->bnext = begin_nodes_[pos];
210         begin_nodes_[pos] = node;
211         break;
212       }
213     }
214   }
215 }
216 
key() const217 const string &Lattice::key() const {
218   return key_;
219 }
220 
has_lattice() const221 bool Lattice::has_lattice() const {
222   return !begin_nodes_.empty();
223 }
224 
Clear()225 void Lattice::Clear() {
226   key_.clear();
227   begin_nodes_.clear();
228   end_nodes_.clear();
229   node_allocator_->Free();
230   cache_info_.clear();
231   history_end_pos_ = 0;
232 }
233 
SetDebugDisplayNode(size_t begin_pos,size_t end_pos,const string & str)234 void Lattice::SetDebugDisplayNode(size_t begin_pos, size_t end_pos,
235                                   const string &str) {
236   LatticeDisplayNodeInfo *info = Singleton<LatticeDisplayNodeInfo>::get();
237   info->display_node_begin_pos_ = begin_pos;
238   info->display_node_end_pos_ = end_pos;
239   info->display_node_str_ = str;
240 }
241 
ResetDebugDisplayNode()242 void Lattice::ResetDebugDisplayNode() {
243   LatticeDisplayNodeInfo *info = Singleton<LatticeDisplayNodeInfo>::get();
244   info->display_node_str_.clear();
245 }
246 
set_history_end_pos(size_t pos)247 void Lattice::set_history_end_pos(size_t pos) {
248   history_end_pos_ = pos;
249 }
250 
history_end_pos() const251 size_t Lattice::history_end_pos() const {
252   return history_end_pos_;
253 }
254 
UpdateKey(const string & new_key)255 void Lattice::UpdateKey(const string &new_key) {
256   const string old_key = key_;
257   const string common_prefix = GetCommonPrefix(new_key, old_key);
258 
259   // if the length of common prefix is too short, call SetKey
260   if (common_prefix.size() <= old_key.size() / 2) {
261     SetKey(new_key);
262     return;
263   }
264 
265   // if node_allocator has many nodes, then clean up
266   const size_t size_threshold = node_allocator_->max_nodes_size();
267   if (node_allocator_->node_count() > size_threshold) {
268     SetKey(new_key);
269     return;
270   }
271 
272   // erase the suffix of old_key so that the key becomes common_prefix
273   ShrinkKey(common_prefix.size());
274   // add a suffix so that the key becomes new_key
275   AddSuffix(new_key.substr(common_prefix.size()));
276 }
277 
AddSuffix(const string & suffix_key)278 void Lattice::AddSuffix(const string &suffix_key) {
279   if (suffix_key.empty()) {
280     return;
281   }
282   const size_t old_size = key_.size();
283   const size_t new_size = key_.size() + suffix_key.size();
284 
285   // update begin_nodes and end_nodes
286   begin_nodes_.resize(new_size + 4);
287   end_nodes_.resize(new_size + 4);
288 
289   std::fill(begin_nodes_.begin() + old_size, begin_nodes_.end(),
290             static_cast<Node *>(NULL));
291   std::fill(end_nodes_.begin() + old_size + 1, end_nodes_.end(),
292             static_cast<Node *>(NULL));
293 
294   end_nodes_[0] = InitBOSNode(this,
295                               static_cast<uint16>(0));
296   begin_nodes_[new_size] =
297       InitEOSNode(this, static_cast<uint16>(new_size));
298 
299   // update cache_info
300   cache_info_.resize(new_size + 4, 0);
301 
302   // update key
303   key_ += suffix_key;
304 }
305 
ShrinkKey(const size_t new_len)306 void Lattice::ShrinkKey(const size_t new_len) {
307   const size_t old_len = key_.size();
308   CHECK_LE(new_len, old_len);
309   if (new_len == old_len) {
310     return;
311   }
312 
313   // erase nodes whose end position exceeds new_len
314   for (size_t i = 0; i < new_len; ++i) {
315     Node *begin = begin_nodes_[i];
316     if (begin == NULL) {
317       continue;
318     }
319 
320     for (Node *prev = begin, *curr = begin->bnext; curr != NULL; ) {
321       CHECK(prev);
322       if (curr->end_pos > new_len) {
323         prev->bnext = curr->bnext;
324         curr = curr->bnext;
325       } else {
326         prev = prev->bnext;
327         curr = curr->bnext;
328       }
329     }
330     if (begin->end_pos > new_len) {
331       begin_nodes_[i] = begin->bnext;
332     }
333   }
334 
335   // update begin_nodes and end_nodes
336   for (size_t i = new_len; i <= old_len; ++i) {
337     begin_nodes_[i] = NULL;
338   }
339   for (size_t i = new_len + 1; i <= old_len; ++i) {
340     end_nodes_[i] = NULL;
341   }
342   begin_nodes_[new_len] =
343       InitEOSNode(this, static_cast<uint16>(new_len));
344 
345   // update cache_info
346   for (size_t i = 0; i < new_len; ++i) {
347     cache_info_[i] = std::min(cache_info_[i], new_len - i);
348   }
349   std::fill(cache_info_.begin() + new_len, cache_info_.end(), 0);
350 
351   // update key
352   key_.erase(new_len);
353 }
354 
cache_info(const size_t pos) const355 size_t Lattice::cache_info(const size_t pos) const {
356   CHECK_LE(pos, key_.size());
357   return cache_info_[pos];
358 }
359 
SetCacheInfo(const size_t pos,const size_t len)360 void Lattice::SetCacheInfo(const size_t pos, const size_t len) {
361   CHECK_LE(pos, key_.size());
362   cache_info_[pos] = len;
363 }
364 
ResetNodeCost()365 void Lattice::ResetNodeCost() {
366   for (size_t i = 0; i <= key_.size(); ++i) {
367     if (begin_nodes_[i] != NULL) {
368       Node *prev = NULL;
369       for (Node *node = begin_nodes_[i]; node != NULL; node = node->bnext) {
370         // do not process BOS / EOS nodes
371         if (node->node_type == Node::BOS_NODE ||
372             node->node_type == Node::EOS_NODE) {
373           continue;
374         }
375         // if the node has ENABLE_CACHE attribute, then revert its wcost.
376         // Otherwise, erase the node from the lattice.
377         if (node->attributes & Node::ENABLE_CACHE) {
378           node->wcost = node->raw_wcost;
379         } else {
380           if (node == begin_nodes_[i]) {
381             if (node->bnext == NULL) {
382               begin_nodes_[i] = NULL;
383             } else {
384               begin_nodes_[i] = node->bnext;
385             }
386           } else {
387             CHECK(prev);
388             CHECK_EQ(prev->bnext, node);
389             prev->next = node->bnext;
390           }
391         }
392         // traverse a next node
393         prev = node;
394       }
395     }
396 
397     if (end_nodes_[i] != NULL) {
398       Node *prev = NULL;
399       for (Node *node = end_nodes_[i]; node != NULL; node = node->enext) {
400         if (node->node_type == Node::BOS_NODE ||
401             node->node_type == Node::EOS_NODE) {
402           continue;
403         }
404         if (node->attributes & Node::ENABLE_CACHE) {
405           node->wcost = node->raw_wcost;
406         } else {
407           if (node == end_nodes_[i]) {
408             if (node->enext == NULL) {
409               end_nodes_[i] = NULL;
410             } else {
411               end_nodes_[i] = node->enext;
412             }
413           } else {
414             CHECK(prev);
415             CHECK_EQ(prev->enext, node);
416             prev->next = node->enext;
417           }
418         }
419         prev = node;
420       }
421     }
422   }
423 }
424 
DebugString() const425 string Lattice::DebugString() const {
426   std::stringstream os;
427   if (!has_lattice()) {
428     return "";
429   }
430 
431   std::vector<const Node *> best_path_nodes;
432 
433   const Node *node = eos_nodes();
434   // Print the best path
435   os << "Best path: ";
436   os << GetDebugStringForPath(node);
437   os << std::endl;
438 
439   LatticeDisplayNodeInfo *info = Singleton<LatticeDisplayNodeInfo>::get();
440 
441   if (info->display_node_str_.empty()) {
442     return os.str();
443   }
444 
445   for (; node != NULL; node = node->prev) {
446     best_path_nodes.push_back(node);
447   }
448 
449   // Print tha path that contains the designated node
450   for (std::vector<const Node *>::const_iterator it = best_path_nodes.begin();
451        it != best_path_nodes.end(); ++it) {
452     const Node *best_path_node = *it;
453     if (best_path_node->begin_pos < info->display_node_end_pos_) {
454       break;
455     }
456     for (const Node *prev_node = end_nodes(best_path_node->begin_pos);
457          prev_node; prev_node = prev_node->enext) {
458       if (!PathContainsString(prev_node,
459                               info->display_node_begin_pos_,
460                               info->display_node_end_pos_,
461                               info->display_node_str_)) {
462         continue;
463       }
464       os << "The path " << GetDebugStringForPath(prev_node)
465          << " ( + connection cost + wcost: " << best_path_node->wcost << ")"
466          << std::endl
467          << "was defeated"
468          << " by the path " << std::endl
469          << GetDebugStringForPath(best_path_node->prev)
470          << " connecting to the node "
471          << GetDebugStringForNode(best_path_node, best_path_node->prev)
472          << std::endl;
473     }
474   }
475 
476   return os.str();
477 }
478 
479 }  // namespace mozc
480