1 // MeCab -- Yet Another Part-of-Speech and Morphological Analyzer
2 //
3 //
4 //  Copyright(C) 2001-2011 Taku Kudo <taku@chasen.org>
5 //  Copyright(C) 2004-2006 Nippon Telegraph and Telephone Corporation
6 #include <algorithm>
7 #include <iterator>
8 #include <cmath>
9 #include <cstring>
10 #include "common.h"
11 #include "connector.h"
12 #include "mecab.h"
13 #include "nbest_generator.h"
14 #include "param.h"
15 #include "viterbi.h"
16 #include "scoped_ptr.h"
17 #include "string_buffer.h"
18 #include "tokenizer.h"
19 
20 namespace MeCab {
21 
22 namespace {
calc_alpha(Node * n,double beta)23 void calc_alpha(Node *n, double beta) {
24   n->alpha = 0.0;
25   for (Path *path = n->lpath; path; path = path->lnext) {
26     n->alpha = logsumexp(n->alpha,
27                          -beta * path->cost + path->lnode->alpha,
28                          path == n->lpath);
29   }
30 }
31 
calc_beta(Node * n,double beta)32 void calc_beta(Node *n, double beta) {
33   n->beta = 0.0;
34   for (Path *path = n->rpath; path; path = path->rnext) {
35     n->beta = logsumexp(n->beta,
36                         -beta * path->cost + path->rnode->beta,
37                         path == n->rpath);
38   }
39 }
40 }  // namespace
41 
Viterbi()42 Viterbi::Viterbi()
43     :  tokenizer_(0), connector_(0),
44        cost_factor_(0) {}
45 
~Viterbi()46 Viterbi::~Viterbi() {}
47 
open(const Param & param)48 bool Viterbi::open(const Param &param) {
49   tokenizer_.reset(new Tokenizer<Node, Path>);
50   CHECK_FALSE(tokenizer_->open(param)) << tokenizer_->what();
51   CHECK_FALSE(tokenizer_->dictionary_info()) << "Dictionary is empty";
52 
53   connector_.reset(new Connector);
54   CHECK_FALSE(connector_->open(param)) << connector_->what();
55 
56   CHECK_FALSE(tokenizer_->dictionary_info()->lsize ==
57               connector_->left_size() &&
58               tokenizer_->dictionary_info()->rsize ==
59               connector_->right_size())
60       << "Transition table and dictionary are not compatible";
61 
62   cost_factor_ = param.get<int>("cost-factor");
63   if (cost_factor_ == 0) {
64     cost_factor_ = 800;
65   }
66 
67   return true;
68 }
69 
analyze(Lattice * lattice) const70 bool Viterbi::analyze(Lattice *lattice) const {
71   if (!lattice || !lattice->sentence()) {
72     return false;
73   }
74 
75   if (!initPartial(lattice)) {
76     return false;
77   }
78 
79   bool result = false;
80   if (lattice->has_request_type(MECAB_NBEST) ||
81       lattice->has_request_type(MECAB_MARGINAL_PROB)) {
82     // IsAllPath=true
83     if (lattice->has_constraint()) {
84       result = viterbi<true, true>(lattice);
85     } else {
86       result = viterbi<true, false>(lattice);
87     }
88   } else {
89     // IsAllPath=false
90     if (lattice->has_constraint()) {
91       result = viterbi<false, true>(lattice);
92     } else {
93       result = viterbi<false, false>(lattice);
94     }
95   }
96 
97   if (!result) {
98     return false;
99   }
100 
101   if (!forwardbackward(lattice)) {
102     return false;
103   }
104 
105   if (!buildBestLattice(lattice)) {
106     return false;
107   }
108 
109   if (!buildAllLattice(lattice)) {
110     return false;
111   }
112 
113   if (!initNBest(lattice)) {
114     return false;
115   }
116 
117   return true;
118 }
119 
tokenizer() const120 const Tokenizer<Node, Path> *Viterbi::tokenizer() const {
121   return tokenizer_.get();
122 }
123 
connector() const124 const Connector *Viterbi::connector() const {
125   return connector_.get();
126 }
127 
128 // static
forwardbackward(Lattice * lattice)129 bool Viterbi::forwardbackward(Lattice *lattice) {
130   if (!lattice->has_request_type(MECAB_MARGINAL_PROB)) {
131     return true;
132   }
133 
134   Node **end_node_list   = lattice->end_nodes();
135   Node **begin_node_list = lattice->begin_nodes();
136 
137   const size_t len = lattice->size();
138   const double theta = lattice->theta();
139 
140   end_node_list[0]->alpha = 0.0;
141   for (int pos = 0; pos <= static_cast<long>(len); ++pos) {
142     for (Node *node = begin_node_list[pos]; node; node = node->bnext) {
143       calc_alpha(node, theta);
144     }
145   }
146 
147   begin_node_list[len]->beta = 0.0;
148   for (int pos = static_cast<long>(len); pos >= 0; --pos) {
149     for (Node *node = end_node_list[pos]; node; node = node->enext) {
150       calc_beta(node, theta);
151     }
152   }
153 
154   const double Z = begin_node_list[len]->alpha;
155   lattice->set_Z(Z);  // alpha of EOS
156 
157   for (int pos = 0; pos <= static_cast<long>(len); ++pos) {
158     for (Node *node = begin_node_list[pos]; node; node = node->bnext) {
159       node->prob = std::exp(node->alpha + node->beta - Z);
160       for (Path *path = node->lpath; path; path = path->lnext) {
161         path->prob = std::exp(path->lnode->alpha
162                               - theta * path->cost
163                               + path->rnode->beta - Z);
164       }
165     }
166   }
167 
168   return true;
169 }
170 
171 // static
buildResultForNBest(Lattice * lattice)172 bool Viterbi::buildResultForNBest(Lattice *lattice) {
173   return buildAllLattice(lattice);
174 }
175 
176 // static
buildAllLattice(Lattice * lattice)177 bool Viterbi::buildAllLattice(Lattice *lattice) {
178   if (!lattice->has_request_type(MECAB_ALL_MORPHS)) {
179     return true;
180   }
181 
182   Node *prev = lattice->bos_node();
183   const size_t len = lattice->size();
184   Node **begin_node_list = lattice->begin_nodes();
185 
186   for (long pos = 0; pos <= static_cast<long>(len); ++pos) {
187     for (Node *node = begin_node_list[pos]; node; node = node->bnext) {
188       prev->next = node;
189       node->prev = prev;
190       prev = node;
191     }
192   }
193 
194   return true;
195 }
196 
197 // static
buildAlternative(Lattice * lattice)198 bool Viterbi::buildAlternative(Lattice *lattice) {
199   Node **begin_node_list = lattice->begin_nodes();
200 
201   const Node *bos_node = lattice->bos_node();
202   for (const Node *node = bos_node; node; node = node->next) {
203     if (node->stat == MECAB_BOS_NODE || node->stat == MECAB_EOS_NODE) {
204       continue;
205     }
206     const size_t pos = node->surface - lattice->sentence() -
207         node->rlength + node->length;
208     std::cout.write(node->surface, node->length);
209     std::cout << "\t" << node->feature << std::endl;
210     for (const Node *anode = begin_node_list[pos];
211          anode; anode = anode->bnext) {
212       if (anode->rlength == node->rlength &&
213           anode->length == node->length) {
214         std::cout << "@ ";
215         std::cout.write(anode->surface, anode->length);
216         std::cout << "\t" << anode->feature << std::endl;
217       }
218     }
219   }
220 
221   std::cout << "EOS" << std::endl;
222 
223   return true;
224 }
225 
226 // static
buildBestLattice(Lattice * lattice)227 bool Viterbi::buildBestLattice(Lattice *lattice) {
228   Node *node = lattice->eos_node();
229   for (Node *prev_node; node->prev;) {
230     node->isbest = 1;
231     prev_node = node->prev;
232     prev_node->next = node;
233     node = prev_node;
234   }
235 
236   return true;
237 }
238 
239 // static
initNBest(Lattice * lattice)240 bool Viterbi::initNBest(Lattice *lattice) {
241   if (!lattice->has_request_type(MECAB_NBEST)) {
242     return true;
243   }
244   lattice->allocator()->nbest_generator()->set(lattice);
245   return true;
246 }
247 
248 // static
initPartial(Lattice * lattice)249 bool Viterbi::initPartial(Lattice *lattice) {
250   if (!lattice->has_request_type(MECAB_PARTIAL)) {
251     if (lattice->has_constraint()) {
252       lattice->set_boundary_constraint(0, MECAB_TOKEN_BOUNDARY);
253       lattice->set_boundary_constraint(lattice->size(),
254                                        MECAB_TOKEN_BOUNDARY);
255     }
256     return true;
257   }
258 
259   Allocator<Node, Path> *allocator = lattice->allocator();
260   char *str = allocator->partial_buffer(lattice->size() + 1);
261   strncpy(str, lattice->sentence(), lattice->size() + 1);
262 
263   std::vector<char *> lines;
264   const size_t lsize = tokenize(str, "\n",
265                                 std::back_inserter(lines),
266                                 lattice->size() + 1);
267   char* column[2];
268   scoped_array<char> buf(new char[lattice->size() + 1]);
269   StringBuffer os(buf.get(), lattice->size() + 1);
270 
271   std::vector<std::pair<char *, char *> > tokens;
272   tokens.reserve(lsize);
273 
274   size_t pos = 0;
275   for (size_t i = 0; i < lsize; ++i) {
276     const size_t size = tokenize(lines[i], "\t", column, 2);
277     if (size == 1 && std::strcmp(column[0], "EOS") == 0) {
278       break;
279     }
280     const size_t len = std::strlen(column[0]);
281     if (size == 2) {
282       tokens.push_back(std::make_pair(column[0], column[1]));
283     } else {
284       tokens.push_back(std::make_pair(column[0], reinterpret_cast<char *>(0)));
285     }
286     os << column[0];
287     pos += len;
288   }
289 
290   os << '\0';
291 
292   lattice->set_sentence(os.str());
293 
294   pos = 0;
295   for (size_t i = 0; i < tokens.size(); ++i) {
296     const char *surface = tokens[i].first;
297     const char *feature = tokens[i].second;
298     const size_t len = std::strlen(surface);
299     lattice->set_boundary_constraint(pos, MECAB_TOKEN_BOUNDARY);
300     lattice->set_boundary_constraint(pos + len, MECAB_TOKEN_BOUNDARY);
301     if (feature) {
302       lattice->set_feature_constraint(pos, pos + len, feature);
303       for (size_t n = 1; n < len; ++n) {
304         lattice->set_boundary_constraint(pos + n,
305                                          MECAB_INSIDE_TOKEN);
306       }
307     }
308     pos += len;
309   }
310 
311   return true;
312 }
313 
314 namespace {
connect(size_t pos,Node * rnode,Node ** begin_node_list,Node ** end_node_list,const Connector * connector,Allocator<Node,Path> * allocator)315 template <bool IsAllPath> bool connect(size_t pos, Node *rnode,
316                                        Node **begin_node_list,
317                                        Node **end_node_list,
318                                        const Connector *connector,
319                                        Allocator<Node, Path> *allocator) {
320   for (;rnode; rnode = rnode->bnext) {
321     register long best_cost = 2147483647;
322     Node* best_node = 0;
323     for (Node *lnode = end_node_list[pos]; lnode; lnode = lnode->enext) {
324       register int lcost = connector->cost(lnode, rnode);  // local cost
325       register long cost = lnode->cost + lcost;
326 
327       if (cost < best_cost) {
328         best_node  = lnode;
329         best_cost  = cost;
330       }
331 
332       if (IsAllPath) {
333         Path *path   = allocator->newPath();
334         path->cost   = lcost;
335         path->rnode  = rnode;
336         path->lnode  = lnode;
337         path->lnext  = rnode->lpath;
338         rnode->lpath = path;
339         path->rnext  = lnode->rpath;
340         lnode->rpath = path;
341       }
342     }
343 
344     // overflow check 2003/03/09
345     if (!best_node) {
346       return false;
347     }
348 
349     rnode->prev = best_node;
350     rnode->next = 0;
351     rnode->cost = best_cost;
352     const size_t x = rnode->rlength + pos;
353     rnode->enext = end_node_list[x];
354     end_node_list[x] = rnode;
355   }
356 
357   return true;
358 }
359 }  // namespace
360 
361 template <bool IsAllPath, bool IsPartial>
viterbi(Lattice * lattice) const362 bool Viterbi::viterbi(Lattice *lattice) const {
363   Node **end_node_list   = lattice->end_nodes();
364   Node **begin_node_list = lattice->begin_nodes();
365   Allocator<Node, Path> *allocator = lattice->allocator();
366   const size_t len = lattice->size();
367   const char *begin = lattice->sentence();
368   const char *end = begin + len;
369 
370   Node *bos_node = tokenizer_->getBOSNode(lattice->allocator());
371   bos_node->surface = lattice->sentence();
372   end_node_list[0] = bos_node;
373 
374   for (size_t pos = 0; pos < len; ++pos) {
375     if (end_node_list[pos]) {
376       Node *right_node = tokenizer_->lookup<IsPartial>(begin + pos, end,
377                                                        allocator, lattice);
378       begin_node_list[pos] = right_node;
379       if (!connect<IsAllPath>(pos, right_node,
380                               begin_node_list,
381                               end_node_list,
382                               connector_.get(),
383                               allocator)) {
384         lattice->set_what("too long sentence.");
385         return false;
386       }
387     }
388   }
389 
390   Node *eos_node = tokenizer_->getEOSNode(lattice->allocator());
391   eos_node->surface = lattice->sentence() + lattice->size();
392   begin_node_list[lattice->size()] = eos_node;
393 
394   for (long pos = len; static_cast<long>(pos) >= 0; --pos) {
395     if (end_node_list[pos]) {
396       if (!connect<IsAllPath>(pos, eos_node,
397                               begin_node_list,
398                               end_node_list,
399                               connector_.get(),
400                               allocator)) {
401         lattice->set_what("too long sentence.");
402         return false;
403       }
404       break;
405     }
406   }
407 
408   end_node_list[0] = bos_node;
409   begin_node_list[lattice->size()] = eos_node;
410 
411   return true;
412 }
413 }  // Mecab
414