1 // MeCab -- Yet Another Part-of-Speech and Morphological Analyzer
2 //
3 //
4 // Copyright(C) 2001-2011 Taku Kudo <taku@chasen.org>
5 // Copyright(C) 2004-2006 Nippon Telegraph and Telephone Corporation
6 #include <algorithm>
7 #include <iterator>
8 #include <cmath>
9 #include <cstring>
10 #include "common.h"
11 #include "connector.h"
12 #include "mecab.h"
13 #include "nbest_generator.h"
14 #include "param.h"
15 #include "viterbi.h"
16 #include "scoped_ptr.h"
17 #include "string_buffer.h"
18 #include "tokenizer.h"
19
20 namespace MeCab {
21
22 namespace {
calc_alpha(Node * n,double beta)23 void calc_alpha(Node *n, double beta) {
24 n->alpha = 0.0;
25 for (Path *path = n->lpath; path; path = path->lnext) {
26 n->alpha = logsumexp(n->alpha,
27 -beta * path->cost + path->lnode->alpha,
28 path == n->lpath);
29 }
30 }
31
calc_beta(Node * n,double beta)32 void calc_beta(Node *n, double beta) {
33 n->beta = 0.0;
34 for (Path *path = n->rpath; path; path = path->rnext) {
35 n->beta = logsumexp(n->beta,
36 -beta * path->cost + path->rnode->beta,
37 path == n->rpath);
38 }
39 }
40 } // namespace
41
Viterbi()42 Viterbi::Viterbi()
43 : tokenizer_(0), connector_(0),
44 cost_factor_(0) {}
45
~Viterbi()46 Viterbi::~Viterbi() {}
47
open(const Param & param)48 bool Viterbi::open(const Param ¶m) {
49 tokenizer_.reset(new Tokenizer<Node, Path>);
50 CHECK_FALSE(tokenizer_->open(param)) << tokenizer_->what();
51 CHECK_FALSE(tokenizer_->dictionary_info()) << "Dictionary is empty";
52
53 connector_.reset(new Connector);
54 CHECK_FALSE(connector_->open(param)) << connector_->what();
55
56 CHECK_FALSE(tokenizer_->dictionary_info()->lsize ==
57 connector_->left_size() &&
58 tokenizer_->dictionary_info()->rsize ==
59 connector_->right_size())
60 << "Transition table and dictionary are not compatible";
61
62 cost_factor_ = param.get<int>("cost-factor");
63 if (cost_factor_ == 0) {
64 cost_factor_ = 800;
65 }
66
67 return true;
68 }
69
analyze(Lattice * lattice) const70 bool Viterbi::analyze(Lattice *lattice) const {
71 if (!lattice || !lattice->sentence()) {
72 return false;
73 }
74
75 if (!initPartial(lattice)) {
76 return false;
77 }
78
79 bool result = false;
80 if (lattice->has_request_type(MECAB_NBEST) ||
81 lattice->has_request_type(MECAB_MARGINAL_PROB)) {
82 // IsAllPath=true
83 if (lattice->has_constraint()) {
84 result = viterbi<true, true>(lattice);
85 } else {
86 result = viterbi<true, false>(lattice);
87 }
88 } else {
89 // IsAllPath=false
90 if (lattice->has_constraint()) {
91 result = viterbi<false, true>(lattice);
92 } else {
93 result = viterbi<false, false>(lattice);
94 }
95 }
96
97 if (!result) {
98 return false;
99 }
100
101 if (!forwardbackward(lattice)) {
102 return false;
103 }
104
105 if (!buildBestLattice(lattice)) {
106 return false;
107 }
108
109 if (!buildAllLattice(lattice)) {
110 return false;
111 }
112
113 if (!initNBest(lattice)) {
114 return false;
115 }
116
117 return true;
118 }
119
tokenizer() const120 const Tokenizer<Node, Path> *Viterbi::tokenizer() const {
121 return tokenizer_.get();
122 }
123
connector() const124 const Connector *Viterbi::connector() const {
125 return connector_.get();
126 }
127
128 // static
forwardbackward(Lattice * lattice)129 bool Viterbi::forwardbackward(Lattice *lattice) {
130 if (!lattice->has_request_type(MECAB_MARGINAL_PROB)) {
131 return true;
132 }
133
134 Node **end_node_list = lattice->end_nodes();
135 Node **begin_node_list = lattice->begin_nodes();
136
137 const size_t len = lattice->size();
138 const double theta = lattice->theta();
139
140 end_node_list[0]->alpha = 0.0;
141 for (int pos = 0; pos <= static_cast<long>(len); ++pos) {
142 for (Node *node = begin_node_list[pos]; node; node = node->bnext) {
143 calc_alpha(node, theta);
144 }
145 }
146
147 begin_node_list[len]->beta = 0.0;
148 for (int pos = static_cast<long>(len); pos >= 0; --pos) {
149 for (Node *node = end_node_list[pos]; node; node = node->enext) {
150 calc_beta(node, theta);
151 }
152 }
153
154 const double Z = begin_node_list[len]->alpha;
155 lattice->set_Z(Z); // alpha of EOS
156
157 for (int pos = 0; pos <= static_cast<long>(len); ++pos) {
158 for (Node *node = begin_node_list[pos]; node; node = node->bnext) {
159 node->prob = std::exp(node->alpha + node->beta - Z);
160 for (Path *path = node->lpath; path; path = path->lnext) {
161 path->prob = std::exp(path->lnode->alpha
162 - theta * path->cost
163 + path->rnode->beta - Z);
164 }
165 }
166 }
167
168 return true;
169 }
170
171 // static
buildResultForNBest(Lattice * lattice)172 bool Viterbi::buildResultForNBest(Lattice *lattice) {
173 return buildAllLattice(lattice);
174 }
175
176 // static
buildAllLattice(Lattice * lattice)177 bool Viterbi::buildAllLattice(Lattice *lattice) {
178 if (!lattice->has_request_type(MECAB_ALL_MORPHS)) {
179 return true;
180 }
181
182 Node *prev = lattice->bos_node();
183 const size_t len = lattice->size();
184 Node **begin_node_list = lattice->begin_nodes();
185
186 for (long pos = 0; pos <= static_cast<long>(len); ++pos) {
187 for (Node *node = begin_node_list[pos]; node; node = node->bnext) {
188 prev->next = node;
189 node->prev = prev;
190 prev = node;
191 }
192 }
193
194 return true;
195 }
196
197 // static
buildAlternative(Lattice * lattice)198 bool Viterbi::buildAlternative(Lattice *lattice) {
199 Node **begin_node_list = lattice->begin_nodes();
200
201 const Node *bos_node = lattice->bos_node();
202 for (const Node *node = bos_node; node; node = node->next) {
203 if (node->stat == MECAB_BOS_NODE || node->stat == MECAB_EOS_NODE) {
204 continue;
205 }
206 const size_t pos = node->surface - lattice->sentence() -
207 node->rlength + node->length;
208 std::cout.write(node->surface, node->length);
209 std::cout << "\t" << node->feature << std::endl;
210 for (const Node *anode = begin_node_list[pos];
211 anode; anode = anode->bnext) {
212 if (anode->rlength == node->rlength &&
213 anode->length == node->length) {
214 std::cout << "@ ";
215 std::cout.write(anode->surface, anode->length);
216 std::cout << "\t" << anode->feature << std::endl;
217 }
218 }
219 }
220
221 std::cout << "EOS" << std::endl;
222
223 return true;
224 }
225
226 // static
buildBestLattice(Lattice * lattice)227 bool Viterbi::buildBestLattice(Lattice *lattice) {
228 Node *node = lattice->eos_node();
229 for (Node *prev_node; node->prev;) {
230 node->isbest = 1;
231 prev_node = node->prev;
232 prev_node->next = node;
233 node = prev_node;
234 }
235
236 return true;
237 }
238
239 // static
initNBest(Lattice * lattice)240 bool Viterbi::initNBest(Lattice *lattice) {
241 if (!lattice->has_request_type(MECAB_NBEST)) {
242 return true;
243 }
244 lattice->allocator()->nbest_generator()->set(lattice);
245 return true;
246 }
247
248 // static
initPartial(Lattice * lattice)249 bool Viterbi::initPartial(Lattice *lattice) {
250 if (!lattice->has_request_type(MECAB_PARTIAL)) {
251 if (lattice->has_constraint()) {
252 lattice->set_boundary_constraint(0, MECAB_TOKEN_BOUNDARY);
253 lattice->set_boundary_constraint(lattice->size(),
254 MECAB_TOKEN_BOUNDARY);
255 }
256 return true;
257 }
258
259 Allocator<Node, Path> *allocator = lattice->allocator();
260 char *str = allocator->partial_buffer(lattice->size() + 1);
261 strncpy(str, lattice->sentence(), lattice->size() + 1);
262
263 std::vector<char *> lines;
264 const size_t lsize = tokenize(str, "\n",
265 std::back_inserter(lines),
266 lattice->size() + 1);
267 char* column[2];
268 scoped_array<char> buf(new char[lattice->size() + 1]);
269 StringBuffer os(buf.get(), lattice->size() + 1);
270
271 std::vector<std::pair<char *, char *> > tokens;
272 tokens.reserve(lsize);
273
274 size_t pos = 0;
275 for (size_t i = 0; i < lsize; ++i) {
276 const size_t size = tokenize(lines[i], "\t", column, 2);
277 if (size == 1 && std::strcmp(column[0], "EOS") == 0) {
278 break;
279 }
280 const size_t len = std::strlen(column[0]);
281 if (size == 2) {
282 tokens.push_back(std::make_pair(column[0], column[1]));
283 } else {
284 tokens.push_back(std::make_pair(column[0], reinterpret_cast<char *>(0)));
285 }
286 os << column[0];
287 pos += len;
288 }
289
290 os << '\0';
291
292 lattice->set_sentence(os.str());
293
294 pos = 0;
295 for (size_t i = 0; i < tokens.size(); ++i) {
296 const char *surface = tokens[i].first;
297 const char *feature = tokens[i].second;
298 const size_t len = std::strlen(surface);
299 lattice->set_boundary_constraint(pos, MECAB_TOKEN_BOUNDARY);
300 lattice->set_boundary_constraint(pos + len, MECAB_TOKEN_BOUNDARY);
301 if (feature) {
302 lattice->set_feature_constraint(pos, pos + len, feature);
303 for (size_t n = 1; n < len; ++n) {
304 lattice->set_boundary_constraint(pos + n,
305 MECAB_INSIDE_TOKEN);
306 }
307 }
308 pos += len;
309 }
310
311 return true;
312 }
313
314 namespace {
connect(size_t pos,Node * rnode,Node ** begin_node_list,Node ** end_node_list,const Connector * connector,Allocator<Node,Path> * allocator)315 template <bool IsAllPath> bool connect(size_t pos, Node *rnode,
316 Node **begin_node_list,
317 Node **end_node_list,
318 const Connector *connector,
319 Allocator<Node, Path> *allocator) {
320 for (;rnode; rnode = rnode->bnext) {
321 register long best_cost = 2147483647;
322 Node* best_node = 0;
323 for (Node *lnode = end_node_list[pos]; lnode; lnode = lnode->enext) {
324 register int lcost = connector->cost(lnode, rnode); // local cost
325 register long cost = lnode->cost + lcost;
326
327 if (cost < best_cost) {
328 best_node = lnode;
329 best_cost = cost;
330 }
331
332 if (IsAllPath) {
333 Path *path = allocator->newPath();
334 path->cost = lcost;
335 path->rnode = rnode;
336 path->lnode = lnode;
337 path->lnext = rnode->lpath;
338 rnode->lpath = path;
339 path->rnext = lnode->rpath;
340 lnode->rpath = path;
341 }
342 }
343
344 // overflow check 2003/03/09
345 if (!best_node) {
346 return false;
347 }
348
349 rnode->prev = best_node;
350 rnode->next = 0;
351 rnode->cost = best_cost;
352 const size_t x = rnode->rlength + pos;
353 rnode->enext = end_node_list[x];
354 end_node_list[x] = rnode;
355 }
356
357 return true;
358 }
359 } // namespace
360
361 template <bool IsAllPath, bool IsPartial>
viterbi(Lattice * lattice) const362 bool Viterbi::viterbi(Lattice *lattice) const {
363 Node **end_node_list = lattice->end_nodes();
364 Node **begin_node_list = lattice->begin_nodes();
365 Allocator<Node, Path> *allocator = lattice->allocator();
366 const size_t len = lattice->size();
367 const char *begin = lattice->sentence();
368 const char *end = begin + len;
369
370 Node *bos_node = tokenizer_->getBOSNode(lattice->allocator());
371 bos_node->surface = lattice->sentence();
372 end_node_list[0] = bos_node;
373
374 for (size_t pos = 0; pos < len; ++pos) {
375 if (end_node_list[pos]) {
376 Node *right_node = tokenizer_->lookup<IsPartial>(begin + pos, end,
377 allocator, lattice);
378 begin_node_list[pos] = right_node;
379 if (!connect<IsAllPath>(pos, right_node,
380 begin_node_list,
381 end_node_list,
382 connector_.get(),
383 allocator)) {
384 lattice->set_what("too long sentence.");
385 return false;
386 }
387 }
388 }
389
390 Node *eos_node = tokenizer_->getEOSNode(lattice->allocator());
391 eos_node->surface = lattice->sentence() + lattice->size();
392 begin_node_list[lattice->size()] = eos_node;
393
394 for (long pos = len; static_cast<long>(pos) >= 0; --pos) {
395 if (end_node_list[pos]) {
396 if (!connect<IsAllPath>(pos, eos_node,
397 begin_node_list,
398 end_node_list,
399 connector_.get(),
400 allocator)) {
401 lattice->set_what("too long sentence.");
402 return false;
403 }
404 break;
405 }
406 }
407
408 end_node_list[0] = bos_node;
409 begin_node_list[lattice->size()] = eos_node;
410
411 return true;
412 }
413 } // Mecab
414