1 // MeCab -- Yet Another Part-of-Speech and Morphological Analyzer
2 //
3 //
4 // Copyright(C) 2001-2006 Taku Kudo <taku@chasen.org>
5 // Copyright(C) 2004-2006 Nippon Telegraph and Telephone Corporation
6 #include <cstring>
7 #include <iostream>
8 #include <iterator>
9 #include "common.h"
10 #include "connector.h"
11 #include "mecab.h"
12 #include "nbest_generator.h"
13 #include "param.h"
14 #include "scoped_ptr.h"
15 #include "stream_wrapper.h"
16 #include "string_buffer.h"
17 #include "thread.h"
18 #include "tokenizer.h"
19 #include "viterbi.h"
20 #include "writer.h"
21
22 #ifdef HAVE_CONFIG_H
23 #include "config.h"
24 #endif
25
26 const char *getGlobalError();
27 void setGlobalError(const char *str);
28
29 namespace MeCab {
30 namespace {
31
// Default temperature parameter (theta) used by TaggerImpl and LatticeImpl
// when no other value has been supplied.
const float kDefaultTheta = 0.75f;
33
34 const MeCab::Option long_options[] = {
35 { "rcfile", 'r', 0, "FILE", "use FILE as resource file" },
36 { "dicdir", 'd', 0, "DIR", "set DIR as a system dicdir" },
37 { "userdic", 'u', 0, "FILE", "use FILE as a user dictionary" },
38 { "lattice-level", 'l', "0", "INT",
39 "lattice information level (DEPRECATED)" },
40 { "dictionary-info", 'D', 0, 0, "show dictionary information and exit" },
41 { "output-format-type", 'O', 0, "TYPE",
42 "set output format type (wakati,none,...)" },
43 { "all-morphs", 'a', 0, 0, "output all morphs(default false)" },
44 { "nbest", 'N', "1",
45 "INT", "output N best results (default 1)" },
46 { "partial", 'p', 0, 0,
47 "partial parsing mode (default false)" },
48 { "marginal", 'm', 0, 0,
49 "output marginal probability (default false)" },
50 { "max-grouping-size", 'M', "24",
51 "INT", "maximum grouping size for unknown words (default 24)" },
52 { "node-format", 'F', "%m\\t%H\\n", "STR",
53 "use STR as the user-defined node format" },
54 { "unk-format", 'U', "%m\\t%H\\n", "STR",
55 "use STR as the user-defined unknown node format" },
56 { "bos-format", 'B', "", "STR",
57 "use STR as the user-defined beginning-of-sentence format" },
58 { "eos-format", 'E', "EOS\\n", "STR",
59 "use STR as the user-defined end-of-sentence format" },
60 { "eon-format", 'S', "", "STR",
61 "use STR as the user-defined end-of-NBest format" },
62 { "unk-feature", 'x', 0, "STR",
63 "use STR as the feature for unknown word" },
64 { "input-buffer-size", 'b', 0, "INT",
65 "set input buffer size (default 8192)" },
66 { "dump-config", 'P', 0, 0, "dump MeCab parameters" },
67 { "allocate-sentence", 'C', 0, 0,
68 "allocate new memory for input sentence" },
69 { "theta", 't', "0.75", "FLOAT",
70 "set temparature parameter theta (default 0.75)" },
71 { "cost-factor", 'c', "700", "INT",
72 "set cost factor (default 700)" },
73 { "output", 'o', 0, "FILE", "set the output file name" },
74 { "version", 'v', 0, 0, "show the version and exit." },
75 { "help", 'h', 0, 0, "show this help and exit." },
76 { 0, 0, 0, 0 }
77 };
78
79 class ModelImpl: public Model {
80 public:
81 ModelImpl();
82 virtual ~ModelImpl();
83
84 bool open(int argc, char **argv);
85 bool open(const char *arg);
86 bool open(const Param ¶m);
87
88 bool swap(Model *model);
89
is_available() const90 bool is_available() const {
91 return (viterbi_ && writer_.get());
92 }
93
request_type() const94 int request_type() const {
95 return request_type_;
96 }
97
theta() const98 double theta() const {
99 return theta_;
100 }
101
dictionary_info() const102 const DictionaryInfo *dictionary_info() const {
103 return viterbi_->tokenizer() ?
104 viterbi_->tokenizer()->dictionary_info() : 0;
105 }
106
transition_cost(unsigned short rcAttr,unsigned short lcAttr) const107 int transition_cost(unsigned short rcAttr,
108 unsigned short lcAttr) const {
109 return viterbi_->connector()->transition_cost(rcAttr, lcAttr);
110 }
111
lookup(const char * begin,const char * end,Lattice * lattice) const112 Node *lookup(const char *begin, const char *end,
113 Lattice *lattice) const {
114 return viterbi_->tokenizer()->lookup<false>(
115 begin, end,
116 lattice->allocator(), lattice);
117 }
118
119 Tagger *createTagger() const;
120
121 Lattice *createLattice() const;
122
viterbi() const123 const Viterbi *viterbi() const {
124 return viterbi_;
125 }
126
127 // moves the owership.
take_viterbi()128 Viterbi *take_viterbi() {
129 Viterbi *result = viterbi_;
130 viterbi_ = 0;
131 return result;
132 }
133
writer() const134 const Writer *writer() const {
135 return writer_.get();
136 }
137
138 #ifdef HAVE_ATOMIC_OPS
mutex() const139 read_write_mutex *mutex() const {
140 return &mutex_;
141 }
142 #endif
143
144 private:
145 Viterbi *viterbi_;
146 scoped_ptr<Writer> writer_;
147 int request_type_;
148 double theta_;
149
150 #ifdef HAVE_ATOMIC_OPS
151 mutable read_write_mutex mutex_;
152 #endif
153 };
154
155 class TaggerImpl: public Tagger {
156 public:
157 bool open(int argc, char **argv);
158 bool open(const char *arg);
159 bool open(const ModelImpl &model);
160
161 bool parse(Lattice *lattice) const;
162
163 void set_request_type(int request_type);
164 int request_type() const;
165
166 const char* parse(const char*);
167 const char* parse(const char*, size_t);
168 const char* parse(const char*, size_t, char*, size_t);
169 const Node* parseToNode(const char*);
170 const Node* parseToNode(const char*, size_t = 0);
171 const char* parseNBest(size_t, const char*);
172 const char* parseNBest(size_t, const char*, size_t);
173 const char* parseNBest(size_t, const char*,
174 size_t, char *, size_t);
175 bool parseNBestInit(const char*);
176 bool parseNBestInit(const char*, size_t);
177 const Node* nextNode();
178 const char* next();
179 const char* next(char*, size_t);
180
181 const char *formatNode(const Node *);
182 const char *formatNode(const Node *, char *, size_t);
183
184 const DictionaryInfo *dictionary_info() const;
185
186 void set_partial(bool partial);
187 bool partial() const;
188 void set_theta(float theta);
189 float theta() const;
190 void set_lattice_level(int level);
191 int lattice_level() const;
192 void set_all_morphs(bool all_morphs);
193 bool all_morphs() const;
194
195 const char* what() const;
196
197 TaggerImpl();
198 virtual ~TaggerImpl();
199
200 private:
model() const201 const ModelImpl *model() const { return current_model_; }
202
set_what(const char * str)203 void set_what(const char *str) {
204 what_.assign(str);
205 }
206
initRequestType()207 void initRequestType() {
208 mutable_lattice()->set_request_type(request_type_);
209 mutable_lattice()->set_theta(theta_);
210 }
211
mutable_lattice()212 Lattice *mutable_lattice() {
213 if (!lattice_.get()) {
214 lattice_.reset(model()->createLattice());
215 }
216 return lattice_.get();
217 }
218
219 const ModelImpl *current_model_;
220 scoped_ptr<ModelImpl> model_;
221 scoped_ptr<Lattice> lattice_;
222 int request_type_;
223 double theta_;
224 std::string what_;
225 };
226
227 class LatticeImpl : public Lattice {
228 public:
229 explicit LatticeImpl(const Writer *writer = 0);
230 ~LatticeImpl();
231
232 // clear internal lattice
233 void clear();
234
is_available() const235 bool is_available() const {
236 return (sentence_ &&
237 !begin_nodes_.empty() &&
238 !end_nodes_.empty());
239 }
240
241 // nbest;
242 bool next();
243
244 // return bos/eos node
bos_node() const245 Node *bos_node() const { return end_nodes_[0]; }
eos_node() const246 Node *eos_node() const { return begin_nodes_[size()]; }
begin_nodes() const247 Node **begin_nodes() const { return const_cast<Node **>(&begin_nodes_[0]); }
end_nodes() const248 Node **end_nodes() const { return const_cast<Node **>(&end_nodes_[0]); }
begin_nodes(size_t pos) const249 Node *begin_nodes(size_t pos) const { return begin_nodes_[pos]; }
end_nodes(size_t pos) const250 Node *end_nodes(size_t pos) const { return end_nodes_[pos]; }
251
sentence() const252 const char *sentence() const { return sentence_; }
253 void set_sentence(const char *sentence);
254 void set_sentence(const char *sentence, size_t len);
size() const255 size_t size() const { return size_; }
256
set_Z(double Z)257 void set_Z(double Z) { Z_ = Z; }
Z() const258 double Z() const { return Z_; }
259
theta() const260 float theta() const { return theta_; }
set_theta(float theta)261 void set_theta(float theta) { theta_ = theta; }
262
request_type() const263 int request_type() const { return request_type_; }
264
set_request_type(int request_type)265 void set_request_type(int request_type) {
266 request_type_ = request_type;
267 }
has_request_type(int request_type) const268 bool has_request_type(int request_type) const {
269 return request_type & request_type_;
270 }
add_request_type(int request_type)271 void add_request_type(int request_type) {
272 request_type_ |= request_type;
273 }
remove_request_type(int request_type)274 void remove_request_type(int request_type) {
275 request_type_ &= ~request_type;
276 }
277
allocator() const278 Allocator<Node, Path> *allocator() const {
279 return allocator_.get();
280 }
281
newNode()282 Node *newNode() {
283 return allocator_->newNode();
284 }
285
286 bool has_constraint() const;
287 int boundary_constraint(size_t pos) const;
288 const char *feature_constraint(size_t begin_pos) const;
289
290 void set_boundary_constraint(size_t pos,
291 int boundary_constraint_type);
292
293 void set_feature_constraint(size_t begin_pos, size_t end_pos,
294 const char *feature);
295
296 void set_result(const char *result);
297
what() const298 const char *what() const { return what_.c_str(); }
299
set_what(const char * str)300 void set_what(const char *str) {
301 what_.assign(str);
302 }
303
304 const char *toString();
305 const char *toString(char *buf, size_t size);
306 const char *toString(const Node *node);
307 const char *toString(const Node *node,
308 char *buf, size_t size);
309 const char *enumNBestAsString(size_t N);
310 const char *enumNBestAsString(size_t N, char *buf, size_t size);
311
312 private:
313 const char *sentence_;
314 size_t size_;
315 double theta_;
316 double Z_;
317 int request_type_;
318 std::string what_;
319 std::vector<Node *> end_nodes_;
320 std::vector<Node *> begin_nodes_;
321 std::vector<const char *> feature_constraint_;
322 std::vector<unsigned char> boundary_constraint_;
323 const Writer *writer_;
324 scoped_ptr<StringBuffer> ostrs_;
325 scoped_ptr<Allocator<Node, Path> > allocator_;
326
stream()327 StringBuffer *stream() {
328 if (!ostrs_.get()) {
329 ostrs_.reset(new StringBuffer);
330 }
331 return ostrs_.get();
332 }
333
334 const char *toStringInternal(StringBuffer *os);
335 const char *toStringInternal(const Node *node, StringBuffer *os);
336 const char *enumNBestAsStringInternal(size_t N, StringBuffer *os);
337 };
338
ModelImpl()339 ModelImpl::ModelImpl()
340 : viterbi_(new Viterbi), writer_(new Writer),
341 request_type_(MECAB_ONE_BEST), theta_(0.0) {}
342
~ModelImpl()343 ModelImpl::~ModelImpl() {
344 delete viterbi_;
345 viterbi_ = 0;
346 }
347
open(int argc,char ** argv)348 bool ModelImpl::open(int argc, char **argv) {
349 Param param;
350 if (!param.open(argc, argv, long_options) ||
351 !load_dictionary_resource(¶m)) {
352 setGlobalError(param.what());
353 return false;
354 }
355 return open(param);
356 }
357
open(const char * arg)358 bool ModelImpl::open(const char *arg) {
359 Param param;
360 if (!param.open(arg, long_options) ||
361 !load_dictionary_resource(¶m)) {
362 setGlobalError(param.what());
363 return false;
364 }
365 return open(param);
366 }
367
open(const Param & param)368 bool ModelImpl::open(const Param ¶m) {
369 if (!writer_->open(param) || !viterbi_->open(param)) {
370 std::string error = viterbi_->what();
371 if (!error.empty()) {
372 error.append(" ");
373 }
374 error.append(writer_->what());
375 setGlobalError(error.c_str());
376 return false;
377 }
378
379 request_type_ = load_request_type(param);
380 theta_ = param.get<double>("theta");
381
382 return is_available();
383 }
384
swap(Model * model)385 bool ModelImpl::swap(Model *model) {
386 scoped_ptr<Model> model_data(model);
387
388 if (!is_available()) {
389 setGlobalError("current model is not available");
390 return false;
391 }
392 #ifndef HAVE_ATOMIC_OPS
393 setGlobalError("atomic model replacement is not supported");
394 return false;
395 #else
396 ModelImpl *m = static_cast<ModelImpl *>(model_data.get());
397 if (!m) {
398 setGlobalError("Invalid model is passed");
399 return false;
400 }
401
402 if (!m->is_available()) {
403 setGlobalError("Passed model is not available");
404 return false;
405 }
406
407 Viterbi *current_viterbi = viterbi_;
408 {
409 scoped_writer_lock l(mutex());
410 viterbi_ = m->take_viterbi();
411 request_type_ = m->request_type();
412 theta_ = m->theta();
413 }
414
415 delete current_viterbi;
416
417 return true;
418 #endif
419 }
420
createTagger() const421 Tagger *ModelImpl::createTagger() const {
422 if (!is_available()) {
423 setGlobalError("Model is not available");
424 return 0;
425 }
426 TaggerImpl *tagger = new TaggerImpl;
427 if (!tagger->open(*this)) {
428 setGlobalError(tagger->what());
429 delete tagger;
430 return 0;
431 }
432 tagger->set_theta(theta_);
433 tagger->set_request_type(request_type_);
434 return tagger;
435 }
436
createLattice() const437 Lattice *ModelImpl::createLattice() const {
438 if (!is_available()) {
439 setGlobalError("Model is not available");
440 return 0;
441 }
442 return new LatticeImpl(writer_.get());
443 }
444
TaggerImpl()445 TaggerImpl::TaggerImpl()
446 : current_model_(0),
447 request_type_(MECAB_ONE_BEST), theta_(kDefaultTheta) {}
448
~TaggerImpl()449 TaggerImpl::~TaggerImpl() {}
450
what() const451 const char *TaggerImpl::what() const {
452 return what_.c_str();
453 }
454
open(int argc,char ** argv)455 bool TaggerImpl::open(int argc, char **argv) {
456 model_.reset(new ModelImpl);
457 if (!model_->open(argc, argv)) {
458 model_.reset(0);
459 return false;
460 }
461 current_model_ = model_.get();
462 request_type_ = model()->request_type();
463 theta_ = model()->theta();
464 return true;
465 }
466
open(const char * arg)467 bool TaggerImpl::open(const char *arg) {
468 model_.reset(new ModelImpl);
469 if (!model_->open(arg)) {
470 model_.reset(0);
471 return false;
472 }
473 current_model_ = model_.get();
474 request_type_ = model()->request_type();
475 theta_ = model()->theta();
476 return true;
477 }
478
open(const ModelImpl & model)479 bool TaggerImpl::open(const ModelImpl &model) {
480 if (!model.is_available()) {
481 return false;
482 }
483 model_.reset(0);
484 current_model_ = &model;
485 request_type_ = current_model_->request_type();
486 theta_ = current_model_->theta();
487 return true;
488 }
489
set_request_type(int request_type)490 void TaggerImpl::set_request_type(int request_type) {
491 request_type_ = request_type;
492 }
493
request_type() const494 int TaggerImpl::request_type() const {
495 return request_type_;
496 }
497
set_partial(bool partial)498 void TaggerImpl::set_partial(bool partial) {
499 if (partial) {
500 request_type_ |= MECAB_PARTIAL;
501 } else {
502 request_type_ &= ~MECAB_PARTIAL;
503 }
504 }
505
partial() const506 bool TaggerImpl::partial() const {
507 return request_type_ & MECAB_PARTIAL;
508 }
509
set_theta(float theta)510 void TaggerImpl::set_theta(float theta) {
511 theta_ = theta;
512 }
513
theta() const514 float TaggerImpl::theta() const {
515 return theta_;
516 }
517
set_lattice_level(int level)518 void TaggerImpl::set_lattice_level(int level) {
519 switch (level) {
520 case 0: request_type_ |= MECAB_ONE_BEST;
521 break;
522 case 1: request_type_ |= MECAB_NBEST;
523 break;
524 case 2: request_type_ |= MECAB_MARGINAL_PROB;
525 break;
526 default:
527 break;
528 }
529 }
530
lattice_level() const531 int TaggerImpl::lattice_level() const {
532 if (request_type_ & MECAB_MARGINAL_PROB) {
533 return 2;
534 } else if (request_type_ & MECAB_NBEST) {
535 return 1;
536 } else {
537 return 0;
538 }
539 }
540
set_all_morphs(bool all_morphs)541 void TaggerImpl::set_all_morphs(bool all_morphs) {
542 if (all_morphs) {
543 request_type_ |= MECAB_ALL_MORPHS;
544 } else {
545 request_type_ &= ~MECAB_ALL_MORPHS;
546 }
547 }
548
all_morphs() const549 bool TaggerImpl::all_morphs() const {
550 return request_type_ & MECAB_ALL_MORPHS;
551 }
552
parse(Lattice * lattice) const553 bool TaggerImpl::parse(Lattice *lattice) const {
554 #ifdef HAVE_ATOMIC_OPS
555 scoped_reader_lock l(model()->mutex());
556 #endif
557
558 return model()->viterbi()->analyze(lattice);
559 }
560
parse(const char * str)561 const char *TaggerImpl::parse(const char *str) {
562 return parse(str, std::strlen(str));
563 }
564
parse(const char * str,size_t len)565 const char *TaggerImpl::parse(const char *str, size_t len) {
566 Lattice *lattice = mutable_lattice();
567 lattice->set_sentence(str, len);
568 initRequestType();
569 if (!parse(lattice)) {
570 set_what(lattice->what());
571 return 0;
572 }
573 const char *result = lattice->toString();
574 if (!result) {
575 set_what(lattice->what());
576 return 0;
577 }
578 return result;
579 }
580
parse(const char * str,size_t len,char * out,size_t len2)581 const char *TaggerImpl::parse(const char *str, size_t len,
582 char *out, size_t len2) {
583 Lattice *lattice = mutable_lattice();
584 lattice->set_sentence(str, len);
585 initRequestType();
586 if (!parse(lattice)) {
587 set_what(lattice->what());
588 return 0;
589 }
590 const char *result = lattice->toString(out, len2);
591 if (!result) {
592 set_what(lattice->what());
593 return 0;
594 }
595 return result;
596 }
597
parseToNode(const char * str)598 const Node *TaggerImpl::parseToNode(const char *str) {
599 return parseToNode(str, std::strlen(str));
600 }
601
parseToNode(const char * str,size_t len)602 const Node *TaggerImpl::parseToNode(const char *str, size_t len) {
603 Lattice *lattice = mutable_lattice();
604 lattice->set_sentence(str, len);
605 initRequestType();
606 if (!parse(lattice)) {
607 set_what(lattice->what());
608 return 0;
609 }
610 return lattice->bos_node();
611 }
612
parseNBestInit(const char * str)613 bool TaggerImpl::parseNBestInit(const char *str) {
614 return parseNBestInit(str, std::strlen(str));
615 }
616
parseNBestInit(const char * str,size_t len)617 bool TaggerImpl::parseNBestInit(const char *str, size_t len) {
618 Lattice *lattice = mutable_lattice();
619 lattice->set_sentence(str, len);
620 initRequestType();
621 lattice->add_request_type(MECAB_NBEST);
622 if (!parse(lattice)) {
623 set_what(lattice->what());
624 return false;
625 }
626 return true;
627 }
628
nextNode()629 const Node* TaggerImpl::nextNode() {
630 Lattice *lattice = mutable_lattice();
631 if (!lattice->next()) {
632 lattice->set_what("no more results");
633 return 0;
634 }
635 return lattice->bos_node();
636 }
637
next()638 const char* TaggerImpl::next() {
639 Lattice *lattice = mutable_lattice();
640 if (!lattice->next()) {
641 lattice->set_what("no more results");
642 return 0;
643 }
644 const char *result = lattice->toString();
645 if (!result) {
646 set_what(lattice->what());
647 return 0;
648 }
649 return result;
650 }
651
next(char * out,size_t len2)652 const char* TaggerImpl::next(char *out, size_t len2) {
653 Lattice *lattice = mutable_lattice();
654 if (!lattice->next()) {
655 lattice->set_what("no more results");
656 return 0;
657 }
658 const char *result = lattice->toString(out, len2);
659 if (!result) {
660 set_what(lattice->what());
661 return 0;
662 }
663 return result;
664 }
665
parseNBest(size_t N,const char * str)666 const char* TaggerImpl::parseNBest(size_t N, const char* str) {
667 return parseNBest(N, str, std::strlen(str));
668 }
669
parseNBest(size_t N,const char * str,size_t len)670 const char* TaggerImpl::parseNBest(size_t N,
671 const char* str, size_t len) {
672 Lattice *lattice = mutable_lattice();
673 lattice->set_sentence(str, len);
674 initRequestType();
675 lattice->add_request_type(MECAB_NBEST);
676
677 if (!parse(lattice)) {
678 set_what(lattice->what());
679 return 0;
680 }
681
682 const char *result = lattice->enumNBestAsString(N);
683 if (!result) {
684 set_what(lattice->what());
685 return 0;
686 }
687 return result;
688 }
689
parseNBest(size_t N,const char * str,size_t len,char * out,size_t len2)690 const char* TaggerImpl::parseNBest(size_t N, const char* str, size_t len,
691 char *out, size_t len2) {
692 Lattice *lattice = mutable_lattice();
693 lattice->set_sentence(str, len);
694 initRequestType();
695 lattice->add_request_type(MECAB_NBEST);
696
697 if (!parse(lattice)) {
698 set_what(lattice->what());
699 return 0;
700 }
701
702 const char *result = lattice->enumNBestAsString(N, out, len2);
703 if (!result) {
704 set_what(lattice->what());
705 return 0;
706 }
707 return result;
708 }
709
formatNode(const Node * node)710 const char* TaggerImpl::formatNode(const Node* node) {
711 const char *result = mutable_lattice()->toString(node);
712 if (!result) {
713 set_what(mutable_lattice()->what());
714 return 0;
715 }
716 return result;
717 }
718
formatNode(const Node * node,char * out,size_t len)719 const char* TaggerImpl::formatNode(const Node* node,
720 char *out, size_t len) {
721 const char *result = mutable_lattice()->toString(node, out, len);
722 if (!result) {
723 set_what(mutable_lattice()->what());
724 return 0;
725 }
726 return result;
727 }
728
dictionary_info() const729 const DictionaryInfo *TaggerImpl::dictionary_info() const {
730 return model()->dictionary_info();
731 }
732
LatticeImpl(const Writer * writer)733 LatticeImpl::LatticeImpl(const Writer *writer)
734 : sentence_(0), size_(0), theta_(kDefaultTheta), Z_(0.0),
735 request_type_(MECAB_ONE_BEST),
736 writer_(writer),
737 ostrs_(0),
738 allocator_(new Allocator<Node, Path>) {
739 begin_nodes_.reserve(MIN_INPUT_BUFFER_SIZE);
740 end_nodes_.reserve(MIN_INPUT_BUFFER_SIZE);
741 }
742
~LatticeImpl()743 LatticeImpl::~LatticeImpl() {}
744
clear()745 void LatticeImpl::clear() {
746 allocator_->free();
747 if (ostrs_.get()) {
748 ostrs_->clear();
749 }
750 begin_nodes_.clear();
751 end_nodes_.clear();
752 feature_constraint_.clear();
753 boundary_constraint_.clear();
754 size_ = 0;
755 theta_ = kDefaultTheta;
756 Z_ = 0.0;
757 sentence_ = 0;
758 }
759
set_sentence(const char * sentence)760 void LatticeImpl::set_sentence(const char *sentence) {
761 return set_sentence(sentence, strlen(sentence));
762 }
763
set_sentence(const char * sentence,size_t len)764 void LatticeImpl::set_sentence(const char *sentence, size_t len) {
765 clear();
766 end_nodes_.resize(len + 4);
767 begin_nodes_.resize(len + 4);
768
769 if (has_request_type(MECAB_ALLOCATE_SENTENCE) ||
770 has_request_type(MECAB_PARTIAL)) {
771 char *new_sentence = allocator()->strdup(sentence, len);
772 sentence_ = new_sentence;
773 } else {
774 sentence_ = sentence;
775 }
776
777 size_ = len;
778 std::memset(&end_nodes_[0], 0,
779 sizeof(end_nodes_[0]) * (len + 4));
780 std::memset(&begin_nodes_[0], 0,
781 sizeof(begin_nodes_[0]) * (len + 4));
782 }
783
next()784 bool LatticeImpl::next() {
785 if (!has_request_type(MECAB_NBEST)) {
786 set_what("MECAB_NBEST request type is not set");
787 return false;
788 }
789
790 if (!allocator()->nbest_generator()->next()) {
791 return false;
792 }
793
794 Viterbi::buildResultForNBest(this);
795 return true;
796 }
797
set_result(const char * result)798 void LatticeImpl::set_result(const char *result) {
799 char *str = allocator()->strdup(result, std::strlen(result));
800 std::vector<char *> lines;
801 const size_t lsize = tokenize(str, "\n",
802 std::back_inserter(lines),
803 std::strlen(result));
804 CHECK_DIE(lsize == lines.size());
805
806 std::string sentence;
807 std::vector<std::string> surfaces, features;
808 for (size_t i = 0; i < lines.size(); ++i) {
809 if (::strcmp("EOS", lines[i]) == 0) {
810 break;
811 }
812 char *cols[2];
813 if (tokenize(lines[i], "\t", cols, 2) != 2) {
814 break;
815 }
816 sentence += cols[0];
817 surfaces.push_back(cols[0]);
818 features.push_back(cols[1]);
819 }
820
821 CHECK_DIE(features.size() == surfaces.size());
822
823 set_sentence(allocator()->strdup(sentence.c_str(), sentence.size()));
824
825 Node *bos_node = allocator()->newNode();
826 bos_node->surface = const_cast<const char *>(BOS_KEY); // dummy
827 bos_node->feature = "BOS/EOS";
828 bos_node->isbest = 1;
829 bos_node->stat = MECAB_BOS_NODE;
830
831 Node *eos_node = allocator()->newNode();
832 eos_node->surface = const_cast<const char *>(BOS_KEY); // dummy
833 eos_node->feature = "BOS/EOS";
834 eos_node->isbest = 1;
835 eos_node->stat = MECAB_EOS_NODE;
836
837 bos_node->surface = sentence_;
838 end_nodes_[0] = bos_node;
839
840 size_t offset = 0;
841 Node *prev = bos_node;
842 for (size_t i = 0; i < surfaces.size(); ++i) {
843 Node *node = allocator()->newNode();
844 node->prev = prev;
845 prev->next = node;
846 node->surface = sentence_ + offset;
847 node->length = surfaces[i].size();
848 node->rlength = surfaces[i].size();
849 node->isbest = 1;
850 node->stat = MECAB_NOR_NODE;
851 node->wcost = 0;
852 node->cost = 0;
853 node->feature = allocator()->strdup(features[i].c_str(),
854 features[i].size());
855 begin_nodes_[offset] = node;
856 end_nodes_[offset + node->length] = node;
857 offset += node->length;
858 prev = node;
859 }
860
861 prev->next = eos_node;
862 eos_node->prev = prev;
863 }
864
865 // default implementation of Lattice formatter.
866 namespace {
writeLattice(Lattice * lattice,StringBuffer * os)867 void writeLattice(Lattice *lattice, StringBuffer *os) {
868 for (const Node *node = lattice->bos_node()->next;
869 node->next; node = node->next) {
870 os->write(node->surface, node->length);
871 *os << '\t' << node->feature;
872 *os << '\n';
873 }
874 *os << "EOS\n";
875 }
876 } // namespace
877
toString()878 const char *LatticeImpl::toString() {
879 return toStringInternal(stream());
880 }
881
toString(char * buf,size_t size)882 const char *LatticeImpl::toString(char *buf, size_t size) {
883 StringBuffer os(buf, size);
884 return toStringInternal(&os);
885 }
886
toStringInternal(StringBuffer * os)887 const char *LatticeImpl::toStringInternal(StringBuffer *os) {
888 os->clear();
889 if (writer_) {
890 if (!writer_->write(this, os)) {
891 return 0;
892 }
893 } else {
894 writeLattice(this, os);
895 }
896 *os << '\0';
897 if (!os->str()) {
898 set_what("output buffer overflow");
899 return 0;
900 }
901 return os->str();
902 }
903
toString(const Node * node)904 const char *LatticeImpl::toString(const Node *node) {
905 return toStringInternal(node, stream());
906 }
907
toString(const Node * node,char * buf,size_t size)908 const char *LatticeImpl::toString(const Node *node,
909 char *buf, size_t size) {
910 StringBuffer os(buf, size);
911 return toStringInternal(node, &os);
912 }
913
toStringInternal(const Node * node,StringBuffer * os)914 const char *LatticeImpl::toStringInternal(const Node *node,
915 StringBuffer *os) {
916 os->clear();
917 if (!node) {
918 set_what("node is NULL");
919 return 0;
920 }
921 if (writer_) {
922 if (!writer_->writeNode(this, node, os)) {
923 return 0;
924 }
925 } else {
926 os->write(node->surface, node->length);
927 *os << '\t' << node->feature;
928 }
929 *os << '\0';
930 if (!os->str()) {
931 set_what("output buffer overflow");
932 return 0;
933 }
934 return os->str();
935 }
936
enumNBestAsString(size_t N)937 const char *LatticeImpl::enumNBestAsString(size_t N) {
938 return enumNBestAsStringInternal(N, stream());
939 }
940
enumNBestAsString(size_t N,char * buf,size_t size)941 const char *LatticeImpl::enumNBestAsString(size_t N, char *buf, size_t size) {
942 StringBuffer os(buf, size);
943 return enumNBestAsStringInternal(N, &os);
944 }
945
enumNBestAsStringInternal(size_t N,StringBuffer * os)946 const char *LatticeImpl::enumNBestAsStringInternal(size_t N,
947 StringBuffer *os) {
948 os->clear();
949
950 if (N == 0 || N > NBEST_MAX) {
951 set_what("nbest size must be 1 <= nbest <= 512");
952 return 0;
953 }
954
955 for (size_t i = 0; i < N; ++i) {
956 if (!next()) {
957 break;
958 }
959 if (writer_) {
960 if (!writer_->write(this, os)) {
961 return 0;
962 }
963 } else {
964 writeLattice(this, os);
965 }
966 }
967
968 // make a dummy node for EON
969 if (writer_) {
970 Node eon_node;
971 memset(&eon_node, 0, sizeof(eon_node));
972 eon_node.stat = MECAB_EON_NODE;
973 eon_node.next = 0;
974 eon_node.surface = this->sentence() + this->size();
975 if (!writer_->writeNode(this, &eon_node, os)) {
976 return 0;
977 }
978 }
979 *os << '\0';
980
981 if (!os->str()) {
982 set_what("output buffer overflow");
983 return 0;
984 }
985
986 return os->str();
987 }
988
has_constraint() const989 bool LatticeImpl::has_constraint() const {
990 return !boundary_constraint_.empty();
991 }
992
boundary_constraint(size_t pos) const993 int LatticeImpl::boundary_constraint(size_t pos) const {
994 if (!boundary_constraint_.empty()) {
995 return boundary_constraint_[pos];
996 }
997 return MECAB_ANY_BOUNDARY;
998 }
999
feature_constraint(size_t begin_pos) const1000 const char *LatticeImpl::feature_constraint(size_t begin_pos) const {
1001 if (!feature_constraint_.empty()) {
1002 return feature_constraint_[begin_pos];
1003 }
1004 return 0;
1005 }
1006
set_boundary_constraint(size_t pos,int boundary_constraint_type)1007 void LatticeImpl::set_boundary_constraint(size_t pos,
1008 int boundary_constraint_type) {
1009 if (boundary_constraint_.empty()) {
1010 boundary_constraint_.resize(size() + 4, MECAB_ANY_BOUNDARY);
1011 }
1012 boundary_constraint_[pos] = boundary_constraint_type;
1013 }
1014
set_feature_constraint(size_t begin_pos,size_t end_pos,const char * feature)1015 void LatticeImpl::set_feature_constraint(size_t begin_pos, size_t end_pos,
1016 const char *feature) {
1017 if (begin_pos >= end_pos || !feature) {
1018 return;
1019 }
1020
1021 if (feature_constraint_.empty()) {
1022 feature_constraint_.resize(size() + 4, 0);
1023 }
1024
1025 end_pos = std::min(end_pos, size());
1026
1027 set_boundary_constraint(begin_pos, MECAB_TOKEN_BOUNDARY);
1028 set_boundary_constraint(end_pos, MECAB_TOKEN_BOUNDARY);
1029 for (size_t i = begin_pos + 1; i < end_pos; ++i) {
1030 set_boundary_constraint(i, MECAB_INSIDE_TOKEN);
1031 }
1032
1033 feature_constraint_[begin_pos] = feature;
1034 }
1035 } // namespace
1036
create(int argc,char ** argv)1037 Tagger *Tagger::create(int argc, char **argv) {
1038 return createTagger(argc, argv);
1039 }
1040
create(const char * arg)1041 Tagger *Tagger::create(const char *arg) {
1042 return createTagger(arg);
1043 }
1044
version()1045 const char *Tagger::version() {
1046 return VERSION;
1047 }
1048
createTagger(int argc,char ** argv)1049 Tagger *createTagger(int argc, char **argv) {
1050 TaggerImpl *tagger = new TaggerImpl();
1051 if (!tagger->open(argc, argv)) {
1052 setGlobalError(tagger->what());
1053 delete tagger;
1054 return 0;
1055 }
1056 return tagger;
1057 }
1058
createTagger(const char * argv)1059 Tagger *createTagger(const char *argv) {
1060 TaggerImpl *tagger = new TaggerImpl();
1061 if (!tagger->open(argv)) {
1062 setGlobalError(tagger->what());
1063 delete tagger;
1064 return 0;
1065 }
1066 return tagger;
1067 }
1068
deleteTagger(Tagger * tagger)1069 void deleteTagger(Tagger *tagger) {
1070 delete tagger;
1071 }
1072
getTaggerError()1073 const char *getTaggerError() {
1074 return getLastError();
1075 }
1076
getLastError()1077 const char *getLastError() {
1078 return getGlobalError();
1079 }
1080
createModel(int argc,char ** argv)1081 Model *createModel(int argc, char **argv) {
1082 ModelImpl *model = new ModelImpl;
1083 if (!model->open(argc, argv)) {
1084 delete model;
1085 return 0;
1086 }
1087 return model;
1088 }
1089
createModel(const char * arg)1090 Model *createModel(const char *arg) {
1091 ModelImpl *model = new ModelImpl;
1092 if (!model->open(arg)) {
1093 delete model;
1094 return 0;
1095 }
1096 return model;
1097 }
1098
deleteModel(Model * model)1099 void deleteModel(Model *model) {
1100 delete model;
1101 }
1102
create(int argc,char ** argv)1103 Model *Model::create(int argc, char **argv) {
1104 return createModel(argc, argv);
1105 }
1106
create(const char * arg)1107 Model *Model::create(const char *arg) {
1108 return createModel(arg);
1109 }
1110
version()1111 const char *Model::version() {
1112 return VERSION;
1113 }
1114
parse(const Model & model,Lattice * lattice)1115 bool Tagger::parse(const Model &model, Lattice *lattice) {
1116 scoped_ptr<Tagger> tagger(model.createTagger());
1117 return tagger->parse(lattice);
1118 }
1119
create()1120 Lattice *Lattice::create() {
1121 return createLattice();
1122 }
1123
createLattice()1124 Lattice *createLattice() {
1125 return new LatticeImpl;
1126 }
1127
deleteLattice(Lattice * lattice)1128 void deleteLattice(Lattice *lattice) {
1129 delete lattice;
1130 }
1131 } // MeCab
1132
mecab_do(int argc,char ** argv)1133 int mecab_do(int argc, char **argv) {
1134 #define WHAT_ERROR(msg) do { \
1135 std::cout << msg << std::endl; \
1136 return EXIT_FAILURE; } \
1137 while (0);
1138
1139 MeCab::Param param;
1140 if (!param.open(argc, argv, MeCab::long_options)) {
1141 std::cout << param.what() << std::endl;
1142 return EXIT_FAILURE;
1143 }
1144
1145 if (param.get<bool>("help")) {
1146 std::cout << param.help() << std::endl;
1147 return EXIT_SUCCESS;
1148 }
1149
1150 if (param.get<bool>("version")) {
1151 std::cout << param.version() << std::endl;
1152 return EXIT_SUCCESS;
1153 }
1154
1155 if (!load_dictionary_resource(¶m)) {
1156 std::cout << param.what() << std::endl;
1157 return EXIT_SUCCESS;
1158 }
1159
1160 if (param.get<int>("lattice-level") >= 1) {
1161 std::cerr << "lattice-level is DEPERCATED. "
1162 << "use --marginal or --nbest." << std::endl;
1163 }
1164
1165 MeCab::scoped_ptr<MeCab::ModelImpl> model(new MeCab::ModelImpl);
1166 if (!model->open(param)) {
1167 std::cout << MeCab::getLastError() << std::endl;
1168 return EXIT_FAILURE;
1169 }
1170
1171 std::string ofilename = param.get<std::string>("output");
1172 if (ofilename.empty()) {
1173 ofilename = "-";
1174 }
1175
1176 const int nbest = param.get<int>("nbest");
1177 if (nbest <= 0 || nbest > NBEST_MAX) {
1178 WHAT_ERROR("invalid N value");
1179 }
1180
1181 MeCab::ostream_wrapper ofs(ofilename.c_str());
1182 if (!*ofs) {
1183 WHAT_ERROR("no such file or directory: " << ofilename);
1184 }
1185
1186 if (param.get<bool>("dump-config")) {
1187 param.dump_config(&*ofs);
1188 return EXIT_FAILURE;
1189 }
1190
1191 if (param.get<bool>("dictionary-info")) {
1192 for (const MeCab::DictionaryInfo *d = model->dictionary_info();
1193 d; d = d->next) {
1194 *ofs << "filename:\t" << d->filename << std::endl;
1195 *ofs << "version:\t" << d->version << std::endl;
1196 *ofs << "charset:\t" << d->charset << std::endl;
1197 *ofs << "type:\t" << d->type << std::endl;
1198 *ofs << "size:\t" << d->size << std::endl;
1199 *ofs << "left size:\t" << d->lsize << std::endl;
1200 *ofs << "right size:\t" << d->rsize << std::endl;
1201 *ofs << std::endl;
1202 }
1203 return EXIT_FAILURE;
1204 }
1205
1206 const std::vector<std::string>& rest_ = param.rest_args();
1207 std::vector<std::string> rest = rest_;
1208
1209 if (rest.empty()) {
1210 rest.push_back("-");
1211 }
1212
1213 size_t ibufsize = std::min(MAX_INPUT_BUFFER_SIZE,
1214 std::max(param.get<int>
1215 ("input-buffer-size"),
1216 MIN_INPUT_BUFFER_SIZE));
1217
1218 const bool partial = param.get<bool>("partial");
1219 if (partial) {
1220 ibufsize *= 8;
1221 }
1222
1223 MeCab::scoped_array<char> ibuf_data(new char[ibufsize]);
1224 char *ibuf = ibuf_data.get();
1225
1226 MeCab::scoped_ptr<MeCab::Tagger> tagger(model->createTagger());
1227
1228 if (!tagger.get()) {
1229 WHAT_ERROR("cannot create tagger");
1230 }
1231
1232 for (size_t i = 0; i < rest.size(); ++i) {
1233 MeCab::istream_wrapper ifs(rest[i].c_str());
1234 if (!*ifs) {
1235 WHAT_ERROR("no such file or directory: " << rest[i]);
1236 }
1237
1238 while (true) {
1239 if (!partial) {
1240 ifs->getline(ibuf, ibufsize);
1241 } else {
1242 std::string sentence;
1243 MeCab::scoped_fixed_array<char, BUF_SIZE> line;
1244 for (;;) {
1245 if (!ifs->getline(line.get(), line.size())) {
1246 ifs->clear(std::ios::eofbit|std::ios::badbit);
1247 break;
1248 }
1249 sentence += line.get();
1250 sentence += '\n';
1251 if (std::strcmp(line.get(), "EOS") == 0 || line[0] == '\0') {
1252 break;
1253 }
1254 }
1255 std::strncpy(ibuf, sentence.c_str(), ibufsize);
1256 }
1257 if (ifs->eof() && !ibuf[0]) {
1258 return false;
1259 }
1260 if (ifs->fail()) {
1261 std::cerr << "input-buffer overflow. "
1262 << "The line is split. use -b #SIZE option." << std::endl;
1263 ifs->clear();
1264 }
1265 const char *r = (nbest >= 2) ? tagger->parseNBest(nbest, ibuf) :
1266 tagger->parse(ibuf);
1267 if (!r) {
1268 WHAT_ERROR(tagger->what());
1269 }
1270 *ofs << r << std::flush;
1271 }
1272 }
1273
1274 return EXIT_SUCCESS;
1275
1276 #undef WHAT_ERROR
1277 }
1278