1 // MeCab -- Yet Another Part-of-Speech and Morphological Analyzer
2 //
3 //
4 //  Copyright(C) 2001-2006 Taku Kudo <taku@chasen.org>
5 //  Copyright(C) 2004-2006 Nippon Telegraph and Telephone Corporation
6 #include <cstring>
7 #include <iostream>
8 #include <iterator>
9 #include "common.h"
10 #include "connector.h"
11 #include "mecab.h"
12 #include "nbest_generator.h"
13 #include "param.h"
14 #include "scoped_ptr.h"
15 #include "stream_wrapper.h"
16 #include "string_buffer.h"
17 #include "thread.h"
18 #include "tokenizer.h"
19 #include "viterbi.h"
20 #include "writer.h"
21 
22 #ifdef HAVE_CONFIG_H
23 #include "config.h"
24 #endif
25 
// Accessors for the global error message. Only the declarations are visible
// in this section; the open/swap/create paths below report their failures
// through setGlobalError().
const char *getGlobalError();
void setGlobalError(const char *str);
28 
29 namespace MeCab {
30 namespace {
31 
// Theta used to initialise TaggerImpl and LatticeImpl, and the value that
// LatticeImpl::clear() restores.
const float kDefaultTheta = 0.75;
33 
34 const MeCab::Option long_options[] = {
35   { "rcfile",        'r',  0, "FILE",    "use FILE as resource file" },
36   { "dicdir",        'd',  0, "DIR",    "set DIR  as a system dicdir" },
37   { "userdic",        'u',  0, "FILE",    "use FILE as a user dictionary" },
38   { "lattice-level",      'l', "0", "INT",
39     "lattice information level (DEPRECATED)" },
40   { "dictionary-info",  'D', 0, 0, "show dictionary information and exit" },
41   { "output-format-type", 'O',  0, "TYPE",
42     "set output format type (wakati,none,...)" },
43   { "all-morphs",      'a', 0, 0,    "output all morphs(default false)" },
44   { "nbest",              'N', "1",
45     "INT", "output N best results (default 1)" },
46   { "partial",            'p',  0, 0,
47     "partial parsing mode (default false)" },
48   { "marginal",           'm',  0, 0,
49     "output marginal probability (default false)" },
50   { "max-grouping-size",  'M',  "24",
51     "INT",  "maximum grouping size for unknown words (default 24)" },
52   { "node-format",        'F',  "%m\\t%H\\n", "STR",
53     "use STR as the user-defined node format" },
54   { "unk-format",        'U',  "%m\\t%H\\n", "STR",
55     "use STR as the user-defined unknown node format"   },
56   { "bos-format",        'B',  "", "STR",
57     "use STR as the user-defined beginning-of-sentence format"   },
58   { "eos-format",        'E',  "EOS\\n", "STR",
59     "use STR as the user-defined end-of-sentence format"   },
60   { "eon-format",        'S',  "", "STR",
61     "use STR as the user-defined end-of-NBest format"   },
62   { "unk-feature",       'x',  0, "STR",
63     "use STR as the feature for unknown word" },
64   { "input-buffer-size",  'b',  0, "INT",
65     "set input buffer size (default 8192)" },
66   { "dump-config", 'P', 0, 0, "dump MeCab parameters" },
67   { "allocate-sentence",  'C', 0, 0,
68     "allocate new memory for input sentence" },
69   { "theta",        't',  "0.75",  "FLOAT",
70     "set temparature parameter theta (default 0.75)"  },
71   { "cost-factor",        'c',  "700",  "INT",
72     "set cost factor (default 700)"  },
73   { "output",        'o',  0,    "FILE",  "set the output file name" },
74   { "version",        'v',  0, 0,     "show the version and exit." },
75   { "help",          'h',  0, 0,     "show this help and exit." },
76   { 0, 0, 0, 0 }
77 };
78 
79 class ModelImpl: public Model {
80  public:
81   ModelImpl();
82   virtual ~ModelImpl();
83 
84   bool open(int argc, char **argv);
85   bool open(const char *arg);
86   bool open(const Param &param);
87 
88   bool swap(Model *model);
89 
is_available() const90   bool is_available() const {
91     return (viterbi_ && writer_.get());
92   }
93 
request_type() const94   int request_type() const {
95     return request_type_;
96   }
97 
theta() const98   double theta() const {
99     return theta_;
100   }
101 
dictionary_info() const102   const DictionaryInfo *dictionary_info() const {
103     return viterbi_->tokenizer() ?
104         viterbi_->tokenizer()->dictionary_info() : 0;
105   }
106 
transition_cost(unsigned short rcAttr,unsigned short lcAttr) const107   int transition_cost(unsigned short rcAttr,
108                       unsigned short lcAttr) const {
109     return viterbi_->connector()->transition_cost(rcAttr, lcAttr);
110   }
111 
lookup(const char * begin,const char * end,Lattice * lattice) const112   Node *lookup(const char *begin, const char *end,
113                Lattice *lattice) const {
114     return viterbi_->tokenizer()->lookup<false>(
115         begin, end,
116         lattice->allocator(), lattice);
117   }
118 
119   Tagger *createTagger() const;
120 
121   Lattice *createLattice() const;
122 
viterbi() const123   const Viterbi *viterbi() const {
124     return viterbi_;
125   }
126 
127   // moves the owership.
take_viterbi()128   Viterbi *take_viterbi() {
129     Viterbi *result = viterbi_;
130     viterbi_ = 0;
131     return result;
132   }
133 
writer() const134   const Writer *writer() const {
135     return writer_.get();
136   }
137 
138 #ifdef HAVE_ATOMIC_OPS
mutex() const139   read_write_mutex *mutex() const {
140     return &mutex_;
141   }
142 #endif
143 
144  private:
145   Viterbi            *viterbi_;
146   scoped_ptr<Writer>  writer_;
147   int                 request_type_;
148   double              theta_;
149 
150 #ifdef HAVE_ATOMIC_OPS
151   mutable read_write_mutex      mutex_;
152 #endif
153 };
154 
155 class TaggerImpl: public Tagger {
156  public:
157   bool                  open(int argc, char **argv);
158   bool                  open(const char *arg);
159   bool                  open(const ModelImpl &model);
160 
161   bool                  parse(Lattice *lattice) const;
162 
163   void                  set_request_type(int request_type);
164   int                   request_type() const;
165 
166   const char*           parse(const char*);
167   const char*           parse(const char*, size_t);
168   const char*           parse(const char*, size_t, char*, size_t);
169   const Node*           parseToNode(const char*);
170   const Node*           parseToNode(const char*, size_t = 0);
171   const char*           parseNBest(size_t, const char*);
172   const char*           parseNBest(size_t, const char*, size_t);
173   const char*           parseNBest(size_t, const char*,
174                                    size_t, char *, size_t);
175   bool                  parseNBestInit(const char*);
176   bool                  parseNBestInit(const char*, size_t);
177   const Node*           nextNode();
178   const char*           next();
179   const char*           next(char*, size_t);
180 
181   const char           *formatNode(const Node *);
182   const char           *formatNode(const Node *, char *, size_t);
183 
184   const DictionaryInfo *dictionary_info() const;
185 
186   void                  set_partial(bool partial);
187   bool                  partial() const;
188   void                  set_theta(float theta);
189   float                 theta() const;
190   void                  set_lattice_level(int level);
191   int                   lattice_level() const;
192   void                  set_all_morphs(bool all_morphs);
193   bool                  all_morphs() const;
194 
195   const char*           what() const;
196 
197   TaggerImpl();
198   virtual ~TaggerImpl();
199 
200  private:
model() const201   const ModelImpl *model() const { return current_model_; }
202 
set_what(const char * str)203    void set_what(const char *str) {
204      what_.assign(str);
205    }
206 
initRequestType()207   void initRequestType() {
208     mutable_lattice()->set_request_type(request_type_);
209     mutable_lattice()->set_theta(theta_);
210   }
211 
mutable_lattice()212   Lattice *mutable_lattice() {
213     if (!lattice_.get()) {
214       lattice_.reset(model()->createLattice());
215     }
216     return lattice_.get();
217   }
218 
219   const ModelImpl          *current_model_;
220   scoped_ptr<ModelImpl>     model_;
221   scoped_ptr<Lattice>       lattice_;
222   int                       request_type_;
223   double                    theta_;
224   std::string               what_;
225 };
226 
227 class LatticeImpl : public Lattice {
228  public:
229   explicit LatticeImpl(const Writer *writer = 0);
230   ~LatticeImpl();
231 
232   // clear internal lattice
233   void clear();
234 
is_available() const235   bool is_available() const {
236     return (sentence_ &&
237             !begin_nodes_.empty() &&
238             !end_nodes_.empty());
239   }
240 
241   // nbest;
242   bool next();
243 
244   // return bos/eos node
bos_node() const245   Node *bos_node() const { return end_nodes_[0]; }
eos_node() const246   Node *eos_node() const { return begin_nodes_[size()]; }
begin_nodes() const247   Node **begin_nodes() const { return const_cast<Node **>(&begin_nodes_[0]); }
end_nodes() const248   Node **end_nodes() const   { return const_cast<Node **>(&end_nodes_[0]); }
begin_nodes(size_t pos) const249   Node *begin_nodes(size_t pos) const { return begin_nodes_[pos]; }
end_nodes(size_t pos) const250   Node *end_nodes(size_t pos) const { return end_nodes_[pos]; }
251 
sentence() const252   const char *sentence() const { return sentence_; }
253   void set_sentence(const char *sentence);
254   void set_sentence(const char *sentence, size_t len);
size() const255   size_t size() const { return size_; }
256 
set_Z(double Z)257   void set_Z(double Z) { Z_ = Z; }
Z() const258   double Z() const { return Z_; }
259 
theta() const260   float theta() const { return theta_; }
set_theta(float theta)261   void  set_theta(float theta) { theta_ = theta; }
262 
request_type() const263   int request_type() const { return request_type_; }
264 
set_request_type(int request_type)265   void set_request_type(int request_type) {
266     request_type_ = request_type;
267   }
has_request_type(int request_type) const268   bool has_request_type(int request_type) const {
269     return request_type & request_type_;
270   }
add_request_type(int request_type)271   void add_request_type(int request_type) {
272     request_type_ |= request_type;
273   }
remove_request_type(int request_type)274   void remove_request_type(int request_type) {
275     request_type_ &= ~request_type;
276   }
277 
allocator() const278   Allocator<Node, Path> *allocator() const {
279     return allocator_.get();
280   }
281 
newNode()282   Node *newNode() {
283     return allocator_->newNode();
284   }
285 
286   bool has_constraint() const;
287   int boundary_constraint(size_t pos) const;
288   const char *feature_constraint(size_t begin_pos) const;
289 
290   void set_boundary_constraint(size_t pos,
291                                int boundary_constraint_type);
292 
293   void set_feature_constraint(size_t begin_pos, size_t end_pos,
294                               const char *feature);
295 
296   void set_result(const char *result);
297 
what() const298   const char *what() const { return what_.c_str(); }
299 
set_what(const char * str)300   void set_what(const char *str) {
301     what_.assign(str);
302   }
303 
304   const char *toString();
305   const char *toString(char *buf, size_t size);
306   const char *toString(const Node *node);
307   const char *toString(const Node *node,
308                        char *buf, size_t size);
309   const char *enumNBestAsString(size_t N);
310   const char *enumNBestAsString(size_t N, char *buf, size_t size);
311 
312  private:
313   const char                 *sentence_;
314   size_t                      size_;
315   double                      theta_;
316   double                      Z_;
317   int                         request_type_;
318   std::string                 what_;
319   std::vector<Node *>         end_nodes_;
320   std::vector<Node *>         begin_nodes_;
321   std::vector<const char *>   feature_constraint_;
322   std::vector<unsigned char>  boundary_constraint_;
323   const Writer               *writer_;
324   scoped_ptr<StringBuffer>    ostrs_;
325   scoped_ptr<Allocator<Node, Path> > allocator_;
326 
stream()327   StringBuffer *stream() {
328     if (!ostrs_.get()) {
329       ostrs_.reset(new StringBuffer);
330     }
331     return ostrs_.get();
332   }
333 
334   const char *toStringInternal(StringBuffer *os);
335   const char *toStringInternal(const Node *node, StringBuffer *os);
336   const char *enumNBestAsStringInternal(size_t N, StringBuffer *os);
337 };
338 
ModelImpl()339 ModelImpl::ModelImpl()
340     : viterbi_(new Viterbi), writer_(new Writer),
341       request_type_(MECAB_ONE_BEST), theta_(0.0) {}
342 
~ModelImpl()343 ModelImpl::~ModelImpl() {
344   delete viterbi_;
345   viterbi_ = 0;
346 }
347 
open(int argc,char ** argv)348 bool ModelImpl::open(int argc, char **argv) {
349   Param param;
350   if (!param.open(argc, argv, long_options) ||
351       !load_dictionary_resource(&param)) {
352     setGlobalError(param.what());
353     return false;
354   }
355   return open(param);
356 }
357 
open(const char * arg)358 bool ModelImpl::open(const char *arg) {
359   Param param;
360   if (!param.open(arg, long_options) ||
361       !load_dictionary_resource(&param)) {
362     setGlobalError(param.what());
363     return false;
364   }
365   return open(param);
366 }
367 
open(const Param & param)368 bool ModelImpl::open(const Param &param) {
369   if (!writer_->open(param) || !viterbi_->open(param)) {
370     std::string error = viterbi_->what();
371     if (!error.empty()) {
372       error.append(" ");
373     }
374     error.append(writer_->what());
375     setGlobalError(error.c_str());
376     return false;
377   }
378 
379   request_type_ = load_request_type(param);
380   theta_ = param.get<double>("theta");
381 
382   return is_available();
383 }
384 
swap(Model * model)385 bool ModelImpl::swap(Model *model) {
386   scoped_ptr<Model> model_data(model);
387 
388   if (!is_available()) {
389     setGlobalError("current model is not available");
390     return false;
391   }
392 #ifndef HAVE_ATOMIC_OPS
393   setGlobalError("atomic model replacement is not supported");
394   return false;
395 #else
396   ModelImpl *m = static_cast<ModelImpl *>(model_data.get());
397   if (!m) {
398     setGlobalError("Invalid model is passed");
399     return false;
400   }
401 
402   if (!m->is_available()) {
403     setGlobalError("Passed model is not available");
404     return false;
405   }
406 
407   Viterbi *current_viterbi = viterbi_;
408   {
409     scoped_writer_lock l(mutex());
410     viterbi_      = m->take_viterbi();
411     request_type_ = m->request_type();
412     theta_        = m->theta();
413   }
414 
415   delete current_viterbi;
416 
417   return true;
418 #endif
419 }
420 
createTagger() const421 Tagger *ModelImpl::createTagger() const {
422   if (!is_available()) {
423     setGlobalError("Model is not available");
424     return 0;
425   }
426   TaggerImpl *tagger = new TaggerImpl;
427   if (!tagger->open(*this)) {
428     setGlobalError(tagger->what());
429     delete tagger;
430     return 0;
431   }
432   tagger->set_theta(theta_);
433   tagger->set_request_type(request_type_);
434   return tagger;
435 }
436 
createLattice() const437 Lattice *ModelImpl::createLattice() const {
438   if (!is_available()) {
439     setGlobalError("Model is not available");
440     return 0;
441   }
442   return new LatticeImpl(writer_.get());
443 }
444 
TaggerImpl()445 TaggerImpl::TaggerImpl()
446     : current_model_(0),
447       request_type_(MECAB_ONE_BEST), theta_(kDefaultTheta) {}
448 
~TaggerImpl()449 TaggerImpl::~TaggerImpl() {}
450 
what() const451 const char *TaggerImpl::what() const {
452   return what_.c_str();
453 }
454 
open(int argc,char ** argv)455 bool TaggerImpl::open(int argc, char **argv) {
456   model_.reset(new ModelImpl);
457   if (!model_->open(argc, argv)) {
458     model_.reset(0);
459     return false;
460   }
461   current_model_ = model_.get();
462   request_type_ = model()->request_type();
463   theta_        = model()->theta();
464   return true;
465 }
466 
open(const char * arg)467 bool TaggerImpl::open(const char *arg) {
468   model_.reset(new ModelImpl);
469   if (!model_->open(arg)) {
470     model_.reset(0);
471     return false;
472   }
473   current_model_ = model_.get();
474   request_type_ = model()->request_type();
475   theta_        = model()->theta();
476   return true;
477 }
478 
open(const ModelImpl & model)479 bool TaggerImpl::open(const ModelImpl &model) {
480   if (!model.is_available()) {
481     return false;
482   }
483   model_.reset(0);
484   current_model_ = &model;
485   request_type_ = current_model_->request_type();
486   theta_        = current_model_->theta();
487   return true;
488 }
489 
set_request_type(int request_type)490 void TaggerImpl::set_request_type(int request_type) {
491   request_type_ = request_type;
492 }
493 
request_type() const494 int TaggerImpl::request_type() const {
495   return request_type_;
496 }
497 
set_partial(bool partial)498 void TaggerImpl::set_partial(bool partial) {
499   if (partial) {
500     request_type_ |= MECAB_PARTIAL;
501   } else {
502     request_type_ &= ~MECAB_PARTIAL;
503   }
504 }
505 
partial() const506 bool TaggerImpl::partial() const {
507   return request_type_ & MECAB_PARTIAL;
508 }
509 
set_theta(float theta)510 void TaggerImpl::set_theta(float theta) {
511   theta_ = theta;
512 }
513 
theta() const514 float TaggerImpl::theta() const {
515   return theta_;
516 }
517 
set_lattice_level(int level)518 void TaggerImpl::set_lattice_level(int level) {
519   switch (level) {
520     case 0: request_type_ |= MECAB_ONE_BEST;
521       break;
522     case 1: request_type_ |= MECAB_NBEST;
523       break;
524     case 2: request_type_ |= MECAB_MARGINAL_PROB;
525       break;
526     default:
527       break;
528   }
529 }
530 
lattice_level() const531 int TaggerImpl::lattice_level() const {
532   if (request_type_ & MECAB_MARGINAL_PROB) {
533     return 2;
534   } else if (request_type_ & MECAB_NBEST) {
535     return 1;
536   } else {
537     return 0;
538   }
539 }
540 
set_all_morphs(bool all_morphs)541 void TaggerImpl::set_all_morphs(bool all_morphs) {
542   if (all_morphs) {
543     request_type_ |= MECAB_ALL_MORPHS;
544   } else {
545     request_type_ &= ~MECAB_ALL_MORPHS;
546   }
547 }
548 
all_morphs() const549 bool TaggerImpl::all_morphs() const {
550   return request_type_ & MECAB_ALL_MORPHS;
551 }
552 
parse(Lattice * lattice) const553 bool TaggerImpl::parse(Lattice *lattice) const {
554 #ifdef HAVE_ATOMIC_OPS
555   scoped_reader_lock l(model()->mutex());
556 #endif
557 
558   return model()->viterbi()->analyze(lattice);
559 }
560 
parse(const char * str)561 const char *TaggerImpl::parse(const char *str) {
562   return parse(str, std::strlen(str));
563 }
564 
parse(const char * str,size_t len)565 const char *TaggerImpl::parse(const char *str, size_t len) {
566   Lattice *lattice = mutable_lattice();
567   lattice->set_sentence(str, len);
568   initRequestType();
569   if (!parse(lattice)) {
570     set_what(lattice->what());
571     return 0;
572   }
573   const char *result = lattice->toString();
574   if (!result) {
575     set_what(lattice->what());
576     return 0;
577   }
578   return result;
579 }
580 
parse(const char * str,size_t len,char * out,size_t len2)581 const char *TaggerImpl::parse(const char *str, size_t len,
582                               char *out, size_t len2) {
583   Lattice *lattice = mutable_lattice();
584   lattice->set_sentence(str, len);
585   initRequestType();
586   if (!parse(lattice)) {
587     set_what(lattice->what());
588     return 0;
589   }
590   const char *result = lattice->toString(out, len2);
591   if (!result) {
592     set_what(lattice->what());
593     return 0;
594   }
595   return result;
596 }
597 
parseToNode(const char * str)598 const Node *TaggerImpl::parseToNode(const char *str) {
599   return parseToNode(str, std::strlen(str));
600 }
601 
parseToNode(const char * str,size_t len)602 const Node *TaggerImpl::parseToNode(const char *str, size_t len) {
603   Lattice *lattice = mutable_lattice();
604   lattice->set_sentence(str, len);
605   initRequestType();
606   if (!parse(lattice)) {
607     set_what(lattice->what());
608     return 0;
609   }
610   return lattice->bos_node();
611 }
612 
parseNBestInit(const char * str)613 bool TaggerImpl::parseNBestInit(const char *str) {
614   return parseNBestInit(str, std::strlen(str));
615 }
616 
parseNBestInit(const char * str,size_t len)617 bool TaggerImpl::parseNBestInit(const char *str, size_t len) {
618   Lattice *lattice = mutable_lattice();
619   lattice->set_sentence(str, len);
620   initRequestType();
621   lattice->add_request_type(MECAB_NBEST);
622   if (!parse(lattice)) {
623     set_what(lattice->what());
624     return false;
625   }
626   return true;
627 }
628 
nextNode()629 const Node* TaggerImpl::nextNode() {
630   Lattice *lattice = mutable_lattice();
631   if (!lattice->next()) {
632     lattice->set_what("no more results");
633     return 0;
634   }
635   return lattice->bos_node();
636 }
637 
next()638 const char* TaggerImpl::next() {
639   Lattice *lattice = mutable_lattice();
640   if (!lattice->next()) {
641     lattice->set_what("no more results");
642     return 0;
643   }
644   const char *result = lattice->toString();
645   if (!result) {
646     set_what(lattice->what());
647     return 0;
648   }
649   return result;
650 }
651 
next(char * out,size_t len2)652 const char* TaggerImpl::next(char *out, size_t len2) {
653   Lattice *lattice = mutable_lattice();
654   if (!lattice->next()) {
655     lattice->set_what("no more results");
656     return 0;
657   }
658   const char *result = lattice->toString(out, len2);
659   if (!result) {
660     set_what(lattice->what());
661     return 0;
662   }
663   return result;
664 }
665 
parseNBest(size_t N,const char * str)666 const char* TaggerImpl::parseNBest(size_t N, const char* str) {
667   return parseNBest(N, str, std::strlen(str));
668 }
669 
parseNBest(size_t N,const char * str,size_t len)670 const char* TaggerImpl::parseNBest(size_t N,
671                                    const char* str, size_t len) {
672   Lattice *lattice = mutable_lattice();
673   lattice->set_sentence(str, len);
674   initRequestType();
675   lattice->add_request_type(MECAB_NBEST);
676 
677   if (!parse(lattice)) {
678     set_what(lattice->what());
679     return 0;
680   }
681 
682   const char *result = lattice->enumNBestAsString(N);
683   if (!result) {
684     set_what(lattice->what());
685     return 0;
686   }
687   return result;
688 }
689 
parseNBest(size_t N,const char * str,size_t len,char * out,size_t len2)690 const char* TaggerImpl::parseNBest(size_t N, const char* str, size_t len,
691                                    char *out, size_t len2) {
692   Lattice *lattice = mutable_lattice();
693   lattice->set_sentence(str, len);
694   initRequestType();
695   lattice->add_request_type(MECAB_NBEST);
696 
697   if (!parse(lattice)) {
698     set_what(lattice->what());
699     return 0;
700   }
701 
702   const char *result = lattice->enumNBestAsString(N, out, len2);
703   if (!result) {
704     set_what(lattice->what());
705     return 0;
706   }
707   return result;
708 }
709 
formatNode(const Node * node)710 const char* TaggerImpl::formatNode(const Node* node) {
711   const char *result = mutable_lattice()->toString(node);
712   if (!result) {
713     set_what(mutable_lattice()->what());
714     return 0;
715   }
716   return result;
717 }
718 
formatNode(const Node * node,char * out,size_t len)719 const char* TaggerImpl::formatNode(const Node* node,
720                                    char *out, size_t len) {
721   const char *result = mutable_lattice()->toString(node, out, len);
722   if (!result) {
723     set_what(mutable_lattice()->what());
724     return 0;
725   }
726   return result;
727 }
728 
dictionary_info() const729 const DictionaryInfo *TaggerImpl::dictionary_info() const {
730   return model()->dictionary_info();
731 }
732 
LatticeImpl(const Writer * writer)733 LatticeImpl::LatticeImpl(const Writer *writer)
734     : sentence_(0), size_(0), theta_(kDefaultTheta), Z_(0.0),
735       request_type_(MECAB_ONE_BEST),
736       writer_(writer),
737       ostrs_(0),
738       allocator_(new Allocator<Node, Path>) {
739   begin_nodes_.reserve(MIN_INPUT_BUFFER_SIZE);
740   end_nodes_.reserve(MIN_INPUT_BUFFER_SIZE);
741 }
742 
~LatticeImpl()743 LatticeImpl::~LatticeImpl() {}
744 
clear()745 void LatticeImpl::clear() {
746   allocator_->free();
747   if (ostrs_.get()) {
748     ostrs_->clear();
749   }
750   begin_nodes_.clear();
751   end_nodes_.clear();
752   feature_constraint_.clear();
753   boundary_constraint_.clear();
754   size_ = 0;
755   theta_ = kDefaultTheta;
756   Z_ = 0.0;
757   sentence_ = 0;
758 }
759 
set_sentence(const char * sentence)760 void LatticeImpl::set_sentence(const char *sentence) {
761   return set_sentence(sentence, strlen(sentence));
762 }
763 
set_sentence(const char * sentence,size_t len)764 void LatticeImpl::set_sentence(const char *sentence, size_t len) {
765   clear();
766   end_nodes_.resize(len + 4);
767   begin_nodes_.resize(len + 4);
768 
769   if (has_request_type(MECAB_ALLOCATE_SENTENCE) ||
770       has_request_type(MECAB_PARTIAL)) {
771     char *new_sentence = allocator()->strdup(sentence, len);
772     sentence_ = new_sentence;
773   } else {
774     sentence_ = sentence;
775   }
776 
777   size_ = len;
778   std::memset(&end_nodes_[0],   0,
779               sizeof(end_nodes_[0]) * (len + 4));
780   std::memset(&begin_nodes_[0], 0,
781               sizeof(begin_nodes_[0]) * (len + 4));
782 }
783 
next()784 bool LatticeImpl::next() {
785   if (!has_request_type(MECAB_NBEST)) {
786     set_what("MECAB_NBEST request type is not set");
787     return false;
788   }
789 
790   if (!allocator()->nbest_generator()->next()) {
791     return false;
792   }
793 
794   Viterbi::buildResultForNBest(this);
795   return true;
796 }
797 
set_result(const char * result)798 void LatticeImpl::set_result(const char *result) {
799   char *str = allocator()->strdup(result, std::strlen(result));
800   std::vector<char *> lines;
801   const size_t lsize = tokenize(str, "\n",
802                                 std::back_inserter(lines),
803                                 std::strlen(result));
804   CHECK_DIE(lsize == lines.size());
805 
806   std::string sentence;
807   std::vector<std::string> surfaces, features;
808   for (size_t i = 0; i < lines.size(); ++i) {
809     if (::strcmp("EOS", lines[i]) == 0) {
810       break;
811     }
812     char *cols[2];
813     if (tokenize(lines[i], "\t", cols, 2) != 2) {
814       break;
815     }
816     sentence += cols[0];
817     surfaces.push_back(cols[0]);
818     features.push_back(cols[1]);
819   }
820 
821   CHECK_DIE(features.size() == surfaces.size());
822 
823   set_sentence(allocator()->strdup(sentence.c_str(), sentence.size()));
824 
825   Node *bos_node = allocator()->newNode();
826   bos_node->surface = const_cast<const char *>(BOS_KEY);  // dummy
827   bos_node->feature = "BOS/EOS";
828   bos_node->isbest = 1;
829   bos_node->stat = MECAB_BOS_NODE;
830 
831   Node *eos_node = allocator()->newNode();
832   eos_node->surface = const_cast<const char *>(BOS_KEY);  // dummy
833   eos_node->feature = "BOS/EOS";
834   eos_node->isbest = 1;
835   eos_node->stat = MECAB_EOS_NODE;
836 
837   bos_node->surface = sentence_;
838   end_nodes_[0] = bos_node;
839 
840   size_t offset = 0;
841   Node *prev = bos_node;
842   for (size_t i = 0; i < surfaces.size(); ++i) {
843     Node *node = allocator()->newNode();
844     node->prev = prev;
845     prev->next = node;
846     node->surface = sentence_ + offset;
847     node->length = surfaces[i].size();
848     node->rlength = surfaces[i].size();
849     node->isbest = 1;
850     node->stat = MECAB_NOR_NODE;
851     node->wcost = 0;
852     node->cost = 0;
853     node->feature = allocator()->strdup(features[i].c_str(),
854                                         features[i].size());
855     begin_nodes_[offset] = node;
856     end_nodes_[offset + node->length] = node;
857     offset += node->length;
858     prev = node;
859   }
860 
861   prev->next = eos_node;
862   eos_node->prev = prev;
863 }
864 
865 // default implementation of Lattice formatter.
866 namespace {
writeLattice(Lattice * lattice,StringBuffer * os)867 void writeLattice(Lattice *lattice, StringBuffer *os) {
868   for (const Node *node = lattice->bos_node()->next;
869        node->next; node = node->next) {
870     os->write(node->surface, node->length);
871     *os << '\t' << node->feature;
872     *os << '\n';
873   }
874   *os << "EOS\n";
875 }
876 }  // namespace
877 
toString()878 const char *LatticeImpl::toString() {
879   return toStringInternal(stream());
880 }
881 
toString(char * buf,size_t size)882 const char *LatticeImpl::toString(char *buf, size_t size) {
883   StringBuffer os(buf, size);
884   return toStringInternal(&os);
885 }
886 
toStringInternal(StringBuffer * os)887 const char *LatticeImpl::toStringInternal(StringBuffer *os) {
888   os->clear();
889   if (writer_) {
890     if (!writer_->write(this, os)) {
891       return 0;
892     }
893   } else {
894     writeLattice(this, os);
895   }
896   *os << '\0';
897   if (!os->str()) {
898     set_what("output buffer overflow");
899     return 0;
900   }
901   return os->str();
902 }
903 
toString(const Node * node)904 const char *LatticeImpl::toString(const Node *node) {
905   return toStringInternal(node, stream());
906 }
907 
toString(const Node * node,char * buf,size_t size)908 const char *LatticeImpl::toString(const Node *node,
909                                   char *buf, size_t size) {
910   StringBuffer os(buf, size);
911   return toStringInternal(node, &os);
912 }
913 
toStringInternal(const Node * node,StringBuffer * os)914 const char *LatticeImpl::toStringInternal(const Node *node,
915                                           StringBuffer *os) {
916   os->clear();
917   if (!node) {
918     set_what("node is NULL");
919     return 0;
920   }
921   if (writer_) {
922     if (!writer_->writeNode(this, node, os)) {
923       return 0;
924     }
925   } else {
926     os->write(node->surface, node->length);
927     *os << '\t' << node->feature;
928   }
929   *os << '\0';
930   if (!os->str()) {
931     set_what("output buffer overflow");
932     return 0;
933   }
934   return os->str();
935 }
936 
enumNBestAsString(size_t N)937 const char *LatticeImpl::enumNBestAsString(size_t N) {
938   return enumNBestAsStringInternal(N, stream());
939 }
940 
enumNBestAsString(size_t N,char * buf,size_t size)941 const char *LatticeImpl::enumNBestAsString(size_t N, char *buf, size_t size) {
942   StringBuffer os(buf, size);
943   return enumNBestAsStringInternal(N, &os);
944 }
945 
enumNBestAsStringInternal(size_t N,StringBuffer * os)946 const char *LatticeImpl::enumNBestAsStringInternal(size_t N,
947                                                    StringBuffer *os) {
948   os->clear();
949 
950   if (N == 0 || N > NBEST_MAX) {
951     set_what("nbest size must be 1 <= nbest <= 512");
952     return 0;
953   }
954 
955   for (size_t i = 0; i < N; ++i) {
956     if (!next()) {
957       break;
958     }
959     if (writer_) {
960       if (!writer_->write(this, os)) {
961         return 0;
962       }
963     } else {
964       writeLattice(this, os);
965     }
966   }
967 
968   // make a dummy node for EON
969   if (writer_) {
970     Node eon_node;
971     memset(&eon_node, 0, sizeof(eon_node));
972     eon_node.stat = MECAB_EON_NODE;
973     eon_node.next = 0;
974     eon_node.surface = this->sentence() + this->size();
975     if (!writer_->writeNode(this, &eon_node, os)) {
976       return 0;
977     }
978   }
979   *os << '\0';
980 
981   if (!os->str()) {
982     set_what("output buffer overflow");
983     return 0;
984   }
985 
986   return os->str();
987 }
988 
has_constraint() const989 bool LatticeImpl::has_constraint() const {
990   return !boundary_constraint_.empty();
991 }
992 
boundary_constraint(size_t pos) const993 int LatticeImpl::boundary_constraint(size_t pos) const {
994   if (!boundary_constraint_.empty()) {
995     return boundary_constraint_[pos];
996   }
997   return MECAB_ANY_BOUNDARY;
998 }
999 
feature_constraint(size_t begin_pos) const1000 const char *LatticeImpl::feature_constraint(size_t begin_pos) const {
1001   if (!feature_constraint_.empty()) {
1002     return feature_constraint_[begin_pos];
1003   }
1004   return 0;
1005 }
1006 
set_boundary_constraint(size_t pos,int boundary_constraint_type)1007 void LatticeImpl::set_boundary_constraint(size_t pos,
1008                                           int boundary_constraint_type) {
1009   if (boundary_constraint_.empty()) {
1010     boundary_constraint_.resize(size() + 4, MECAB_ANY_BOUNDARY);
1011   }
1012   boundary_constraint_[pos] = boundary_constraint_type;
1013 }
1014 
set_feature_constraint(size_t begin_pos,size_t end_pos,const char * feature)1015 void LatticeImpl::set_feature_constraint(size_t begin_pos, size_t end_pos,
1016                                          const char *feature) {
1017   if (begin_pos >= end_pos || !feature) {
1018     return;
1019   }
1020 
1021   if (feature_constraint_.empty()) {
1022     feature_constraint_.resize(size() + 4, 0);
1023   }
1024 
1025   end_pos = std::min(end_pos, size());
1026 
1027   set_boundary_constraint(begin_pos, MECAB_TOKEN_BOUNDARY);
1028   set_boundary_constraint(end_pos, MECAB_TOKEN_BOUNDARY);
1029   for (size_t i = begin_pos + 1; i < end_pos; ++i) {
1030     set_boundary_constraint(i, MECAB_INSIDE_TOKEN);
1031   }
1032 
1033   feature_constraint_[begin_pos] = feature;
1034 }
1035 }  // namespace
1036 
// Static factory taking argc/argv style arguments; forwards to
// createTagger(argc, argv) and returns its result unchanged.
Tagger *Tagger::create(int argc, char **argv) {
  return createTagger(argc, argv);
}
1040 
// Static factory taking a single argument string; forwards to
// createTagger(arg) and returns its result unchanged.
Tagger *Tagger::create(const char *arg) {
  return createTagger(arg);
}
1044 
// Returns VERSION (presumably the package version string supplied by the
// build configuration -- confirm where VERSION is defined).
const char *Tagger::version() {
  return VERSION;
}
1048 
createTagger(int argc,char ** argv)1049 Tagger *createTagger(int argc, char **argv) {
1050   TaggerImpl *tagger = new TaggerImpl();
1051   if (!tagger->open(argc, argv)) {
1052     setGlobalError(tagger->what());
1053     delete tagger;
1054     return 0;
1055   }
1056   return tagger;
1057 }
1058 
createTagger(const char * argv)1059 Tagger *createTagger(const char *argv) {
1060   TaggerImpl *tagger = new TaggerImpl();
1061   if (!tagger->open(argv)) {
1062     setGlobalError(tagger->what());
1063     delete tagger;
1064     return 0;
1065   }
1066   return tagger;
1067 }
1068 
// Destroys `tagger` with delete.
void deleteTagger(Tagger *tagger) {
  delete tagger;
}
1072 
// Returns the same message as getLastError(); this function only forwards.
const char *getTaggerError() {
  return getLastError();
}
1076 
// Returns the error message currently held by getGlobalError(), i.e. the one
// most recently stored with setGlobalError().
const char *getLastError() {
  return getGlobalError();
}
1080 
createModel(int argc,char ** argv)1081 Model *createModel(int argc, char **argv) {
1082   ModelImpl *model = new ModelImpl;
1083   if (!model->open(argc, argv)) {
1084     delete model;
1085     return 0;
1086   }
1087   return model;
1088 }
1089 
createModel(const char * arg)1090 Model *createModel(const char *arg) {
1091   ModelImpl *model = new ModelImpl;
1092   if (!model->open(arg)) {
1093     delete model;
1094     return 0;
1095   }
1096   return model;
1097 }
1098 
// Destroys `model` with delete.
void deleteModel(Model *model) {
  delete model;
}
1102 
// Static factory taking argc/argv style arguments; forwards to
// createModel(argc, argv) and returns its result unchanged.
Model *Model::create(int argc, char **argv) {
  return createModel(argc, argv);
}
1106 
// Static factory taking a single argument string; forwards to
// createModel(arg) and returns its result unchanged.
Model *Model::create(const char *arg) {
  return createModel(arg);
}
1110 
// Returns VERSION (presumably the package version string supplied by the
// build configuration -- confirm where VERSION is defined).
const char *Model::version() {
  return VERSION;
}
1114 
parse(const Model & model,Lattice * lattice)1115 bool Tagger::parse(const Model &model, Lattice *lattice) {
1116   scoped_ptr<Tagger> tagger(model.createTagger());
1117   return tagger->parse(lattice);
1118 }
1119 
// Static factory; forwards to createLattice() and returns its result.
Lattice *Lattice::create() {
  return createLattice();
}
1123 
// Allocates a new LatticeImpl with new and returns it as a Lattice pointer.
// The object is meant to be released with deleteLattice().
Lattice *createLattice() {
  return new LatticeImpl;
}
1127 
// Destroys `lattice` with delete.
void deleteLattice(Lattice *lattice) {
  delete lattice;
}
1131 }  // MeCab
1132 
mecab_do(int argc,char ** argv)1133 int mecab_do(int argc, char **argv) {
1134 #define WHAT_ERROR(msg) do {                    \
1135     std::cout << msg << std::endl;              \
1136     return EXIT_FAILURE; }                      \
1137   while (0);
1138 
1139   MeCab::Param param;
1140   if (!param.open(argc, argv, MeCab::long_options)) {
1141     std::cout << param.what() << std::endl;
1142     return EXIT_FAILURE;
1143   }
1144 
1145   if (param.get<bool>("help")) {
1146     std::cout << param.help() << std::endl;
1147     return EXIT_SUCCESS;
1148   }
1149 
1150   if (param.get<bool>("version")) {
1151     std::cout << param.version() << std::endl;
1152     return EXIT_SUCCESS;
1153   }
1154 
1155   if (!load_dictionary_resource(&param)) {
1156     std::cout << param.what() << std::endl;
1157     return EXIT_SUCCESS;
1158   }
1159 
1160   if (param.get<int>("lattice-level") >= 1) {
1161     std::cerr << "lattice-level is DEPERCATED. "
1162               << "use --marginal or --nbest." << std::endl;
1163   }
1164 
1165   MeCab::scoped_ptr<MeCab::ModelImpl> model(new MeCab::ModelImpl);
1166   if (!model->open(param)) {
1167     std::cout << MeCab::getLastError() << std::endl;
1168     return EXIT_FAILURE;
1169   }
1170 
1171   std::string ofilename = param.get<std::string>("output");
1172   if (ofilename.empty()) {
1173     ofilename = "-";
1174   }
1175 
1176   const int nbest = param.get<int>("nbest");
1177   if (nbest <= 0 || nbest > NBEST_MAX) {
1178     WHAT_ERROR("invalid N value");
1179   }
1180 
1181   MeCab::ostream_wrapper ofs(ofilename.c_str());
1182   if (!*ofs) {
1183     WHAT_ERROR("no such file or directory: " << ofilename);
1184   }
1185 
1186   if (param.get<bool>("dump-config")) {
1187     param.dump_config(&*ofs);
1188     return EXIT_FAILURE;
1189   }
1190 
1191   if (param.get<bool>("dictionary-info")) {
1192     for (const MeCab::DictionaryInfo *d = model->dictionary_info();
1193          d; d = d->next) {
1194       *ofs << "filename:\t" << d->filename << std::endl;
1195       *ofs << "version:\t" << d->version << std::endl;
1196       *ofs << "charset:\t" << d->charset << std::endl;
1197       *ofs << "type:\t" << d->type   << std::endl;
1198       *ofs << "size:\t" << d->size << std::endl;
1199       *ofs << "left size:\t" << d->lsize << std::endl;
1200       *ofs << "right size:\t" << d->rsize << std::endl;
1201       *ofs << std::endl;
1202     }
1203     return EXIT_FAILURE;
1204   }
1205 
1206   const std::vector<std::string>& rest_ = param.rest_args();
1207   std::vector<std::string> rest = rest_;
1208 
1209   if (rest.empty()) {
1210     rest.push_back("-");
1211   }
1212 
1213   size_t ibufsize = std::min(MAX_INPUT_BUFFER_SIZE,
1214                              std::max(param.get<int>
1215                                             ("input-buffer-size"),
1216                                             MIN_INPUT_BUFFER_SIZE));
1217 
1218   const bool partial = param.get<bool>("partial");
1219   if (partial) {
1220     ibufsize *= 8;
1221   }
1222 
1223   MeCab::scoped_array<char> ibuf_data(new char[ibufsize]);
1224   char *ibuf = ibuf_data.get();
1225 
1226   MeCab::scoped_ptr<MeCab::Tagger> tagger(model->createTagger());
1227 
1228   if (!tagger.get()) {
1229     WHAT_ERROR("cannot create tagger");
1230   }
1231 
1232   for (size_t i = 0; i < rest.size(); ++i) {
1233     MeCab::istream_wrapper ifs(rest[i].c_str());
1234     if (!*ifs) {
1235       WHAT_ERROR("no such file or directory: " << rest[i]);
1236     }
1237 
1238     while (true) {
1239       if (!partial) {
1240         ifs->getline(ibuf, ibufsize);
1241       } else {
1242         std::string sentence;
1243         MeCab::scoped_fixed_array<char, BUF_SIZE> line;
1244         for (;;) {
1245           if (!ifs->getline(line.get(), line.size())) {
1246             ifs->clear(std::ios::eofbit|std::ios::badbit);
1247             break;
1248           }
1249           sentence += line.get();
1250           sentence += '\n';
1251           if (std::strcmp(line.get(), "EOS") == 0 || line[0] == '\0') {
1252             break;
1253           }
1254         }
1255         std::strncpy(ibuf, sentence.c_str(), ibufsize);
1256       }
1257       if (ifs->eof() && !ibuf[0]) {
1258         return false;
1259       }
1260       if (ifs->fail()) {
1261         std::cerr << "input-buffer overflow. "
1262                   << "The line is split. use -b #SIZE option." << std::endl;
1263         ifs->clear();
1264       }
1265       const char *r = (nbest >= 2) ? tagger->parseNBest(nbest, ibuf) :
1266           tagger->parse(ibuf);
1267       if (!r)  {
1268         WHAT_ERROR(tagger->what());
1269       }
1270       *ofs << r << std::flush;
1271     }
1272   }
1273 
1274   return EXIT_SUCCESS;
1275 
1276 #undef WHAT_ERROR
1277 }
1278