1 //  MeCab -- Yet Another Part-of-Speech and Morphological Analyzer
2 //
3 //
4 //  Copyright(C) 2001-2006 Taku Kudo <taku@chasen.org>
5 //  Copyright(C) 2004-2006 Nippon Telegraph and Telephone Corporation
6 #include <fstream>
7 #include <string>
8 #include <vector>
9 #include "common.h"
10 #include "feature_index.h"
11 #include "freelist.h"
12 #include "lbfgs.h"
13 #include "learner_tagger.h"
14 #include "param.h"
15 #include "string_buffer.h"
16 #include "thread.h"
17 #include "utils.h"
18 
19 namespace MeCab {
20 namespace {
21 
// Expands to a C string built by create_filename() from `dicdir` and |file|.
// The macro does not take `dicdir` as an argument, so a variable with that
// name must be in scope wherever DCONF is used.
// NOTE(review): the pointer is obtained via c_str() on the value returned by
// create_filename(); presumably that value is a temporary std::string, in
// which case the pointer must not outlive the enclosing full expression --
// confirm against the declaration of create_filename().
#define DCONF(file) create_filename(dicdir, std::string(file)).c_str()
23 
24 #ifdef MECAB_USE_THREAD
25 class learner_thread: public thread {
26  public:
27   unsigned short start_i;
28   unsigned short thread_num;
29   size_t size;
30   size_t micro_p;
31   size_t micro_r;
32   size_t micro_c;
33   size_t err;
34   double f;
35   EncoderLearnerTagger **x;
36   std::vector<double> expected;
run()37   void run() {
38     micro_p = micro_r = micro_c = err = 0;
39     f = 0.0;
40     std::fill(expected.begin(), expected.end(), 0.0);
41     for (size_t i = start_i; i < size; i += thread_num) {
42       f += x[i]->gradient(&expected[0]);
43       err += x[i]->eval(&micro_c, &micro_p, &micro_r);
44     }
45   }
46 };
47 #endif
48 
49 class CRFLearner {
50  public:
run(Param * param)51   static int run(Param *param) {
52     const std::string dicdir = param->get<std::string>("dicdir");
53     CHECK_DIE(param->load(DCONF(DICRC)))
54         << "no such file or directory: " << DCONF(DICRC);
55 
56     const std::vector<std::string> &files = param->rest_args();
57     if (files.size() != 2) {
58       std::cout << "Usage: " <<
59           param->program_name() << " corpus model" << std::endl;
60       return -1;
61     }
62 
63     const std::string ifile = files[0];
64     const std::string model = files[1];
65     const std::string old_model = param->get<std::string>("old-model");
66 
67     EncoderFeatureIndex feature_index;
68     std::vector<double> expected;
69     std::vector<double> observed;
70     std::vector<double> alpha;
71     std::vector<double> old_alpha;
72     std::vector<EncoderLearnerTagger *> x;
73     Tokenizer<LearnerNode, LearnerPath> tokenizer;
74     Allocator<LearnerNode, LearnerPath> allocator;
75 
76     CHECK_DIE(tokenizer.open(*param)) << "cannot open tokenizer";
77     CHECK_DIE(feature_index.open(*param)) << "cannot open feature index";
78 
79     if (!old_model.empty()) {
80       std::cout << "Using previous model: " << old_model << std::endl;
81       std::cout << "--cost --freq and --eta options are overwritten."
82                 << std::endl;
83       CHECK_DIE(tokenizer.dictionary_info());
84       const char *dic_charset = tokenizer.dictionary_info()->charset;
85       feature_index.reopen(old_model.c_str(),
86                            dic_charset, &old_alpha, param);
87     }
88 
89     const double C = param->get<double>("cost");
90     const double eta = param->get<double>("eta");
91     const size_t eval_size = param->get<size_t>("eval-size");
92     const size_t unk_eval_size = param->get<size_t>("unk-eval-size");
93     const size_t thread_num = param->get<size_t>("thread");
94     const size_t freq = param->get<size_t>("freq");
95 
96     CHECK_DIE(C > 0) << "cost parameter is out of range: " << C;
97     CHECK_DIE(eta > 0) "eta is out of range: " << eta;
98     CHECK_DIE(eval_size > 0) << "eval-size is out of range: " << eval_size;
99     CHECK_DIE(unk_eval_size > 0) <<
100         "unk-eval-size is out of range: " << unk_eval_size;
101     CHECK_DIE(freq > 0) <<
102         "freq is out of range: " << unk_eval_size;
103     CHECK_DIE(thread_num > 0 && thread_num <= 512)
104         << "# thread is invalid: " << thread_num;
105 
106     std::cout.setf(std::ios::fixed, std::ios::floatfield);
107     std::cout.precision(5);
108 
109     std::cout << "reading corpus ..." << std::flush;
110 
111     std::ifstream ifs(WPATH(ifile.c_str()));
112     CHECK_DIE(ifs) << "no such file or directory: " << ifile;
113 
114     while (ifs) {
115       EncoderLearnerTagger *tagger = new EncoderLearnerTagger();
116 
117       CHECK_DIE(tagger->open(&tokenizer,
118                              &allocator,
119                              &feature_index,
120                              eval_size,
121                              unk_eval_size));
122 
123       CHECK_DIE(tagger->read(&ifs, &observed));
124 
125       if (!tagger->empty()) {
126         x.push_back(tagger);
127       } else {
128         delete tagger;
129       }
130 
131       if (x.size() % 100 == 0) {
132         std::cout << x.size() << "... " << std::flush;
133       }
134     }
135 
136     feature_index.shrink(freq, &observed);
137     feature_index.clearcache();
138 
139     const size_t psize = feature_index.size();
140     observed.resize(psize);
141     expected.resize(psize);
142     alpha.resize(psize);
143     old_alpha.resize(psize);
144     alpha = old_alpha;
145 
146     feature_index.set_alpha(&alpha[0]);
147 
148     std::cout << std::endl;
149     std::cout << "Number of sentences: " << x.size()  << std::endl;
150     std::cout << "Number of features:  " << psize     << std::endl;
151     std::cout << "eta:                 " << eta       << std::endl;
152     std::cout << "freq:                " << freq      << std::endl;
153     std::cout << "eval-size:           " << eval_size << std::endl;
154     std::cout << "unk-eval-size:       " << unk_eval_size << std::endl;
155 #ifdef MECAB_USE_THREAD
156     std::cout << "threads:             " << thread_num << std::endl;
157 #endif
158     std::cout << "charset:             " <<
159         tokenizer.dictionary_info()->charset << std::endl;
160     std::cout << "C(sigma^2):          " << C          << std::endl
161               << std::endl;
162 
163 #ifdef MECAB_USE_THREAD
164     std::vector<learner_thread> thread;
165     if (thread_num > 1) {
166       thread.resize(thread_num);
167       for (size_t i = 0; i < thread_num; ++i) {
168         thread[i].start_i = i;
169         thread[i].size = x.size();
170         thread[i].thread_num = thread_num;
171         thread[i].x = &x[0];
172         thread[i].expected.resize(expected.size());
173       }
174     }
175 #endif
176 
177     int converge = 0;
178     double prev_obj = 0.0;
179     LBFGS lbfgs;
180 
181     for (size_t itr = 0; ;  ++itr) {
182       std::fill(expected.begin(), expected.end(), 0.0);
183       double obj = 0.0;
184       size_t err = 0;
185       size_t micro_p = 0;
186       size_t micro_r = 0;
187       size_t micro_c = 0;
188 
189 #ifdef MECAB_USE_THREAD
190       if (thread_num > 1) {
191         for (size_t i = 0; i < thread_num; ++i) {
192           thread[i].start();
193         }
194 
195         for (size_t i = 0; i < thread_num; ++i) {
196           thread[i].join();
197         }
198 
199         for (size_t i = 0; i < thread_num; ++i) {
200           obj += thread[i].f;
201           err += thread[i].err;
202           micro_r += thread[i].micro_r;
203           micro_p += thread[i].micro_p;
204           micro_c += thread[i].micro_c;
205           for (size_t k = 0; k < psize; ++k) {
206             expected[k] += thread[i].expected[k];
207           }
208         }
209       } else
210 #endif
211       {
212         for (size_t i = 0; i < x.size(); ++i) {
213           obj += x[i]->gradient(&expected[0]);
214           err += x[i]->eval(&micro_c, &micro_p, &micro_r);
215         }
216       }
217 
218       const double p = 1.0 * micro_c / micro_p;
219       const double r = 1.0 * micro_c / micro_r;
220       const double micro_f = 2 * p * r / (p + r);
221 
222       for (size_t i = 0; i < psize; ++i) {
223         const double penalty = (alpha[i] - old_alpha[i]);
224         obj += (penalty * penalty / (2.0 * C));
225         expected[i] = expected[i] - observed[i] + penalty / C;
226       }
227 
228       const double diff = (itr == 0 ? 1.0 :
229                            std::fabs(1.0 * (prev_obj - obj)) / prev_obj);
230       std::cout << "iter="    << itr
231                 << " err="    << 1.0 * err/x.size()
232                 << " F="      << micro_f
233                 << " target=" << obj
234                 << " diff="   << diff << std::endl;
235       prev_obj = obj;
236 
237       if (diff < eta) {
238         converge++;
239       } else {
240         converge = 0;
241       }
242 
243       if (converge == 3) {
244         break;  // 3 is ad-hoc
245       }
246 
247       const int ret = lbfgs.optimize(psize,
248                                      &alpha[0], obj,
249                                      &expected[0], false, C);
250 
251       CHECK_DIE(ret >= 0) << "unexpected error in LBFGS routin";
252 
253       if (ret == 0) {
254         break;
255       }
256     }
257 
258     std::cout << "\nDone! writing model file ... " << std::endl;
259 
260     std::ostringstream oss;
261 
262     oss << "eta: "  << eta   << std::endl;
263     oss << "freq: " << freq  << std::endl;
264     oss << "C: "    << C     << std::endl;
265     oss.setf(std::ios::fixed, std::ios::floatfield);
266     oss.precision(16);
267     oss << "eval-size: " << eval_size << std::endl;
268     oss << "unk-eval-size: " << unk_eval_size << std::endl;
269     oss << "charset: " <<  tokenizer.dictionary_info()->charset << std::endl;
270 
271     const std::string header = oss.str();
272 
273     CHECK_DIE(feature_index.save(model.c_str(), header.c_str()))
274         << "permission denied: " << model;
275 
276     return 0;
277   }
278 };
279 
280 class Learner {
281  public:
run(int argc,char ** argv)282   static bool run(int argc, char **argv) {
283     static const MeCab::Option long_options[] = {
284       { "dicdir",   'd',  ".",     "DIR",
285         "set DIR as dicdir(default \".\" )" },
286       { "old-model",   'M',  0,     "FILE",
287         "set FILE as old CRF model file" },
288       { "cost",     'c',  "1.0",   "FLOAT",
289         "set FLOAT for cost C for constraints violatoin" },
290       { "freq",     'f',  "1",     "INT",
291         "set the frequency cut-off (default 1)" },
292       { "eta",      'e',  "0.00005", "DIR",
293         "set FLOAT for tolerance of termination criterion" },
294       { "thread",   'p',  "1",     "INT",    "number of threads(default 1)" },
295       { "version",  'v',  0,   0,  "show the version and exit"  },
296       { "help",     'h',  0,   0,  "show this help and exit."      },
297       { 0, 0, 0, 0 }
298     };
299 
300     Param param;
301 
302     if (!param.open(argc, argv, long_options)) {
303       std::cout << param.what() << "\n\n" <<  COPYRIGHT
304                 << "\ntry '--help' for more information." << std::endl;
305       return -1;
306     }
307 
308     if (!param.help_version()) {
309       return 0;
310     }
311 
312     return CRFLearner::run(&param);
313   }
314 };
315 }
316 }
317 
mecab_cost_train(int argc,char ** argv)318 int mecab_cost_train(int argc, char **argv) {
319   return MeCab::Learner::run(argc, argv);
320 }
321