// MeCab -- Yet Another Part-of-Speech and Morphological Analyzer
//
//
// Copyright(C) 2001-2006 Taku Kudo <taku@chasen.org>
// Copyright(C) 2004-2006 Nippon Telegraph and Telephone Corporation
#include <algorithm>
#include <cmath>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
#include "common.h"
#include "feature_index.h"
#include "freelist.h"
#include "lbfgs.h"
#include "learner_tagger.h"
#include "param.h"
#include "string_buffer.h"
#include "thread.h"
#include "utils.h"

namespace MeCab {
namespace {

#define DCONF(file) create_filename(dicdir, std::string(file)).c_str()

#ifdef MECAB_USE_THREAD
25 class learner_thread: public thread {
26 public:
27 unsigned short start_i;
28 unsigned short thread_num;
29 size_t size;
30 size_t micro_p;
31 size_t micro_r;
32 size_t micro_c;
33 size_t err;
34 double f;
35 EncoderLearnerTagger **x;
36 std::vector<double> expected;
run()37 void run() {
38 micro_p = micro_r = micro_c = err = 0;
39 f = 0.0;
40 std::fill(expected.begin(), expected.end(), 0.0);
41 for (size_t i = start_i; i < size; i += thread_num) {
42 f += x[i]->gradient(&expected[0]);
43 err += x[i]->eval(µ_c, µ_p, µ_r);
44 }
45 }
46 };
#endif

49 class CRFLearner {
50 public:
run(Param * param)51 static int run(Param *param) {
52 const std::string dicdir = param->get<std::string>("dicdir");
53 CHECK_DIE(param->load(DCONF(DICRC)))
54 << "no such file or directory: " << DCONF(DICRC);
55
56 const std::vector<std::string> &files = param->rest_args();
57 if (files.size() != 2) {
58 std::cout << "Usage: " <<
59 param->program_name() << " corpus model" << std::endl;
60 return -1;
61 }
62
63 const std::string ifile = files[0];
64 const std::string model = files[1];
65 const std::string old_model = param->get<std::string>("old-model");
66
67 EncoderFeatureIndex feature_index;
68 std::vector<double> expected;
69 std::vector<double> observed;
70 std::vector<double> alpha;
71 std::vector<double> old_alpha;
72 std::vector<EncoderLearnerTagger *> x;
73 Tokenizer<LearnerNode, LearnerPath> tokenizer;
74 Allocator<LearnerNode, LearnerPath> allocator;
75
76 CHECK_DIE(tokenizer.open(*param)) << "cannot open tokenizer";
77 CHECK_DIE(feature_index.open(*param)) << "cannot open feature index";
78
79 if (!old_model.empty()) {
80 std::cout << "Using previous model: " << old_model << std::endl;
81 std::cout << "--cost --freq and --eta options are overwritten."
82 << std::endl;
83 CHECK_DIE(tokenizer.dictionary_info());
84 const char *dic_charset = tokenizer.dictionary_info()->charset;
85 feature_index.reopen(old_model.c_str(),
86 dic_charset, &old_alpha, param);
87 }
88
89 const double C = param->get<double>("cost");
90 const double eta = param->get<double>("eta");
91 const size_t eval_size = param->get<size_t>("eval-size");
92 const size_t unk_eval_size = param->get<size_t>("unk-eval-size");
93 const size_t thread_num = param->get<size_t>("thread");
94 const size_t freq = param->get<size_t>("freq");
95
96 CHECK_DIE(C > 0) << "cost parameter is out of range: " << C;
97 CHECK_DIE(eta > 0) "eta is out of range: " << eta;
98 CHECK_DIE(eval_size > 0) << "eval-size is out of range: " << eval_size;
99 CHECK_DIE(unk_eval_size > 0) <<
100 "unk-eval-size is out of range: " << unk_eval_size;
101 CHECK_DIE(freq > 0) <<
102 "freq is out of range: " << unk_eval_size;
103 CHECK_DIE(thread_num > 0 && thread_num <= 512)
104 << "# thread is invalid: " << thread_num;
105
106 std::cout.setf(std::ios::fixed, std::ios::floatfield);
107 std::cout.precision(5);
108
109 std::cout << "reading corpus ..." << std::flush;
110
111 std::ifstream ifs(WPATH(ifile.c_str()));
112 CHECK_DIE(ifs) << "no such file or directory: " << ifile;
113
114 while (ifs) {
115 EncoderLearnerTagger *tagger = new EncoderLearnerTagger();
116
117 CHECK_DIE(tagger->open(&tokenizer,
118 &allocator,
119 &feature_index,
120 eval_size,
121 unk_eval_size));
122
123 CHECK_DIE(tagger->read(&ifs, &observed));
124
125 if (!tagger->empty()) {
126 x.push_back(tagger);
127 } else {
128 delete tagger;
129 }
130
131 if (x.size() % 100 == 0) {
132 std::cout << x.size() << "... " << std::flush;
133 }
134 }
135
136 feature_index.shrink(freq, &observed);
137 feature_index.clearcache();
138
139 const size_t psize = feature_index.size();
140 observed.resize(psize);
141 expected.resize(psize);
142 alpha.resize(psize);
143 old_alpha.resize(psize);
144 alpha = old_alpha;
145
146 feature_index.set_alpha(&alpha[0]);
147
148 std::cout << std::endl;
149 std::cout << "Number of sentences: " << x.size() << std::endl;
150 std::cout << "Number of features: " << psize << std::endl;
151 std::cout << "eta: " << eta << std::endl;
152 std::cout << "freq: " << freq << std::endl;
153 std::cout << "eval-size: " << eval_size << std::endl;
154 std::cout << "unk-eval-size: " << unk_eval_size << std::endl;
155 #ifdef MECAB_USE_THREAD
156 std::cout << "threads: " << thread_num << std::endl;
157 #endif
158 std::cout << "charset: " <<
159 tokenizer.dictionary_info()->charset << std::endl;
160 std::cout << "C(sigma^2): " << C << std::endl
161 << std::endl;
162
163 #ifdef MECAB_USE_THREAD
164 std::vector<learner_thread> thread;
165 if (thread_num > 1) {
166 thread.resize(thread_num);
167 for (size_t i = 0; i < thread_num; ++i) {
168 thread[i].start_i = i;
169 thread[i].size = x.size();
170 thread[i].thread_num = thread_num;
171 thread[i].x = &x[0];
172 thread[i].expected.resize(expected.size());
173 }
174 }
175 #endif
176
177 int converge = 0;
178 double prev_obj = 0.0;
179 LBFGS lbfgs;
180
181 for (size_t itr = 0; ; ++itr) {
182 std::fill(expected.begin(), expected.end(), 0.0);
183 double obj = 0.0;
184 size_t err = 0;
185 size_t micro_p = 0;
186 size_t micro_r = 0;
187 size_t micro_c = 0;
188
189 #ifdef MECAB_USE_THREAD
190 if (thread_num > 1) {
191 for (size_t i = 0; i < thread_num; ++i) {
192 thread[i].start();
193 }
194
195 for (size_t i = 0; i < thread_num; ++i) {
196 thread[i].join();
197 }
198
199 for (size_t i = 0; i < thread_num; ++i) {
200 obj += thread[i].f;
201 err += thread[i].err;
202 micro_r += thread[i].micro_r;
203 micro_p += thread[i].micro_p;
204 micro_c += thread[i].micro_c;
205 for (size_t k = 0; k < psize; ++k) {
206 expected[k] += thread[i].expected[k];
207 }
208 }
209 } else
210 #endif
211 {
212 for (size_t i = 0; i < x.size(); ++i) {
213 obj += x[i]->gradient(&expected[0]);
214 err += x[i]->eval(µ_c, µ_p, µ_r);
215 }
216 }
217
218 const double p = 1.0 * micro_c / micro_p;
219 const double r = 1.0 * micro_c / micro_r;
220 const double micro_f = 2 * p * r / (p + r);
221
222 for (size_t i = 0; i < psize; ++i) {
223 const double penalty = (alpha[i] - old_alpha[i]);
224 obj += (penalty * penalty / (2.0 * C));
225 expected[i] = expected[i] - observed[i] + penalty / C;
226 }
227
228 const double diff = (itr == 0 ? 1.0 :
229 std::fabs(1.0 * (prev_obj - obj)) / prev_obj);
230 std::cout << "iter=" << itr
231 << " err=" << 1.0 * err/x.size()
232 << " F=" << micro_f
233 << " target=" << obj
234 << " diff=" << diff << std::endl;
235 prev_obj = obj;
236
237 if (diff < eta) {
238 converge++;
239 } else {
240 converge = 0;
241 }
242
243 if (converge == 3) {
244 break; // 3 is ad-hoc
245 }
246
247 const int ret = lbfgs.optimize(psize,
248 &alpha[0], obj,
249 &expected[0], false, C);
250
251 CHECK_DIE(ret >= 0) << "unexpected error in LBFGS routin";
252
253 if (ret == 0) {
254 break;
255 }
256 }
257
258 std::cout << "\nDone! writing model file ... " << std::endl;
259
260 std::ostringstream oss;
261
262 oss << "eta: " << eta << std::endl;
263 oss << "freq: " << freq << std::endl;
264 oss << "C: " << C << std::endl;
265 oss.setf(std::ios::fixed, std::ios::floatfield);
266 oss.precision(16);
267 oss << "eval-size: " << eval_size << std::endl;
268 oss << "unk-eval-size: " << unk_eval_size << std::endl;
269 oss << "charset: " << tokenizer.dictionary_info()->charset << std::endl;
270
271 const std::string header = oss.str();
272
273 CHECK_DIE(feature_index.save(model.c_str(), header.c_str()))
274 << "permission denied: " << model;
275
276 return 0;
277 }
278 };
280 class Learner {
281 public:
run(int argc,char ** argv)282 static bool run(int argc, char **argv) {
283 static const MeCab::Option long_options[] = {
284 { "dicdir", 'd', ".", "DIR",
285 "set DIR as dicdir(default \".\" )" },
286 { "old-model", 'M', 0, "FILE",
287 "set FILE as old CRF model file" },
288 { "cost", 'c', "1.0", "FLOAT",
289 "set FLOAT for cost C for constraints violatoin" },
290 { "freq", 'f', "1", "INT",
291 "set the frequency cut-off (default 1)" },
292 { "eta", 'e', "0.00005", "DIR",
293 "set FLOAT for tolerance of termination criterion" },
294 { "thread", 'p', "1", "INT", "number of threads(default 1)" },
295 { "version", 'v', 0, 0, "show the version and exit" },
296 { "help", 'h', 0, 0, "show this help and exit." },
297 { 0, 0, 0, 0 }
298 };
299
300 Param param;
301
302 if (!param.open(argc, argv, long_options)) {
303 std::cout << param.what() << "\n\n" << COPYRIGHT
304 << "\ntry '--help' for more information." << std::endl;
305 return -1;
306 }
307
308 if (!param.help_version()) {
309 return 0;
310 }
311
312 return CRFLearner::run(¶m);
313 }
314 };
}  // namespace
}  // namespace MeCab

mecab_cost_train(int argc,char ** argv)318 int mecab_cost_train(int argc, char **argv) {
319 return MeCab::Learner::run(argc, argv);
320 }