1 // MeCab -- Yet Another Part-of-Speech and Morphological Analyzer
2 //
3 //
4 // Copyright(C) 2001-2006 Taku Kudo <taku@chasen.org>
5 // Copyright(C) 2004-2006 Nippon Telegraph and Telephone Corporation
6 #include <fstream>
7 #include <climits>
8 #include "connector.h"
9 #include "context_id.h"
10 #include "char_property.h"
11 #include "common.h"
12 #include "dictionary.h"
13 #include "dictionary_rewriter.h"
14 #include "feature_index.h"
15 #include "iconv_utils.h"
16 #include "mmap.h"
17 #include "param.h"
18 #include "scoped_ptr.h"
19 #include "utils.h"
20 #include "writer.h"
21
22 namespace MeCab {
23 namespace {
24
// Constant mixed into the first header word of a compiled dictionary:
// compile() stores (file size XOR DictionaryMagicID) there, and
// Dictionary::open() XORs the stored word with this constant and compares
// the result with the size of the mapped file.
const unsigned int DictionaryMagicID = 0xef718f77u;
26
// Converts a CSV field to an int.
//
// Returns INT_MAX as the "field is absent" marker when |str| is NULL or an
// empty string; otherwise returns the result of std::atoi() on |str|.
int toInt(const char *str) {
  const bool has_text = (str != 0) && (*str != '\0');
  return has_text ? std::atoi(str) : INT_MAX;
}
33
calcCost(const std::string & w,const std::string & feature,int factor,DecoderFeatureIndex * fi,DictionaryRewriter * rewriter,CharProperty * property)34 int calcCost(const std::string &w, const std::string &feature,
35 int factor,
36 DecoderFeatureIndex *fi, DictionaryRewriter *rewriter,
37 CharProperty *property) {
38 CHECK_DIE(fi);
39 CHECK_DIE(rewriter);
40 CHECK_DIE(property);
41
42 LearnerPath path;
43 LearnerNode rnode;
44 LearnerNode lnode;
45 rnode.stat = lnode.stat = MECAB_NOR_NODE;
46 rnode.rpath = &path;
47 lnode.lpath = &path;
48 path.lnode = &lnode;
49 path.rnode = &rnode;
50
51 size_t mblen = 0;
52 const CharInfo cinfo = property->getCharInfo(w.c_str(),
53 w.c_str() + w.size(),
54 &mblen);
55 path.rnode->char_type = cinfo.default_type;
56 std::string ufeature, lfeature, rfeature;
57 rewriter->rewrite2(feature, &ufeature, &lfeature, &rfeature);
58 fi->buildUnigramFeature(&path, ufeature.c_str());
59 fi->calcCost(&rnode);
60 return tocost(rnode.wcost, factor);
61 }
62
progress_bar_darts(size_t current,size_t total)63 int progress_bar_darts(size_t current, size_t total) {
64 return progress_bar("emitting double-array", current, total);
65 }
66
// Strict-weak-ordering functor that compares two std::pair<T1, T2> values
// by their |first| member only; |second| is ignored.
//
// The functor typedefs are declared directly instead of inheriting from
// std::binary_function, which is removed in C++17 and was previously
// instantiated with its template arguments in the wrong order. The call
// operator is const so that the comparator can be invoked through const
// objects and references.
template <typename T1, typename T2>
struct pair_1st_cmp {
  typedef std::pair<T1, T2> first_argument_type;
  typedef std::pair<T1, T2> second_argument_type;
  typedef bool result_type;

  // Returns true when x1.first orders before x2.first.
  bool operator()(const std::pair<T1, T2> &x1,
                  const std::pair<T1, T2> &x2) const {
    return x1.first < x2.first;
  }
};
74 } // namespace
75
open(const char * file,const char * mode)76 bool Dictionary::open(const char *file, const char *mode) {
77 close();
78 filename_.assign(file);
79 CHECK_FALSE(dmmap_->open(file, mode))
80 << "no such file or directory: " << file;
81
82 CHECK_FALSE(dmmap_->size() >= 100)
83 << "dictionary file is broken: " << file;
84
85 const char *ptr = dmmap_->begin();
86
87 unsigned int dsize;
88 unsigned int tsize;
89 unsigned int fsize;
90 unsigned int magic;
91 unsigned int dummy;
92
93 read_static<unsigned int>(&ptr, magic);
94 CHECK_FALSE((magic ^ DictionaryMagicID) == dmmap_->size())
95 << "dictionary file is broken: " << file;
96
97 read_static<unsigned int>(&ptr, version_);
98 CHECK_FALSE(version_ == DIC_VERSION)
99 << "incompatible version: " << version_;
100
101 read_static<unsigned int>(&ptr, type_);
102 read_static<unsigned int>(&ptr, lexsize_);
103 read_static<unsigned int>(&ptr, lsize_);
104 read_static<unsigned int>(&ptr, rsize_);
105 read_static<unsigned int>(&ptr, dsize);
106 read_static<unsigned int>(&ptr, tsize);
107 read_static<unsigned int>(&ptr, fsize);
108 read_static<unsigned int>(&ptr, dummy);
109
110 charset_ = ptr;
111 ptr += 32;
112 da_.set_array(reinterpret_cast<void *>(const_cast<char*>(ptr)));
113
114 ptr += dsize;
115
116 token_ = reinterpret_cast<const Token *>(ptr);
117 ptr += tsize;
118
119 feature_ = ptr;
120 ptr += fsize;
121
122 CHECK_FALSE(ptr == dmmap_->end())
123 << "dictionary file is broken: " << file;
124
125 return true;
126 }
127
// Closes the dictionary mapping held in |dmmap_|.
// Dictionary::open() calls this first, before mapping a new file.
void Dictionary::close() {
  dmmap_->close();
}
131
// Builds a file name from |file| and the `dicdir` variable of the enclosing
// scope via create_filename(). A local named `dicdir` must therefore be
// visible wherever the macro is used.
// The expansion has no trailing ';' so that it behaves like an ordinary
// expression and the caller supplies the statement terminator.
#define DCONF(file) create_filename(dicdir, std::string(file))
133
assignUserDictionaryCosts(const Param & param,const std::vector<std::string> & dics,const char * output)134 bool Dictionary::assignUserDictionaryCosts(
135 const Param ¶m,
136 const std::vector<std::string> &dics,
137 const char *output) {
138 Connector matrix;
139 DictionaryRewriter rewriter;
140 DecoderFeatureIndex fi;
141 ContextID cid;
142 CharProperty property;
143
144 const std::string dicdir = param.get<std::string>("dicdir");
145
146 const std::string matrix_file = DCONF(MATRIX_DEF_FILE);
147 const std::string matrix_bin_file = DCONF(MATRIX_FILE);
148 const std::string left_id_file = DCONF(LEFT_ID_FILE);
149 const std::string right_id_file = DCONF(RIGHT_ID_FILE);
150 const std::string rewrite_file = DCONF(REWRITE_FILE);
151
152 const std::string from = param.get<std::string>("dictionary-charset");
153
154 const int factor = param.get<int>("cost-factor");
155 CHECK_DIE(factor > 0) << "cost factor needs to be positive value";
156
157 std::string config_charset = param.get<std::string>("config-charset");
158 if (config_charset.empty()) {
159 config_charset = from;
160 }
161
162 CHECK_DIE(!from.empty()) << "input dictionary charset is empty";
163
164 Iconv config_iconv;
165 CHECK_DIE(config_iconv.open(config_charset.c_str(), from.c_str()))
166 << "iconv_open() failed with from=" << config_charset << " to=" << from;
167
168 rewriter.open(rewrite_file.c_str(), &config_iconv);
169 CHECK_DIE(fi.open(param)) << "cannot open feature index";
170
171 CHECK_DIE(property.open(param));
172 property.set_charset(from.c_str());
173
174 if (!matrix.openText(matrix_file.c_str()) &&
175 !matrix.open(matrix_bin_file.c_str())) {
176 matrix.set_left_size(1);
177 matrix.set_right_size(1);
178 }
179
180 cid.open(left_id_file.c_str(),
181 right_id_file.c_str(), &config_iconv);
182 CHECK_DIE(cid.left_size() == matrix.left_size() &&
183 cid.right_size() == matrix.right_size())
184 << "Context ID files("
185 << left_id_file
186 << " or "
187 << right_id_file << " may be broken: "
188 << cid.left_size() << " " << matrix.left_size() << " "
189 << cid.right_size() << " " << matrix.right_size();
190
191 std::ofstream ofs(output);
192 CHECK_DIE(ofs) << "permission denied: " << output;
193
194 for (size_t i = 0; i < dics.size(); ++i) {
195 std::ifstream ifs(WPATH(dics[i].c_str()));
196 CHECK_DIE(ifs) << "no such file or directory: " << dics[i];
197 std::cout << "reading " << dics[i] << " ... ";
198 scoped_fixed_array<char, BUF_SIZE> line;
199 while (ifs.getline(line.get(), line.size())) {
200 char *col[8];
201 const size_t n = tokenizeCSV(line.get(), col, 5);
202 CHECK_DIE(n == 5) << "format error: " << line.get();
203 std::string w = col[0];
204 const std::string feature = col[4];
205 const int cost = calcCost(w, feature, factor,
206 &fi, &rewriter, &property);
207 std::string ufeature, lfeature, rfeature;
208 CHECK_DIE(rewriter.rewrite(feature, &ufeature, &lfeature, &rfeature))
209 << "rewrite failed: " << feature;
210 const int lid = cid.lid(lfeature.c_str());
211 const int rid = cid.rid(rfeature.c_str());
212 CHECK_DIE(lid >= 0 && rid >= 0 && matrix.is_valid(lid, rid))
213 << "invalid ids are found lid=" << lid << " rid=" << rid;
214 escape_csv_element(&w);
215 ofs << w << ',' << lid << ',' << rid << ','
216 << cost << ',' << feature << '\n';
217 }
218 }
219
220 return true;
221 }
222
compile(const Param & param,const std::vector<std::string> & dics,const char * output)223 bool Dictionary::compile(const Param ¶m,
224 const std::vector<std::string> &dics,
225 const char *output) {
226 Connector matrix;
227 scoped_ptr<DictionaryRewriter> rewrite;
228 scoped_ptr<POSIDGenerator> posid;
229 scoped_ptr<DecoderFeatureIndex> fi;
230 scoped_ptr<ContextID> cid;
231 scoped_ptr<Writer> writer;
232 scoped_ptr<Lattice> lattice;
233 scoped_ptr<StringBuffer> os;
234 scoped_ptr<CharProperty> property;
235 Node node;
236
237 const std::string dicdir = param.get<std::string>("dicdir");
238
239 const std::string matrix_file = DCONF(MATRIX_DEF_FILE);
240 const std::string matrix_bin_file = DCONF(MATRIX_FILE);
241 const std::string left_id_file = DCONF(LEFT_ID_FILE);
242 const std::string right_id_file = DCONF(RIGHT_ID_FILE);
243 const std::string rewrite_file = DCONF(REWRITE_FILE);
244 const std::string pos_id_file = DCONF(POS_ID_FILE);
245
246 std::vector<std::pair<std::string, Token*> > dic;
247
248 size_t offset = 0;
249 unsigned int lexsize = 0;
250 std::string fbuf;
251
252 const std::string from = param.get<std::string>("dictionary-charset");
253 const std::string to = param.get<std::string>("charset");
254 const bool wakati = param.get<bool>("wakati");
255 const int type = param.get<int>("type");
256 const std::string node_format = param.get<std::string>("node-format");
257 const int factor = param.get<int>("cost-factor");
258 CHECK_DIE(factor > 0) << "cost factor needs to be positive value";
259
260 // for backward compatibility
261 std::string config_charset = param.get<std::string>("config-charset");
262 if (config_charset.empty()) {
263 config_charset = from;
264 }
265
266 CHECK_DIE(!from.empty()) << "input dictionary charset is empty";
267 CHECK_DIE(!to.empty()) << "output dictionary charset is empty";
268
269 Iconv iconv;
270 CHECK_DIE(iconv.open(from.c_str(), to.c_str()))
271 << "iconv_open() failed with from=" << from << " to=" << to;
272
273 Iconv config_iconv;
274 CHECK_DIE(config_iconv.open(config_charset.c_str(), from.c_str()))
275 << "iconv_open() failed with from=" << config_charset << " to=" << from;
276
277 if (!node_format.empty()) {
278 writer.reset(new Writer);
279 lattice.reset(createLattice());
280 os.reset(new StringBuffer);
281 memset(&node, 0, sizeof(node));
282 }
283
284 if (!matrix.openText(matrix_file.c_str()) &&
285 !matrix.open(matrix_bin_file.c_str())) {
286 matrix.set_left_size(1);
287 matrix.set_right_size(1);
288 }
289
290 posid.reset(new POSIDGenerator);
291 posid->open(pos_id_file.c_str(), &config_iconv);
292
293 std::istringstream iss(UNK_DEF_DEFAULT);
294
295 for (size_t i = 0; i < dics.size(); ++i) {
296 std::ifstream ifs(WPATH(dics[i].c_str()));
297 std::istream *is = &ifs;
298 if (!ifs) {
299 if (type == MECAB_UNK_DIC) {
300 std::cerr << dics[i]
301 << " is not found. minimum setting is used." << std::endl;
302 is = &iss;
303 } else {
304 CHECK_DIE(ifs) << "no such file or directory: " << dics[i];
305 }
306 }
307
308 std::cout << "reading " << dics[i] << " ... ";
309
310 scoped_fixed_array<char, BUF_SIZE> line;
311 size_t num = 0;
312
313 while (is->getline(line.get(), line.size())) {
314 char *col[8];
315 const size_t n = tokenizeCSV(line.get(), col, 5);
316 CHECK_DIE(n == 5) << "format error: " << line.get();
317
318 std::string w = col[0];
319 int lid = toInt(col[1]);
320 int rid = toInt(col[2]);
321 int cost = toInt(col[3]);
322 std::string feature = col[4];
323 const int pid = posid->id(feature.c_str());
324
325 if (cost == INT_MAX) {
326 CHECK_DIE(type == MECAB_USR_DIC)
327 << "cost field should not be empty in sys/unk dic.";
328 if (!rewrite.get()) {
329 rewrite.reset(new DictionaryRewriter);
330 rewrite->open(rewrite_file.c_str(), &config_iconv);
331 fi.reset(new DecoderFeatureIndex);
332 CHECK_DIE(fi->open(param)) << "cannot open feature index";
333 property.reset(new CharProperty);
334 CHECK_DIE(property->open(param));
335 property->set_charset(from.c_str());
336 }
337 cost = calcCost(w, feature, factor,
338 fi.get(), rewrite.get(), property.get());
339 }
340
341 if (lid < 0 || rid < 0 || lid == INT_MAX || rid == INT_MAX) {
342 if (!rewrite.get()) {
343 rewrite.reset(new DictionaryRewriter);
344 rewrite->open(rewrite_file.c_str(), &config_iconv);
345 }
346
347 std::string ufeature, lfeature, rfeature;
348 CHECK_DIE(rewrite->rewrite(feature, &ufeature, &lfeature, &rfeature))
349 << "rewrite failed: " << feature;
350
351 if (!cid.get()) {
352 cid.reset(new ContextID);
353 cid->open(left_id_file.c_str(),
354 right_id_file.c_str(), &config_iconv);
355 CHECK_DIE(cid->left_size() == matrix.left_size() &&
356 cid->right_size() == matrix.right_size())
357 << "Context ID files("
358 << left_id_file
359 << " or "
360 << right_id_file << " may be broken";
361 }
362
363 lid = cid->lid(lfeature.c_str());
364 rid = cid->rid(rfeature.c_str());
365 }
366
367 CHECK_DIE(lid >= 0 && rid >= 0 && matrix.is_valid(lid, rid))
368 << "invalid ids are found lid=" << lid << " rid=" << rid;
369
370 if (w.empty()) {
371 std::cerr << "empty word is found, discard this line" << std::endl;
372 continue;
373 }
374
375 if (!iconv.convert(&feature)) {
376 std::cerr << "iconv conversion failed. skip this entry"
377 << std::endl;
378 continue;
379 }
380
381 if (type != MECAB_UNK_DIC && !iconv.convert(&w)) {
382 std::cerr << "iconv conversion failed. skip this entry"
383 << std::endl;
384 continue;
385 }
386
387 if (!node_format.empty()) {
388 node.surface = w.c_str();
389 node.feature = feature.c_str();
390 node.length = w.size();
391 node.rlength = w.size();
392 node.posid = pid;
393 node.stat = MECAB_NOR_NODE;
394 lattice->set_sentence(w.c_str());
395 CHECK_DIE(os.get());
396 CHECK_DIE(writer.get());
397 os->clear();
398 CHECK_DIE(writer->writeNode(lattice.get(),
399 node_format.c_str(),
400 &node, &*os)) <<
401 "conversion error: " << feature << " with " << node_format;
402 *os << '\0';
403 feature = os->str();
404 }
405
406 std::string key;
407 if (!wakati) {
408 key = feature + '\0';
409 }
410
411 Token* token = new Token;
412 token->lcAttr = lid;
413 token->rcAttr = rid;
414 token->posid = pid;
415 token->wcost = cost;
416 token->feature = offset;
417 token->compound = 0;
418 dic.push_back(std::pair<std::string, Token*>(w, token));
419
420 // append to output buffer
421 if (!wakati) {
422 fbuf.append(key.data(), key.size());
423 }
424 offset += key.size();
425
426 ++num;
427 ++lexsize;
428 }
429
430 std::cout << num << std::endl;
431 }
432
433 if (wakati) {
434 fbuf.append("\0", 1);
435 }
436
437 std::stable_sort(dic.begin(), dic.end(),
438 pair_1st_cmp<std::string, Token *>());
439
440 size_t bsize = 0;
441 size_t idx = 0;
442 std::string prev;
443 std::vector<const char *> str;
444 std::vector<size_t> len;
445 std::vector<Darts::DoubleArray::result_type> val;
446
447 for (size_t i = 0; i < dic.size(); ++i) {
448 if (i != 0 && prev != dic[i].first) {
449 str.push_back(dic[idx].first.c_str());
450 len.push_back(dic[idx].first.size());
451 val.push_back(bsize +(idx << 8));
452 bsize = 1;
453 idx = i;
454 } else {
455 ++bsize;
456 }
457 prev = dic[i].first;
458 }
459 str.push_back(dic[idx].first.c_str());
460 len.push_back(dic[idx].first.size());
461 val.push_back(bsize +(idx << 8));
462
463 CHECK_DIE(str.size() == len.size());
464 CHECK_DIE(str.size() == val.size());
465
466 Darts::DoubleArray da;
467 CHECK_DIE(da.build(str.size(), const_cast<char **>(&str[0]),
468 &len[0], &val[0], &progress_bar_darts) == 0)
469 << "unkown error in building double-array";
470
471 std::string tbuf;
472 for (size_t i = 0; i < dic.size(); ++i) {
473 tbuf.append(reinterpret_cast<const char*>(dic[i].second),
474 sizeof(Token));
475 delete dic[i].second;
476 }
477 dic.clear();
478
479 // needs to be 8byte(64bit) aligned
480 while (tbuf.size() % 8 != 0) {
481 Token dummy;
482 memset(&dummy, 0, sizeof(Token));
483 tbuf.append(reinterpret_cast<const char*>(&dummy), sizeof(Token));
484 }
485
486 unsigned int dummy = 0;
487 unsigned int lsize = matrix.left_size();
488 unsigned int rsize = matrix.right_size();
489 unsigned int dsize = da.unit_size() * da.size();
490 unsigned int tsize = tbuf.size();
491 unsigned int fsize = fbuf.size();
492
493 unsigned int version = DIC_VERSION;
494 char charset[32];
495 std::fill(charset, charset + sizeof(charset), '\0');
496 std::strncpy(charset, to.c_str(), 31);
497
498 std::ofstream bofs(WPATH(output), std::ios::binary|std::ios::out);
499 CHECK_DIE(bofs) << "permission denied: " << output;
500
501 unsigned int magic = 0;
502
503 // needs to be 64bit aligned
504 // 10*32 = 64*5
505 bofs.write(reinterpret_cast<const char *>(&magic), sizeof(unsigned int));
506 bofs.write(reinterpret_cast<const char *>(&version), sizeof(unsigned int));
507 bofs.write(reinterpret_cast<const char *>(&type), sizeof(unsigned int));
508 bofs.write(reinterpret_cast<const char *>(&lexsize), sizeof(unsigned int));
509 bofs.write(reinterpret_cast<const char *>(&lsize), sizeof(unsigned int));
510 bofs.write(reinterpret_cast<const char *>(&rsize), sizeof(unsigned int));
511 bofs.write(reinterpret_cast<const char *>(&dsize), sizeof(unsigned int));
512 bofs.write(reinterpret_cast<const char *>(&tsize), sizeof(unsigned int));
513 bofs.write(reinterpret_cast<const char *>(&fsize), sizeof(unsigned int));
514 bofs.write(reinterpret_cast<const char *>(&dummy), sizeof(unsigned int));
515
516 // 32 * 8 = 64 * 4
517 bofs.write(reinterpret_cast<const char *>(charset), sizeof(charset));
518
519 bofs.write(reinterpret_cast<const char*>(da.array()),
520 da.unit_size() * da.size());
521 bofs.write(const_cast<const char *>(tbuf.data()), tbuf.size());
522 bofs.write(const_cast<const char *>(fbuf.data()), fbuf.size());
523
524 // save magic id
525 magic = static_cast<unsigned int>(bofs.tellp());
526 magic ^= DictionaryMagicID;
527 bofs.seekp(0);
528 bofs.write(reinterpret_cast<const char *>(&magic), sizeof(unsigned int));
529
530 bofs.close();
531
532 return true;
533 }
534 }
535