1 //  MeCab -- Yet Another Part-of-Speech and Morphological Analyzer
2 //
3 //
4 //  Copyright(C) 2001-2006 Taku Kudo <taku@chasen.org>
5 //  Copyright(C) 2004-2006 Nippon Telegraph and Telephone Corporation
6 #include <fstream>
7 #include <climits>
8 #include "connector.h"
9 #include "context_id.h"
10 #include "char_property.h"
11 #include "common.h"
12 #include "dictionary.h"
13 #include "dictionary_rewriter.h"
14 #include "feature_index.h"
15 #include "iconv_utils.h"
16 #include "mmap.h"
17 #include "param.h"
18 #include "scoped_ptr.h"
19 #include "utils.h"
20 #include "writer.h"
21 
22 namespace MeCab {
23 namespace {
24 
// Constant XOR-ed with the dictionary file size to form the first header
// word: compile() writes (file size ^ DictionaryMagicID) and
// Dictionary::open() rejects the file when the value does not match.
const unsigned int DictionaryMagicID = 0xef718f77u;
26 
// Converts a CSV field to int.
// A null pointer or an empty string yields INT_MAX, which the callers in
// this file test for to detect a field that was left blank; any other
// input is handed to std::atoi() unchanged.
int toInt(const char *str) {
  const bool missing = (str == 0) || (*str == '\0');
  return missing ? INT_MAX : std::atoi(str);
}
33 
calcCost(const std::string & w,const std::string & feature,int factor,DecoderFeatureIndex * fi,DictionaryRewriter * rewriter,CharProperty * property)34 int calcCost(const std::string &w, const std::string &feature,
35              int factor,
36              DecoderFeatureIndex *fi, DictionaryRewriter *rewriter,
37              CharProperty *property) {
38   CHECK_DIE(fi);
39   CHECK_DIE(rewriter);
40   CHECK_DIE(property);
41 
42   LearnerPath path;
43   LearnerNode rnode;
44   LearnerNode lnode;
45   rnode.stat  = lnode.stat = MECAB_NOR_NODE;
46   rnode.rpath = &path;
47   lnode.lpath = &path;
48   path.lnode  = &lnode;
49   path.rnode  = &rnode;
50 
51   size_t mblen = 0;
52   const CharInfo cinfo = property->getCharInfo(w.c_str(),
53                                                w.c_str() + w.size(),
54                                                &mblen);
55   path.rnode->char_type = cinfo.default_type;
56   std::string ufeature, lfeature, rfeature;
57   rewriter->rewrite2(feature, &ufeature, &lfeature, &rfeature);
58   fi->buildUnigramFeature(&path, ufeature.c_str());
59   fi->calcCost(&rnode);
60   return tocost(rnode.wcost, factor);
61 }
62 
progress_bar_darts(size_t current,size_t total)63 int progress_bar_darts(size_t current, size_t total) {
64   return progress_bar("emitting double-array", current, total);
65 }
66 
// Ordering on std::pair<T1, T2> that compares the first element only.
// Used with std::stable_sort() so that entries sharing the same key keep
// their original relative order.
//
// The former std::binary_function base is intentionally gone: it was
// instantiated with its arguments in the wrong order, nothing used its
// typedefs, and the class template no longer exists in C++17.
// operator() is const so the comparator can be called through a const
// object or reference.
template <typename T1, typename T2>
struct pair_1st_cmp {
  bool operator()(const std::pair<T1, T2> &x1,
                  const std::pair<T1, T2> &x2) const {
    return x1.first < x2.first;
  }
};
74 }  // namespace
75 
open(const char * file,const char * mode)76 bool Dictionary::open(const char *file, const char *mode) {
77   close();
78   filename_.assign(file);
79   CHECK_FALSE(dmmap_->open(file, mode))
80       << "no such file or directory: " << file;
81 
82   CHECK_FALSE(dmmap_->size() >= 100)
83       << "dictionary file is broken: " << file;
84 
85   const char *ptr = dmmap_->begin();
86 
87   unsigned int dsize;
88   unsigned int tsize;
89   unsigned int fsize;
90   unsigned int magic;
91   unsigned int dummy;
92 
93   read_static<unsigned int>(&ptr, magic);
94   CHECK_FALSE((magic ^ DictionaryMagicID) == dmmap_->size())
95       << "dictionary file is broken: " << file;
96 
97   read_static<unsigned int>(&ptr, version_);
98   CHECK_FALSE(version_ == DIC_VERSION)
99       << "incompatible version: " << version_;
100 
101   read_static<unsigned int>(&ptr, type_);
102   read_static<unsigned int>(&ptr, lexsize_);
103   read_static<unsigned int>(&ptr, lsize_);
104   read_static<unsigned int>(&ptr, rsize_);
105   read_static<unsigned int>(&ptr, dsize);
106   read_static<unsigned int>(&ptr, tsize);
107   read_static<unsigned int>(&ptr, fsize);
108   read_static<unsigned int>(&ptr, dummy);
109 
110   charset_ = ptr;
111   ptr += 32;
112   da_.set_array(reinterpret_cast<void *>(const_cast<char*>(ptr)));
113 
114   ptr += dsize;
115 
116   token_ = reinterpret_cast<const Token *>(ptr);
117   ptr += tsize;
118 
119   feature_ = ptr;
120   ptr += fsize;
121 
122   CHECK_FALSE(ptr == dmmap_->end())
123       << "dictionary file is broken: " << file;
124 
125   return true;
126 }
127 
// Closes dmmap_, the object Dictionary::open() opens the dictionary file
// through.  open() calls this first, before opening a new file.
void Dictionary::close() {
  dmmap_->close();
}
131 
// Shorthand for create_filename(dicdir, <file>).  A variable named
// `dicdir` must be in scope where the macro is used.
// There is deliberately no trailing semicolon: the call site supplies it,
// so the macro expands to a single expression and can be used anywhere an
// expression is allowed.
#define DCONF(file) create_filename(dicdir, std::string(file))
133 
assignUserDictionaryCosts(const Param & param,const std::vector<std::string> & dics,const char * output)134 bool Dictionary::assignUserDictionaryCosts(
135     const Param &param,
136     const std::vector<std::string> &dics,
137     const char *output) {
138   Connector matrix;
139   DictionaryRewriter rewriter;
140   DecoderFeatureIndex fi;
141   ContextID cid;
142   CharProperty property;
143 
144   const std::string dicdir = param.get<std::string>("dicdir");
145 
146   const std::string matrix_file     = DCONF(MATRIX_DEF_FILE);
147   const std::string matrix_bin_file = DCONF(MATRIX_FILE);
148   const std::string left_id_file    = DCONF(LEFT_ID_FILE);
149   const std::string right_id_file   = DCONF(RIGHT_ID_FILE);
150   const std::string rewrite_file    = DCONF(REWRITE_FILE);
151 
152   const std::string from = param.get<std::string>("dictionary-charset");
153 
154   const int factor = param.get<int>("cost-factor");
155   CHECK_DIE(factor > 0)   << "cost factor needs to be positive value";
156 
157   std::string config_charset = param.get<std::string>("config-charset");
158   if (config_charset.empty()) {
159     config_charset = from;
160   }
161 
162   CHECK_DIE(!from.empty()) << "input dictionary charset is empty";
163 
164   Iconv config_iconv;
165   CHECK_DIE(config_iconv.open(config_charset.c_str(), from.c_str()))
166       << "iconv_open() failed with from=" << config_charset << " to=" << from;
167 
168   rewriter.open(rewrite_file.c_str(), &config_iconv);
169   CHECK_DIE(fi.open(param)) << "cannot open feature index";
170 
171   CHECK_DIE(property.open(param));
172   property.set_charset(from.c_str());
173 
174   if (!matrix.openText(matrix_file.c_str()) &&
175       !matrix.open(matrix_bin_file.c_str())) {
176     matrix.set_left_size(1);
177     matrix.set_right_size(1);
178   }
179 
180   cid.open(left_id_file.c_str(),
181            right_id_file.c_str(), &config_iconv);
182   CHECK_DIE(cid.left_size()  == matrix.left_size() &&
183             cid.right_size() == matrix.right_size())
184       << "Context ID files("
185       << left_id_file
186       << " or "
187       << right_id_file << " may be broken: "
188       << cid.left_size() << " " << matrix.left_size() << " "
189       << cid.right_size() << " " << matrix.right_size();
190 
191   std::ofstream ofs(output);
192   CHECK_DIE(ofs) << "permission denied: " << output;
193 
194   for (size_t i = 0; i < dics.size(); ++i) {
195     std::ifstream ifs(WPATH(dics[i].c_str()));
196     CHECK_DIE(ifs) << "no such file or directory: " << dics[i];
197     std::cout << "reading " << dics[i] << " ... ";
198     scoped_fixed_array<char, BUF_SIZE> line;
199     while (ifs.getline(line.get(), line.size())) {
200       char *col[8];
201       const size_t n = tokenizeCSV(line.get(), col, 5);
202       CHECK_DIE(n == 5) << "format error: " << line.get();
203       std::string w = col[0];
204       const std::string feature = col[4];
205       const int cost = calcCost(w, feature, factor,
206                                 &fi, &rewriter, &property);
207       std::string ufeature, lfeature, rfeature;
208       CHECK_DIE(rewriter.rewrite(feature, &ufeature, &lfeature, &rfeature))
209           << "rewrite failed: " << feature;
210       const int lid = cid.lid(lfeature.c_str());
211       const int rid = cid.rid(rfeature.c_str());
212       CHECK_DIE(lid >= 0 && rid >= 0 && matrix.is_valid(lid, rid))
213           << "invalid ids are found lid=" << lid << " rid=" << rid;
214       escape_csv_element(&w);
215       ofs << w << ',' << lid << ',' << rid << ','
216           << cost << ',' << feature << '\n';
217     }
218   }
219 
220   return true;
221 }
222 
compile(const Param & param,const std::vector<std::string> & dics,const char * output)223 bool Dictionary::compile(const Param &param,
224                          const std::vector<std::string> &dics,
225                          const char *output) {
226   Connector matrix;
227   scoped_ptr<DictionaryRewriter> rewrite;
228   scoped_ptr<POSIDGenerator> posid;
229   scoped_ptr<DecoderFeatureIndex> fi;
230   scoped_ptr<ContextID> cid;
231   scoped_ptr<Writer> writer;
232   scoped_ptr<Lattice> lattice;
233   scoped_ptr<StringBuffer> os;
234   scoped_ptr<CharProperty> property;
235   Node node;
236 
237   const std::string dicdir = param.get<std::string>("dicdir");
238 
239   const std::string matrix_file     = DCONF(MATRIX_DEF_FILE);
240   const std::string matrix_bin_file = DCONF(MATRIX_FILE);
241   const std::string left_id_file    = DCONF(LEFT_ID_FILE);
242   const std::string right_id_file   = DCONF(RIGHT_ID_FILE);
243   const std::string rewrite_file    = DCONF(REWRITE_FILE);
244   const std::string pos_id_file     = DCONF(POS_ID_FILE);
245 
246   std::vector<std::pair<std::string, Token*> > dic;
247 
248   size_t offset  = 0;
249   unsigned int lexsize = 0;
250   std::string fbuf;
251 
252   const std::string from = param.get<std::string>("dictionary-charset");
253   const std::string to = param.get<std::string>("charset");
254   const bool wakati = param.get<bool>("wakati");
255   const int type = param.get<int>("type");
256   const std::string node_format = param.get<std::string>("node-format");
257   const int factor = param.get<int>("cost-factor");
258   CHECK_DIE(factor > 0)   << "cost factor needs to be positive value";
259 
260   // for backward compatibility
261   std::string config_charset = param.get<std::string>("config-charset");
262   if (config_charset.empty()) {
263     config_charset = from;
264   }
265 
266   CHECK_DIE(!from.empty()) << "input dictionary charset is empty";
267   CHECK_DIE(!to.empty())   << "output dictionary charset is empty";
268 
269   Iconv iconv;
270   CHECK_DIE(iconv.open(from.c_str(), to.c_str()))
271       << "iconv_open() failed with from=" << from << " to=" << to;
272 
273   Iconv config_iconv;
274   CHECK_DIE(config_iconv.open(config_charset.c_str(), from.c_str()))
275       << "iconv_open() failed with from=" << config_charset << " to=" << from;
276 
277   if (!node_format.empty()) {
278     writer.reset(new Writer);
279     lattice.reset(createLattice());
280     os.reset(new StringBuffer);
281     memset(&node, 0, sizeof(node));
282   }
283 
284   if (!matrix.openText(matrix_file.c_str()) &&
285       !matrix.open(matrix_bin_file.c_str())) {
286     matrix.set_left_size(1);
287     matrix.set_right_size(1);
288   }
289 
290   posid.reset(new POSIDGenerator);
291   posid->open(pos_id_file.c_str(), &config_iconv);
292 
293   std::istringstream iss(UNK_DEF_DEFAULT);
294 
295   for (size_t i = 0; i < dics.size(); ++i) {
296     std::ifstream ifs(WPATH(dics[i].c_str()));
297     std::istream *is = &ifs;
298     if (!ifs) {
299       if (type == MECAB_UNK_DIC) {
300         std::cerr << dics[i]
301                   << " is not found. minimum setting is used." << std::endl;
302         is = &iss;
303       } else {
304         CHECK_DIE(ifs) << "no such file or directory: " << dics[i];
305       }
306     }
307 
308     std::cout << "reading " << dics[i] << " ... ";
309 
310     scoped_fixed_array<char, BUF_SIZE> line;
311     size_t num = 0;
312 
313     while (is->getline(line.get(), line.size())) {
314       char *col[8];
315       const size_t n = tokenizeCSV(line.get(), col, 5);
316       CHECK_DIE(n == 5) << "format error: " << line.get();
317 
318       std::string w = col[0];
319       int lid = toInt(col[1]);
320       int rid = toInt(col[2]);
321       int cost = toInt(col[3]);
322       std::string feature = col[4];
323       const int pid = posid->id(feature.c_str());
324 
325       if (cost == INT_MAX) {
326         CHECK_DIE(type == MECAB_USR_DIC)
327             << "cost field should not be empty in sys/unk dic.";
328         if (!rewrite.get()) {
329           rewrite.reset(new DictionaryRewriter);
330           rewrite->open(rewrite_file.c_str(), &config_iconv);
331           fi.reset(new DecoderFeatureIndex);
332           CHECK_DIE(fi->open(param)) << "cannot open feature index";
333           property.reset(new CharProperty);
334           CHECK_DIE(property->open(param));
335           property->set_charset(from.c_str());
336         }
337         cost = calcCost(w, feature, factor,
338                         fi.get(), rewrite.get(), property.get());
339       }
340 
341       if (lid < 0  || rid < 0 || lid == INT_MAX || rid == INT_MAX) {
342         if (!rewrite.get()) {
343           rewrite.reset(new DictionaryRewriter);
344           rewrite->open(rewrite_file.c_str(), &config_iconv);
345         }
346 
347         std::string ufeature, lfeature, rfeature;
348         CHECK_DIE(rewrite->rewrite(feature, &ufeature, &lfeature, &rfeature))
349             << "rewrite failed: " << feature;
350 
351         if (!cid.get()) {
352           cid.reset(new ContextID);
353           cid->open(left_id_file.c_str(),
354                     right_id_file.c_str(), &config_iconv);
355           CHECK_DIE(cid->left_size()  == matrix.left_size() &&
356                     cid->right_size() == matrix.right_size())
357               << "Context ID files("
358               << left_id_file
359               << " or "
360               << right_id_file << " may be broken";
361         }
362 
363         lid = cid->lid(lfeature.c_str());
364         rid = cid->rid(rfeature.c_str());
365       }
366 
367       CHECK_DIE(lid >= 0 && rid >= 0 && matrix.is_valid(lid, rid))
368           << "invalid ids are found lid=" << lid << " rid=" << rid;
369 
370       if (w.empty()) {
371         std::cerr << "empty word is found, discard this line" << std::endl;
372         continue;
373       }
374 
375       if (!iconv.convert(&feature)) {
376         std::cerr << "iconv conversion failed. skip this entry"
377                   << std::endl;
378         continue;
379       }
380 
381       if (type != MECAB_UNK_DIC && !iconv.convert(&w)) {
382         std::cerr << "iconv conversion failed. skip this entry"
383                   << std::endl;
384         continue;
385       }
386 
387       if (!node_format.empty()) {
388         node.surface = w.c_str();
389         node.feature = feature.c_str();
390         node.length  = w.size();
391         node.rlength = w.size();
392         node.posid   = pid;
393         node.stat    = MECAB_NOR_NODE;
394         lattice->set_sentence(w.c_str());
395         CHECK_DIE(os.get());
396         CHECK_DIE(writer.get());
397         os->clear();
398         CHECK_DIE(writer->writeNode(lattice.get(),
399                                     node_format.c_str(),
400                                     &node, &*os)) <<
401             "conversion error: " << feature << " with " << node_format;
402         *os << '\0';
403         feature = os->str();
404       }
405 
406       std::string key;
407       if (!wakati) {
408         key = feature + '\0';
409       }
410 
411       Token* token  = new Token;
412       token->lcAttr = lid;
413       token->rcAttr = rid;
414       token->posid  = pid;
415       token->wcost = cost;
416       token->feature = offset;
417       token->compound = 0;
418       dic.push_back(std::pair<std::string, Token*>(w, token));
419 
420       // append to output buffer
421       if (!wakati) {
422         fbuf.append(key.data(), key.size());
423       }
424       offset += key.size();
425 
426       ++num;
427       ++lexsize;
428     }
429 
430     std::cout << num << std::endl;
431   }
432 
433   if (wakati) {
434     fbuf.append("\0", 1);
435   }
436 
437   std::stable_sort(dic.begin(), dic.end(),
438                    pair_1st_cmp<std::string, Token *>());
439 
440   size_t bsize = 0;
441   size_t idx = 0;
442   std::string prev;
443   std::vector<const char *> str;
444   std::vector<size_t> len;
445   std::vector<Darts::DoubleArray::result_type> val;
446 
447   for (size_t i = 0; i < dic.size(); ++i) {
448     if (i != 0 && prev != dic[i].first) {
449       str.push_back(dic[idx].first.c_str());
450       len.push_back(dic[idx].first.size());
451       val.push_back(bsize +(idx << 8));
452       bsize = 1;
453       idx = i;
454     } else {
455       ++bsize;
456     }
457     prev = dic[i].first;
458   }
459   str.push_back(dic[idx].first.c_str());
460   len.push_back(dic[idx].first.size());
461   val.push_back(bsize +(idx << 8));
462 
463   CHECK_DIE(str.size() == len.size());
464   CHECK_DIE(str.size() == val.size());
465 
466   Darts::DoubleArray da;
467   CHECK_DIE(da.build(str.size(), const_cast<char **>(&str[0]),
468                      &len[0], &val[0], &progress_bar_darts) == 0)
469       << "unkown error in building double-array";
470 
471   std::string tbuf;
472   for (size_t i = 0; i < dic.size(); ++i) {
473     tbuf.append(reinterpret_cast<const char*>(dic[i].second),
474                 sizeof(Token));
475     delete dic[i].second;
476   }
477   dic.clear();
478 
479   // needs to be 8byte(64bit) aligned
480   while (tbuf.size() % 8 != 0) {
481     Token dummy;
482     memset(&dummy, 0, sizeof(Token));
483     tbuf.append(reinterpret_cast<const char*>(&dummy), sizeof(Token));
484   }
485 
486   unsigned int dummy = 0;
487   unsigned int lsize = matrix.left_size();
488   unsigned int rsize = matrix.right_size();
489   unsigned int dsize = da.unit_size() * da.size();
490   unsigned int tsize = tbuf.size();
491   unsigned int fsize = fbuf.size();
492 
493   unsigned int version = DIC_VERSION;
494   char charset[32];
495   std::fill(charset, charset + sizeof(charset), '\0');
496   std::strncpy(charset, to.c_str(), 31);
497 
498   std::ofstream bofs(WPATH(output), std::ios::binary|std::ios::out);
499   CHECK_DIE(bofs) << "permission denied: " << output;
500 
501   unsigned int magic = 0;
502 
503   // needs to be 64bit aligned
504   // 10*32 = 64*5
505   bofs.write(reinterpret_cast<const char *>(&magic),   sizeof(unsigned int));
506   bofs.write(reinterpret_cast<const char *>(&version), sizeof(unsigned int));
507   bofs.write(reinterpret_cast<const char *>(&type),    sizeof(unsigned int));
508   bofs.write(reinterpret_cast<const char *>(&lexsize), sizeof(unsigned int));
509   bofs.write(reinterpret_cast<const char *>(&lsize),   sizeof(unsigned int));
510   bofs.write(reinterpret_cast<const char *>(&rsize),   sizeof(unsigned int));
511   bofs.write(reinterpret_cast<const char *>(&dsize),   sizeof(unsigned int));
512   bofs.write(reinterpret_cast<const char *>(&tsize),   sizeof(unsigned int));
513   bofs.write(reinterpret_cast<const char *>(&fsize),   sizeof(unsigned int));
514   bofs.write(reinterpret_cast<const char *>(&dummy),   sizeof(unsigned int));
515 
516   // 32 * 8 = 64 * 4
517   bofs.write(reinterpret_cast<const char *>(charset),  sizeof(charset));
518 
519   bofs.write(reinterpret_cast<const char*>(da.array()),
520              da.unit_size() * da.size());
521   bofs.write(const_cast<const char *>(tbuf.data()), tbuf.size());
522   bofs.write(const_cast<const char *>(fbuf.data()), fbuf.size());
523 
524   // save magic id
525   magic = static_cast<unsigned int>(bofs.tellp());
526   magic ^= DictionaryMagicID;
527   bofs.seekp(0);
528   bofs.write(reinterpret_cast<const char *>(&magic), sizeof(unsigned int));
529 
530   bofs.close();
531 
532   return true;
533 }
534 }
535