1 /*
2 * SPDX-FileCopyrightText: 2002~2005 Yuking <yuking_net@sohu.com>
3 * SPDX-FileCopyrightText: 2020~2020 CSSlayer <wengxt@gmail.com>
4 *
5 * SPDX-License-Identifier: LGPL-2.1-or-later
6 *
7 */
8 #include "config.h"
9 #include "libime/core/historybigram.h"
10 #include "libime/core/utils.h"
11 #include "libime/core/utils_p.h"
12 #include "libime/table/tablebaseddictionary.h"
13 #include <boost/iostreams/device/file_descriptor.hpp>
14 #include <boost/iostreams/stream.hpp>
15 #include <fcitx-utils/charutils.h>
16 #include <fcitx-utils/standardpath.h>
17 #include <fcntl.h>
18 #include <istream>
19 #include <sstream>
20
21 #if __GNUC__ <= 8
22 #include <boost/filesystem.hpp>
23 #else
24 #include <filesystem>
25 #endif
26
27 #if defined(__linux__) || defined(__GLIBC__)
28 #include <endian.h>
29 #else
30 #include <sys/endian.h>
31 #endif
32
33 using namespace libime;
34 using namespace fcitx;
35
36 struct MigrationCommonOption {
37 bool skipHistory = false;
38 bool skipDict = false;
39 std::string dictFile;
40 std::string historyFile;
41 bool useXdgPath = true;
42 std::string sourceFile;
43
openSourceFileMigrationCommonOption44 UnixFD openSourceFile() const {
45 // We support two different mode of migration.
46 // For an input method with existing code base.
47 UnixFD sourceFd;
48 // Fcitx 4's xdg is not following the spec, it's using a xdg_data and
49 // xdg_config in mixed way.
50 if (useXdgPath && sourceFile[0] != '/') {
51 StandardPath standardPath(/*skipFcitxPath=*/true);
52 sourceFd = UnixFD::own(
53 standardPath
54 .openUser(StandardPath::Type::Config,
55 stringutils::joinPath("fcitx/table", sourceFile),
56 O_RDONLY)
57 .release());
58 if (!sourceFd.isValid()) {
59 sourceFd = UnixFD::own(
60 standardPath
61 .open(StandardPath::Type::Config,
62 stringutils::joinPath("fcitx/table", sourceFile),
63 O_RDONLY)
64 .release());
65 }
66 } else {
67 sourceFd = UnixFD::own(open(sourceFile.data(), O_RDONLY));
68 }
69 return sourceFd;
70 }
71
openMergeFileMigrationCommonOption72 UnixFD openMergeFile(const std::string &path) const {
73 UnixFD fd;
74 if (useXdgPath && path[0] != '/') {
75 fd = UnixFD::own(StandardPath::global()
76 .openUser(StandardPath::Type::PkgData,
77 stringutils::joinPath("table", path),
78 O_RDONLY)
79 .release());
80 } else {
81 fd = UnixFD::own(open(path.data(), O_RDONLY));
82 }
83 return fd;
84 }
85
pathForSaveMigrationCommonOption86 std::string pathForSave(const std::string &path) const {
87 if (path[0] != '/') {
88 if (useXdgPath) {
89 return stringutils::joinPath("table", path);
90 } else {
91 #if __GNUC__ <= 8
92 return boost::filesystem::absolute(path).string();
93 #else
94 return std::filesystem::absolute(path);
95 #endif
96 }
97 }
98 return path;
99 }
100 };
101
102 struct MigrationWithBaseOption : public MigrationCommonOption {
103 std::string baseFile;
104 bool merge = true;
105 };
106
107 struct MigrationWithoutBaseOption : public MigrationCommonOption {};
108
/// Header information extracted from a fcitx4 table file by loadSource().
struct BasicTableInfo {
    /// The key code characters of the table.
    std::string code;
    /// Characters stored in the header's "invalid char" field.
    std::string ignoreChars;
    /// Code length read from the header.
    uint32_t length{0};
    /// Construction rules rendered as text, one rule per line; empty when the
    /// table has none.
    std::string rule;
    /// Prefix character chosen for pinyin records, or '\0' if none was found.
    char pinyin{'\0'};
    /// Prefix character chosen for prompt records, or '\0' if none was found.
    char prompt{'\0'};
    /// Prefix character chosen for construct-phrase records, or '\0' if none
    /// was found.
    char phrase{'\0'};
};
118
/// Kind of a single record in the source table. The numeric values are the
/// ones compared against the per-record type byte read by loadSource().
enum RecordType {
    RECORDTYPE_NORMAL = 0,
    RECORDTYPE_PINYIN = 1,
    RECORDTYPE_CONSTRUCT = 2,
    RECORDTYPE_PROMPT = 3,
};
125
// Source files whose version byte is below this value are parsed with the
// old record layout.
constexpr int INTERNAL_VERSION{3};
// Upper bound accepted for the code length stored in the table header.
constexpr int MAX_CODE_LENGTH{60};
// File name suffix of fcitx4 table files.
constexpr char mbSuffix[]{".mb"};

// Program name shown by usage(); assigned from argv[0] in main().
std::string_view argv0;
131
usage(const char * extra=nullptr)132 void usage(const char *extra = nullptr) {
133 std::cout
134 << "Usage: " << argv0
135 << " [-o <dict>/-O] [-p <history>/-P] [-b <base>/-B] [-U] [-B] [-X] "
136 "<source>"
137 << std::endl
138 << "<source>: the source file of the dictionary." << std::endl
139 << "-o: output dict file path" << std::endl
140 << "-O: Skip dict file." << std::endl
141 << "-p: history file path" << std::endl
142 << "-P: Skip history file" << std::endl
143 << "-b: base file of a libime main dict" << std::endl
144 << "-B: generate full data without base file" << std::endl
145 << "-X: locate non-abstract path by only path instead of Xdg path"
146 << std::endl
147 << "-U: overwrite instead of merge with existing data." << std::endl
148 << "-h: Show this help" << std::endl;
149 if (extra) {
150 std::cout << extra << std::endl;
151 }
152 }
153
replaceSuffix(const std::string & input,const std::string & suffix,std::string_view newSuffix)154 std::optional<std::string> replaceSuffix(const std::string &input,
155 const std::string &suffix,
156 std::string_view newSuffix) {
157 auto name = fs::baseName(input);
158 if (!stringutils::endsWith(name, suffix)) {
159 return {};
160 }
161 // Strip .mb
162 name.erase(name.size() - suffix.size(), suffix.size());
163 name.append(newSuffix);
164 return name;
165 }
166
guessValidChar(char prefer,std::string_view invalid)167 char guessValidChar(char prefer, std::string_view invalid) {
168 if (invalid.find(prefer) == std::string::npos) {
169 return prefer;
170 }
171 unsigned char c;
172 std::string_view punct = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~";
173 for (c = 0; c <= 127; c++) {
174 if (punct.find(c) != std::string_view::npos || charutils::isdigit(c) ||
175 charutils::islower(c) || charutils::isupper(c)) {
176 if (invalid.find(c) == std::string::npos) {
177 return c;
178 }
179 }
180 }
181 return 0;
182 }
183
loadSource(const UnixFD & sourceFd,std::function<void (const BasicTableInfo & info)> basicInfoCallback,std::function<void (const BasicTableInfo & info,RecordType,const std::string &,const std::string &,uint32_t)> recordCallback)184 void loadSource(
185 const UnixFD &sourceFd,
186 std::function<void(const BasicTableInfo &info)> basicInfoCallback,
187 std::function<void(const BasicTableInfo &info, RecordType,
188 const std::string &, const std::string &, uint32_t)>
189 recordCallback) {
190 BasicTableInfo info;
191 boost::iostreams::stream_buffer<boost::iostreams::file_descriptor_source>
192 buffer(sourceFd.fd(),
193 boost::iostreams::file_descriptor_flags::never_close_handle);
194 std::istream in(&buffer);
195
196 uint32_t codeStrLength;
197 throw_if_io_fail(unmarshallLE(in, codeStrLength));
198 //先读取码表的信息
199 bool isOldVersion = 1;
200 if (!codeStrLength) {
201 uint8_t version;
202 throw_if_io_fail(unmarshall(in, version));
203 isOldVersion = (version < INTERNAL_VERSION);
204 throw_if_io_fail(unmarshallLE(in, codeStrLength));
205 }
206 std::vector<char> codeString;
207 codeString.resize(codeStrLength + 1);
208 throw_if_io_fail(unmarshallVector(in, codeString));
209
210 uint8_t len;
211 throw_if_io_fail(unmarshall(in, len));
212 if (len == 0 || len > MAX_CODE_LENGTH) {
213 throw std::runtime_error("Invalid code length");
214 }
215
216 uint8_t pylen = 0;
217 if (!isOldVersion) {
218 throw_if_io_fail(unmarshall(in, pylen));
219 }
220
221 info.length = len;
222 info.code.assign(codeString.data(), codeStrLength);
223
224 uint32_t invalidCharLength;
225 throw_if_io_fail(unmarshallLE(in, invalidCharLength));
226 std::vector<char> invalidChar;
227 invalidChar.resize(invalidCharLength + 1);
228 throw_if_io_fail(unmarshallVector(in, invalidChar));
229 info.ignoreChars.assign(invalidChar.data(), invalidCharLength);
230
231 uint8_t hasRule;
232 throw_if_io_fail(unmarshall(in, hasRule));
233 std::string rule;
234 if (hasRule) {
235 std::stringstream ss;
236 for (size_t i = 1; i < len; i++) {
237 uint8_t ruleFlag;
238 throw_if_io_fail(unmarshall(in, ruleFlag));
239 ss << (ruleFlag ? 'a' : 'e');
240 uint8_t ruleLength;
241 throw_if_io_fail(unmarshall(in, ruleLength));
242 ss << static_cast<uint32_t>(ruleLength) << '=';
243
244 for (size_t j = 0; j < len; j++) {
245 if (j) {
246 ss << '+';
247 }
248 uint8_t ruleIndex;
249 throw_if_io_fail(unmarshall(in, ruleFlag));
250 ss << (ruleFlag ? 'p' : 'n');
251 throw_if_io_fail(unmarshall(in, ruleIndex));
252 ss << static_cast<uint32_t>(ruleIndex);
253 throw_if_io_fail(unmarshall(in, ruleIndex));
254 ss << static_cast<uint32_t>(ruleIndex);
255 }
256 ss << std::endl;
257 }
258 info.rule = ss.str();
259 }
260 std::string invalid = info.code;
261 info.pinyin = guessValidChar('@', invalid);
262 if (info.pinyin) {
263 invalid.push_back(info.pinyin);
264 }
265 info.prompt = guessValidChar('&', invalid);
266 if (info.prompt) {
267 invalid.push_back(info.prompt);
268 }
269 info.phrase = guessValidChar('^', invalid);
270 if (info.phrase) {
271 invalid.push_back(info.phrase);
272 }
273
274 if (basicInfoCallback) {
275 basicInfoCallback(info);
276 }
277
278 uint32_t nRecords;
279 throw_if_io_fail(unmarshallLE(in, nRecords));
280
281 if (!isOldVersion) {
282 len = pylen;
283 }
284
285 for (size_t i = 0; i < nRecords; i++) {
286 std::vector<char> codeBuffer;
287 codeBuffer.resize(len + 1);
288 throw_if_io_fail(unmarshallVector(in, codeBuffer));
289 uint32_t hzLength = 0;
290 if (codeBuffer.back() != 0) {
291 throw std::runtime_error("Invalid data in source file.");
292 }
293 throw_if_io_fail(unmarshallLE(in, hzLength));
294 std::vector<char> hzBuffer;
295 hzBuffer.resize(hzLength);
296 throw_if_io_fail(unmarshallVector(in, hzBuffer));
297 if (hzLength == 0 || hzBuffer.back() != 0) {
298 throw std::runtime_error("Invalid data in source file.");
299 }
300
301 uint8_t recordType = RECORDTYPE_NORMAL;
302 if (!isOldVersion) {
303 throw_if_io_fail(unmarshall(in, recordType));
304
305 switch (recordType) {
306 case RECORDTYPE_PINYIN:
307 break;
308 case RECORDTYPE_CONSTRUCT:
309 break;
310 case RECORDTYPE_PROMPT:
311 break;
312 default:
313 recordType = RECORDTYPE_NORMAL;
314 break;
315 }
316 }
317
318 uint32_t index;
319 uint32_t freq;
320 throw_if_io_fail(unmarshallLE(in, freq));
321 throw_if_io_fail(unmarshallLE(in, index));
322
323 recordCallback(info, static_cast<RecordType>(recordType),
324 codeBuffer.data(), hzBuffer.data(), freq);
325 }
326 }
327
migrate(MigrationWithBaseOption option)328 int migrate(MigrationWithBaseOption option) {
329 UnixFD baseFd;
330 if (option.baseFile.empty()) {
331 if (!option.useXdgPath) {
332 usage("Base file name is missing. Please use -b to specifiy the "
333 "base file, or -B to use non-base file mode.");
334 return 1;
335 }
336
337 if (auto name =
338 replaceSuffix(option.sourceFile, mbSuffix, ".main.dict")) {
339 baseFd = UnixFD::own(open(
340 stringutils::joinPath(LIBIME_INSTALL_PKGDATADIR, *name).data(),
341 O_RDONLY));
342 if (!baseFd.isValid()) {
343 baseFd = UnixFD::own(
344 StandardPath::global()
345 .open(fcitx::StandardPath::Type::PkgData,
346 stringutils::joinPath("table", *name), O_RDONLY)
347 .release());
348 }
349 } else {
350 usage("Failed to infer the base file name. Please use -b to "
351 "specifiy the base file, or -B to use non-base file mode.");
352 return 1;
353 }
354 } else {
355 baseFd = UnixFD::own(open(option.baseFile.data(), O_RDONLY));
356 }
357 if (!baseFd.isValid()) {
358 usage("Failed to locate base file, please use -b to specifiy the right "
359 "base file, or -B to use non-base file mode.");
360 return 1;
361 }
362
363 if (!option.skipDict) {
364 if (option.dictFile.empty()) {
365 if (!option.useXdgPath) {
366 usage("Output dict file need to be specified.");
367 return 1;
368 }
369
370 if (auto name =
371 replaceSuffix(option.sourceFile, mbSuffix, ".user.dict")) {
372 option.dictFile = *name;
373 } else {
374 usage("Failed to infer the dict file name. Please use -o to "
375 "specifiy the dict file, or -O skip.");
376 return 1;
377 }
378 }
379 }
380
381 if (!option.skipHistory) {
382 if (option.historyFile.empty()) {
383 if (!option.useXdgPath) {
384 usage("History file need to be specified.");
385 return 1;
386 }
387
388 if (auto name =
389 replaceSuffix(option.sourceFile, mbSuffix, ".history")) {
390 option.historyFile = *name;
391 } else {
392 usage("Failed to infer the history file name. Please use -p to "
393 "specifiy the history file, or -P skip.");
394 return 1;
395 }
396 }
397 }
398
399 UnixFD sourceFd = option.openSourceFile();
400 if (!sourceFd.isValid()) {
401 usage("Failed to open the source file.");
402 return 1;
403 }
404
405 TableBasedDictionary tableDict;
406 {
407 boost::iostreams::stream_buffer<
408 boost::iostreams::file_descriptor_source>
409 buffer(baseFd.fd(),
410 boost::iostreams::file_descriptor_flags::never_close_handle);
411 std::istream in(&buffer);
412 tableDict.load(in);
413 }
414 if (option.merge && !option.skipDict) {
415 UnixFD dictFd = option.openMergeFile(option.dictFile);
416 if (dictFd.isValid()) {
417 try {
418 boost::iostreams::stream_buffer<
419 boost::iostreams::file_descriptor_source>
420 buffer(dictFd.fd(),
421 boost::iostreams::file_descriptor_flags::
422 never_close_handle);
423 std::istream in(&buffer);
424 tableDict.loadUser(in);
425 } catch (const std::exception &e) {
426 std::cout << "Failed when loading dict file: " << e.what();
427 return 1;
428 }
429 }
430 }
431
432 HistoryBigram history;
433 if (option.merge && !option.skipHistory) {
434 UnixFD historyFd = option.openMergeFile(option.historyFile);
435 if (historyFd.isValid()) {
436 try {
437 boost::iostreams::stream_buffer<
438 boost::iostreams::file_descriptor_source>
439 buffer(historyFd.fd(),
440 boost::iostreams::file_descriptor_flags::
441 never_close_handle);
442 std::istream in(&buffer);
443 history.load(in);
444 } catch (const std::exception &e) {
445 std::cout << "Failed when loading history file: " << e.what();
446 return 1;
447 }
448 }
449 }
450
451 uint32_t mergedWord = 0;
452 try {
453 loadSource(
454 sourceFd, {},
455 [&option, &tableDict, &history,
456 &mergedWord](const BasicTableInfo &, RecordType type,
457 const std::string &code, const std::string &value,
458 uint32_t freq) {
459 if (type != RECORDTYPE_NORMAL) {
460 return;
461 }
462
463 if (!option.skipDict) {
464 auto flag = tableDict.wordExists(code, value);
465 if (flag != PhraseFlag::User && flag != PhraseFlag::None) {
466 tableDict.insert(code, value, PhraseFlag::User);
467 ++mergedWord;
468 }
469 }
470
471 if (!option.skipHistory) {
472 for (uint32_t i = 0;
473 i < std::min(static_cast<uint32_t>(10u), freq); i++) {
474 history.add({value});
475 }
476 }
477 });
478 } catch (const std::exception &e) {
479 std::cout << "Failed when loading source file: " << e.what();
480 return 1;
481 }
482
483 std::cout << "Found " << mergedWord << " new words." << std::endl;
484 if (!option.skipDict) {
485 if (!StandardPath::global().safeSave(
486 StandardPath::Type::PkgData,
487 option.pathForSave(option.dictFile), [&tableDict](int fd) {
488 boost::iostreams::stream_buffer<
489 boost::iostreams::file_descriptor_sink>
490 buffer(fd, boost::iostreams::file_descriptor_flags::
491 never_close_handle);
492 std::ostream out(&buffer);
493 tableDict.saveUser(out);
494 return true;
495 })) {
496 std::cout << "Failed to save to dictionary file." << std::endl;
497 return 1;
498 }
499 }
500
501 if (!option.skipHistory) {
502 if (!StandardPath::global().safeSave(
503 StandardPath::Type::PkgData,
504 option.pathForSave(option.historyFile), [&history](int fd) {
505 boost::iostreams::stream_buffer<
506 boost::iostreams::file_descriptor_sink>
507 buffer(fd, boost::iostreams::file_descriptor_flags::
508 never_close_handle);
509 std::ostream out(&buffer);
510 history.save(out);
511 return true;
512 })) {
513 std::cout << "Failed to save to history file." << std::endl;
514 return 1;
515 }
516 }
517
518 return 0;
519 }
520
migrate(MigrationWithoutBaseOption option)521 int migrate(MigrationWithoutBaseOption option) {
522
523 if (!option.skipDict) {
524 if (option.dictFile.empty()) {
525 if (!option.useXdgPath) {
526 usage("Output dict file need to be specified.");
527 return 1;
528 }
529
530 if (auto name =
531 replaceSuffix(option.sourceFile, mbSuffix, ".user.dict")) {
532 option.dictFile = *name;
533 } else {
534 usage("Failed to infer the dict file name. Please use -o to "
535 "specifiy the dict file, or -O skip.");
536 return 1;
537 }
538 }
539 }
540
541 if (!option.skipHistory) {
542 if (option.historyFile.empty()) {
543 if (!option.useXdgPath) {
544 usage("History file need to be specified.");
545 return 1;
546 }
547 if (auto name =
548 replaceSuffix(option.sourceFile, mbSuffix, ".main.dict")) {
549 option.historyFile = *name;
550 } else {
551 usage("Failed to infer the history file name. Please use -p to "
552 "specifiy the history file, or -P skip.");
553 return 1;
554 }
555 }
556 }
557
558 UnixFD sourceFd = option.openSourceFile();
559 if (!sourceFd.isValid()) {
560 usage("Failed to open the source file.");
561 return 1;
562 }
563
564 TableBasedDictionary tableDict;
565 HistoryBigram history;
566 std::stringstream ss;
567 try {
568 loadSource(
569 sourceFd,
570 [&ss](const BasicTableInfo &info) {
571 ss << "KeyCode=" << info.code << std::endl;
572 ss << "Length=" << info.length << std::endl;
573
574 if (!info.ignoreChars.empty()) {
575 ss << "InvalidChar=" << info.ignoreChars << std::endl;
576 }
577
578 if (info.pinyin) {
579 ss << "Pinyin=" << info.pinyin << std::endl;
580 }
581
582 if (info.prompt) {
583 ss << "Prompt=" << info.prompt << std::endl;
584 }
585
586 if (info.phrase) {
587 ss << "ConstructPhrase=" << info.phrase << std::endl;
588 }
589
590 if (!info.rule.empty()) {
591 ss << "[Rule]\n" << info.rule << std::endl;
592 }
593 ss << "[Data]" << std::endl;
594 },
595 [&option, &ss, &history](const BasicTableInfo &info,
596 RecordType type, const std::string &code,
597 const std::string &value, uint32_t freq) {
598 if (!option.skipDict) {
599 switch (type) {
600 case RECORDTYPE_NORMAL:
601 ss << code << " " << value << std::endl;
602 break;
603 case RECORDTYPE_CONSTRUCT:
604 if (info.phrase) {
605 ss << info.phrase << code << " " << value
606 << std::endl;
607 }
608 break;
609 case RECORDTYPE_PROMPT:
610 if (info.prompt) {
611 ss << info.prompt << code << " " << value
612 << std::endl;
613 }
614 break;
615 case RECORDTYPE_PINYIN:
616 if (info.pinyin) {
617 ss << info.pinyin << code << " " << value
618 << std::endl;
619 }
620 break;
621 }
622 }
623
624 if (!option.skipHistory) {
625 for (uint32_t i = 0;
626 i < std::min(static_cast<uint32_t>(10u), freq); i++) {
627 history.add({value});
628 }
629 }
630 });
631 } catch (const std::exception &e) {
632 std::cout << "Failed when loading source file: " << e.what();
633 return 1;
634 }
635 if (!option.skipDict) {
636 try {
637 tableDict.load(ss, libime::TableFormat::Text);
638 } catch (const std::exception &e) {
639 std::cout << "Failed when construct new dict: " << e.what();
640 return 1;
641 }
642 if (!StandardPath::global().safeSave(
643 StandardPath::Type::PkgData,
644 option.pathForSave(option.dictFile), [&tableDict](int fd) {
645 boost::iostreams::stream_buffer<
646 boost::iostreams::file_descriptor_sink>
647 buffer(fd, boost::iostreams::file_descriptor_flags::
648 never_close_handle);
649 std::ostream out(&buffer);
650 tableDict.save(out);
651 return true;
652 })) {
653 std::cout << "Failed to save to dictionary file." << std::endl;
654 return 1;
655 }
656 }
657
658 if (!option.skipHistory) {
659 if (!StandardPath::global().safeSave(
660 StandardPath::Type::PkgData,
661 option.pathForSave(option.historyFile), [&history](int fd) {
662 boost::iostreams::stream_buffer<
663 boost::iostreams::file_descriptor_sink>
664 buffer(fd, boost::iostreams::file_descriptor_flags::
665 never_close_handle);
666 std::ostream out(&buffer);
667 history.save(out);
668 return true;
669 })) {
670 std::cout << "Failed to save to history file." << std::endl;
671 return 1;
672 }
673 }
674
675 return 0;
676 }
677
main(int argc,char * argv[])678 int main(int argc, char *argv[]) {
679 argv0 = argv[0];
680 bool noBasefile = false;
681 MigrationWithBaseOption withBaseOption;
682 MigrationWithoutBaseOption withoutBaseOption;
683 int c;
684 while ((c = getopt(argc, argv, "Bb:Oo:Pp:UXh")) != -1) {
685 switch (c) {
686 case 'B':
687 noBasefile = true;
688 break;
689 case 'b':
690 withBaseOption.baseFile = optarg;
691 break;
692 case 'O':
693 withBaseOption.skipDict = withoutBaseOption.skipDict = true;
694 break;
695 case 'o':
696 withBaseOption.dictFile = withoutBaseOption.dictFile = optarg;
697 break;
698 case 'P':
699 withBaseOption.skipHistory = withoutBaseOption.skipHistory = true;
700 break;
701 case 'p':
702 withBaseOption.historyFile = withoutBaseOption.historyFile = optarg;
703 break;
704 case 'U':
705 withBaseOption.merge = false;
706 break;
707 case 'X':
708 withBaseOption.useXdgPath = withoutBaseOption.useXdgPath = false;
709 break;
710 case 'h':
711 usage();
712 return 0;
713 default:
714 usage();
715 return 1;
716 }
717 }
718
719 if (optind + 1 != argc) {
720 usage("Source file is missing.");
721 return 1;
722 }
723
724 withBaseOption.sourceFile = withoutBaseOption.sourceFile = argv[optind];
725 UnixFD baseFd, outputFd;
726
727 // Validate the required arguments.
728 if (noBasefile) {
729 return migrate(withoutBaseOption);
730 }
731 return migrate(withBaseOption);
732 }
733