1 // Copyright 2010-2018, Google Inc.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
6 // met:
7 //
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following disclaimer
12 // in the documentation and/or other materials provided with the
13 // distribution.
14 // * Neither the name of Google Inc. nor the names of its
15 // contributors may be used to endorse or promote products derived from
16 // this software without specific prior written permission.
17 //
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
// Generates the system dictionary binary data file.
//
// gen_system_dictionary_data_main
//  --input="dictionary0.txt dictionary1.txt"
//  --user_pos_manager_data="user_pos_manager.data"
//  --output="system.dictionary"
36
37 #include <memory>
38 #include <string>
39 #include <vector>
40
41 #include "base/file_stream.h"
42 #include "base/flags.h"
43 #include "base/init_mozc.h"
44 #include "base/logging.h"
45 #include "base/util.h"
46 #include "data_manager/data_manager.h"
47 #include "dictionary/dictionary_token.h"
48 #include "dictionary/pos_matcher.h"
49 #include "dictionary/system/system_dictionary_builder.h"
50 #include "dictionary/text_dictionary_loader.h"
51
52 DEFINE_string(input, "", "space separated input text files");
53 DEFINE_string(user_pos_manager_data, "", "user pos manager data");
54 DEFINE_string(output, "", "output binary file");
55
56 namespace mozc {
57 namespace {
58
// 10 dictionary files are passed to this program with the --input flag.
// reading_correction.tsv is also passed to this program with the --input flag
// in the same manner. This program checks the file name pattern and changes
// how each file is handled accordingly. Ideally, we would use different
// flags for dictionaries and reading correction, but due to a limitation
// of the internal build system, the build rules would become much more
// complicated if we used two flags.
66 const char kReadingCorrectionFile[] = "reading_correction.tsv";
67
68 // convert space delimtered text to CSV
GetInputFileName(const string & input_file,string * system_dictionary_input,string * reading_correction_input)69 void GetInputFileName(const string &input_file,
70 string *system_dictionary_input,
71 string *reading_correction_input) {
72 CHECK(system_dictionary_input);
73 CHECK(reading_correction_input);
74 system_dictionary_input->clear();
75 reading_correction_input->clear();
76 const StringPiece kDelimiter(", ", 1);
77 for (SplitIterator<SingleDelimiter> iter(input_file, " ");
78 !iter.Done(); iter.Next()) {
79 const StringPiece &input_file = iter.Get();
80 if (Util::EndsWith(input_file, kReadingCorrectionFile)) {
81 Util::AppendStringWithDelimiter(kDelimiter, input_file,
82 reading_correction_input);
83 } else {
84 Util::AppendStringWithDelimiter(kDelimiter, input_file,
85 system_dictionary_input);
86 }
87 }
88 }
89
90 } // namespace
91 } // namespace mozc
92
main(int argc,char ** argv)93 int main(int argc, char **argv) {
94 mozc::InitMozc(argv[0], &argc, &argv, false);
95
96 string system_dictionary_input, reading_correction_input;
97 mozc::GetInputFileName(FLAGS_input, &system_dictionary_input,
98 &reading_correction_input);
99
100 // User POS manager data for build tools has no magic number.
101 const char *kMagicNumber = "";
102 mozc::DataManager data_manager;
103 const mozc::DataManager::Status status =
104 data_manager.InitUserPosManagerDataFromFile(FLAGS_user_pos_manager_data,
105 kMagicNumber);
106 CHECK_EQ(status, mozc::DataManager::Status::OK)
107 << "Failed to initialize data manager from "
108 << FLAGS_user_pos_manager_data;
109
110 const mozc::dictionary::POSMatcher pos_matcher(
111 data_manager.GetPOSMatcherData());
112
113 mozc::dictionary::TextDictionaryLoader loader(pos_matcher);
114 loader.Load(system_dictionary_input, reading_correction_input);
115
116 mozc::dictionary::SystemDictionaryBuilder builder;
117 builder.BuildFromTokens(loader.tokens());
118
119 std::unique_ptr<std::ostream> output_stream(new mozc::OutputFileStream(
120 FLAGS_output.c_str(), std::ios::out | std::ios::binary));
121 builder.WriteToStream(FLAGS_output, output_stream.get());
122
123 return 0;
124 }
125