1 // Copyright 2010-2018, Google Inc.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
6 // met:
7 //
8 //     * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 //     * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following disclaimer
12 // in the documentation and/or other materials provided with the
13 // distribution.
14 //     * Neither the name of Google Inc. nor the names of its
15 // contributors may be used to endorse or promote products derived from
16 // this software without specific prior written permission.
17 //
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 
// Generates the system dictionary binary data file.
//
// gen_system_dictionary_data_main
//  --input="dictionary0.txt dictionary1.txt reading_correction.tsv"
//  --user_pos_manager_data="user_pos_manager.data"
//  --output="system.dictionary"
36 
37 #include <memory>
38 #include <string>
39 #include <vector>
40 
41 #include "base/file_stream.h"
42 #include "base/flags.h"
43 #include "base/init_mozc.h"
44 #include "base/logging.h"
45 #include "base/util.h"
46 #include "data_manager/data_manager.h"
47 #include "dictionary/dictionary_token.h"
48 #include "dictionary/pos_matcher.h"
49 #include "dictionary/system/system_dictionary_builder.h"
50 #include "dictionary/text_dictionary_loader.h"
51 
// Space-separated list of input text files. Entries whose name ends with
// "reading_correction.tsv" are handled as reading correction data; every
// other entry is handled as a system dictionary source.
DEFINE_string(input, "", "space separated input text files");
// Path of the user POS manager data file; the POS matcher is built from the
// data loaded from this file.
DEFINE_string(user_pos_manager_data, "", "user pos manager data");
// Path of the generated file. It is opened in binary mode.
DEFINE_string(output, "", "output binary file");
55 
56 namespace mozc {
57 namespace {
58 
// Both the dictionary files (10 of them) and reading_correction.tsv are
// passed to this program through the single --input flag. The program tells
// them apart by file name pattern and handles each kind differently.
// Ideally dictionaries and reading correction would have separate flags, but
// due to a limitation of the internal build system the build rules would
// become much more complicated with two flags.
constexpr char kReadingCorrectionFile[] = "reading_correction.tsv";
67 
68 // convert space delimtered text to CSV
GetInputFileName(const string & input_file,string * system_dictionary_input,string * reading_correction_input)69 void GetInputFileName(const string &input_file,
70                       string *system_dictionary_input,
71                       string *reading_correction_input) {
72   CHECK(system_dictionary_input);
73   CHECK(reading_correction_input);
74   system_dictionary_input->clear();
75   reading_correction_input->clear();
76   const StringPiece kDelimiter(", ", 1);
77   for (SplitIterator<SingleDelimiter> iter(input_file, " ");
78        !iter.Done(); iter.Next()) {
79     const StringPiece &input_file = iter.Get();
80     if (Util::EndsWith(input_file, kReadingCorrectionFile)) {
81       Util::AppendStringWithDelimiter(kDelimiter, input_file,
82                                       reading_correction_input);
83     } else {
84       Util::AppendStringWithDelimiter(kDelimiter, input_file,
85                                       system_dictionary_input);
86     }
87   }
88 }
89 
90 }  // namespace
91 }  // namespace mozc
92 
main(int argc,char ** argv)93 int main(int argc, char **argv) {
94   mozc::InitMozc(argv[0], &argc, &argv, false);
95 
96   string system_dictionary_input, reading_correction_input;
97   mozc::GetInputFileName(FLAGS_input, &system_dictionary_input,
98                          &reading_correction_input);
99 
100   // User POS manager data for build tools has no magic number.
101   const char *kMagicNumber = "";
102   mozc::DataManager data_manager;
103   const mozc::DataManager::Status status =
104       data_manager.InitUserPosManagerDataFromFile(FLAGS_user_pos_manager_data,
105                                                   kMagicNumber);
106   CHECK_EQ(status, mozc::DataManager::Status::OK)
107       << "Failed to initialize data manager from "
108       << FLAGS_user_pos_manager_data;
109 
110   const mozc::dictionary::POSMatcher pos_matcher(
111       data_manager.GetPOSMatcherData());
112 
113   mozc::dictionary::TextDictionaryLoader loader(pos_matcher);
114   loader.Load(system_dictionary_input, reading_correction_input);
115 
116   mozc::dictionary::SystemDictionaryBuilder builder;
117   builder.BuildFromTokens(loader.tokens());
118 
119   std::unique_ptr<std::ostream> output_stream(new mozc::OutputFileStream(
120       FLAGS_output.c_str(), std::ios::out | std::ios::binary));
121   builder.WriteToStream(FLAGS_output, output_stream.get());
122 
123   return 0;
124 }
125