1 // Copyright 2010-2018, Google Inc.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
6 // met:
7 //
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following disclaimer
12 // in the documentation and/or other materials provided with the
13 // distribution.
14 // * Neither the name of Google Inc. nor the names of its
15 // contributors may be used to endorse or promote products derived from
16 // this software without specific prior written permission.
17 //
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
30 // NOTE(tabata): This code is used mainly by louds trie builder to build
31 // dictionary. Please check error handling, if you want to include
32 // this to run within client.
33
34 #include "dictionary/text_dictionary_loader.h"
35
36 #include <algorithm>
37 #include <limits>
38 #include <memory>
39 #include <string>
40 #include <vector>
41
42 #include "base/file_stream.h"
43 #include "base/flags.h"
44 #include "base/iterator_adapter.h"
45 #include "base/logging.h"
46 #include "base/multifile.h"
47 #include "base/number_util.h"
48 #include "base/stl_util.h"
49 #include "base/string_piece.h"
50 #include "base/util.h"
51 #include "dictionary/dictionary_token.h"
52 #include "dictionary/pos_matcher.h"
53
54 DEFINE_int32(tokens_reserve_size, 1400000,
55 "Reserve the specified size of token buffer in advance.");
56
57 namespace mozc {
58 namespace dictionary {
59 namespace {
60
61 // Functor to sort a sequence of Tokens first by value and then by key.
62 struct OrderByValueThenByKey {
operator ()mozc::dictionary::__anon50b1a61b0111::OrderByValueThenByKey63 bool operator()(const Token *l, const Token *r) const {
64 const int comp = l->value.compare(r->value);
65 return comp == 0 ? (l->key < r->key) : (comp < 0);
66 }
67 };
68
69 // Provides a view of Token iterator as a pair of value and key. Used to look
70 // up tokens from a sorted range of Token pointers using value and key.
71 struct AsValueAndKey
72 : public AdapterBase<std::pair<StringPiece, StringPiece> > {
operator ()mozc::dictionary::__anon50b1a61b0111::AsValueAndKey73 value_type operator()(std::vector<Token *>::const_iterator iter) const {
74 const Token *token = *iter;
75 return std::pair<StringPiece, StringPiece>(token->value, token->key);
76 }
77 };
78
79 // Provides a view of Token iterator as a value string. Used to look up tokens
80 // from a sorted range of Tokens using value.
81 struct AsValue : public AdapterBase<StringPiece> {
operator ()mozc::dictionary::__anon50b1a61b0111::AsValue82 value_type operator()(std::vector<Token *>::const_iterator iter) const {
83 return StringPiece((*iter)->value);
84 }
85 };
86
87 // Parses one line of reading correction file. Since the result is returned as
88 // StringPieces, |line| needs to outlive |value_key|.
ParseReadingCorrectionTSV(const string & line,std::pair<StringPiece,StringPiece> * value_key)89 void ParseReadingCorrectionTSV(const string &line,
90 std::pair<StringPiece, StringPiece> *value_key) {
91 // Format: value\terror\tcorrect
92 SplitIterator<SingleDelimiter> iter(line, "\t");
93 CHECK(!iter.Done());
94 value_key->first = iter.Get();
95 iter.Next();
96 CHECK(!iter.Done());
97 value_key->second = iter.Get();
98 }
99
100 // Helper function to parse an integer from a string.
SafeStrToInt(StringPiece s,int * n)101 inline bool SafeStrToInt(StringPiece s, int *n) {
102 uint32 u32 = 0;
103 const bool ret = NumberUtil::SafeStrToUInt32(s, &u32);
104 *n = u32;
105 return ret;
106 }
107
108 // Helper functions to get const iterators.
CBegin(const std::vector<Token * > & tokens)109 inline std::vector<Token *>::const_iterator CBegin(
110 const std::vector<Token *> &tokens) {
111 return tokens.begin();
112 }
113
CEnd(const std::vector<Token * > & tokens)114 inline std::vector<Token *>::const_iterator CEnd(
115 const std::vector<Token *> &tokens) {
116 return tokens.end();
117 }
118
119 } // namespace
120
// Initializes the loader with the special POS ids (zip code and isolated
// word) taken from |pos_matcher|. These ids are used by
// RewriteSpecialToken() when a label column is present.
TextDictionaryLoader::TextDictionaryLoader(const POSMatcher &pos_matcher)
    : zipcode_id_(pos_matcher.GetZipcodeId()),
      isolated_word_id_(pos_matcher.GetIsolatedWordId()) {}
124
// Initializes the loader directly from the two special POS ids, bypassing
// POSMatcher. Useful when the caller already knows the ids.
TextDictionaryLoader::TextDictionaryLoader(uint16 zipcode_id,
                                           uint16 isolated_word_id)
    : zipcode_id_(zipcode_id), isolated_word_id_(isolated_word_id) {}
128
// Releases all tokens owned by this loader (tokens_ holds raw owning
// pointers; see Clear()).
TextDictionaryLoader::~TextDictionaryLoader() {
  Clear();
}
132
RewriteSpecialToken(Token * token,StringPiece label) const133 bool TextDictionaryLoader::RewriteSpecialToken(Token *token,
134 StringPiece label) const {
135 CHECK(token);
136 if (label.empty()) {
137 return true;
138 }
139 if (Util::StartsWith(label, "SPELLING_CORRECTION")) {
140 token->attributes = Token::SPELLING_CORRECTION;
141 return true;
142 }
143 if (Util::StartsWith(label, "ZIP_CODE")) {
144 token->lid = zipcode_id_;
145 token->rid = zipcode_id_;
146 return true;
147 }
148 if (Util::StartsWith(label, "ENGLISH")) {
149 // TODO(noriyukit): Might be better to use special POS for english words.
150 token->lid = isolated_word_id_;
151 token->rid = isolated_word_id_;
152 return true;
153 }
154 LOG(ERROR) << "Unknown special label: " << label;
155 return false;
156 }
157
// Loads the whole dictionary and, optionally, reading corrections with no
// limit on the number of tokens (-1 means "unlimited" in
// LoadWithLineLimit).
void TextDictionaryLoader::Load(const string &dictionary_filename,
                                const string &reading_correction_filename) {
  LoadWithLineLimit(dictionary_filename, reading_correction_filename, -1);
}
162
// Loads at most |limit| tokens from |dictionary_filename| (a multi-file
// spec) and then, if a reading correction file is given and the limit is
// not yet exhausted, appends reading correction tokens derived from the
// already-loaded tokens. |limit| < 0 means no limit. Any previously loaded
// tokens are discarded first.
void TextDictionaryLoader::LoadWithLineLimit(
    const string &dictionary_filename,
    const string &reading_correction_filename,
    int limit) {
  Clear();

  // Roughly allocate buffers for Token pointers.
  if (limit < 0) {
    tokens_.reserve(FLAGS_tokens_reserve_size);
    limit = std::numeric_limits<int>::max();
  } else {
    tokens_.reserve(limit);
  }

  // Read system dictionary. ParseTSVLine allocates a Token per valid line;
  // |tokens_| owns the allocations.
  {
    InputMultiFile file(dictionary_filename);
    string line;
    while (limit > 0 && file.ReadLine(&line)) {
      Util::ChopReturns(&line);
      Token *token = ParseTSVLine(line);
      if (token) {
        tokens_.push_back(token);
        --limit;
      }
    }
    LOG(INFO) << tokens_.size() << " tokens from " << dictionary_filename;
  }

  if (reading_correction_filename.empty() || limit <= 0) {
    return;
  }

  // Prepare for loading reading corrections. We sort |tokens_| first by value
  // and then by key so that we can perform the following operations both in
  // O(log(N)), where N is the size of tokens.
  // 1. Checking the existence of any key-value pairs: This can be done by
  //    binary-searching for a pair of value and key.
  // 2. Accessing all the tokens that have the same value: Since tokens are
  //    also sorted in order of value, this can be done by finding a range of
  //    tokens that have the same value.
  std::sort(tokens_.begin(), tokens_.end(), OrderByValueThenByKey());

  std::vector<Token *> reading_correction_tokens;
  LoadReadingCorrectionTokens(reading_correction_filename, tokens_,
                              &limit, &reading_correction_tokens);
  const size_t tokens_size = tokens_.size();
  tokens_.resize(tokens_size + reading_correction_tokens.size());
  for (size_t i = 0; i < reading_correction_tokens.size(); ++i) {
    // |tokens_| takes the ownership of each allocated token.
    tokens_[tokens_size + i] = reading_correction_tokens[i];
  }
}
216
217 // Loads reading correction data into |tokens|. The second argument is used to
218 // determine costs of reading correction tokens and must be sorted by
219 // OrderByValueThenByKey(). The output tokens are newly allocated and the
220 // caller is responsible to delete them.
LoadReadingCorrectionTokens(const string & reading_correction_filename,const std::vector<Token * > & ref_sorted_tokens,int * limit,std::vector<Token * > * tokens)221 void TextDictionaryLoader::LoadReadingCorrectionTokens(
222 const string &reading_correction_filename,
223 const std::vector<Token *> &ref_sorted_tokens,
224 int *limit,
225 std::vector<Token *> *tokens) {
226 // Load reading correction entries.
227 int reading_correction_size = 0;
228
229 InputMultiFile file(reading_correction_filename);
230 string line;
231 while (file.ReadLine(&line)) {
232 if (line.empty() || line[0] == '#') {
233 continue;
234 }
235
236 // Parse TSV line in a pair of value and key (Note: first element is value
237 // and the second key).
238 Util::ChopReturns(&line);
239 std::pair<StringPiece, StringPiece> value_key;
240 ParseReadingCorrectionTSV(line, &value_key);
241
242 // Filter the entry if this key value pair already exists in the system
243 // dictionary.
244 if (std::binary_search(
245 MakeIteratorAdapter(ref_sorted_tokens.begin(), AsValueAndKey()),
246 MakeIteratorAdapter(ref_sorted_tokens.end(), AsValueAndKey()),
247 value_key)) {
248 VLOG(1) << "System dictionary has the same key-value: " << line;
249 continue;
250 }
251
252 // Since reading correction entries lack POS and cost, we recover those
253 // fields from a token in the system dictionary that has the same value.
254 // Since multple tokens may have the same value, from such tokens, we
255 // select the one that has the maximum cost.
256 typedef std::vector<Token *>::const_iterator TokenIterator;
257 typedef IteratorAdapter<TokenIterator, AsValue> AsValueIterator;
258 typedef std::pair<AsValueIterator, AsValueIterator> Range;
259 Range range = std::equal_range(
260 MakeIteratorAdapter(CBegin(ref_sorted_tokens), AsValue()),
261 MakeIteratorAdapter(CEnd(ref_sorted_tokens), AsValue()),
262 value_key.first);
263 TokenIterator begin = range.first.base(), end = range.second.base();
264 if (begin == end) {
265 VLOG(1) << "Cannot find the value in system dicitonary - ignored:"
266 << line;
267 continue;
268 }
269 // Now [begin, end) contains all the tokens that have the same value as
270 // this reading correction entry. Next, find the token that has the
271 // maximum cost in [begin, end). Note that linear search is sufficiently
272 // fast here because the size of the range is small.
273 const Token *max_cost_token = *begin;
274 for (++begin; begin != end; ++begin) {
275 if ((*begin)->cost > max_cost_token->cost) {
276 max_cost_token = *begin;
277 }
278 }
279
280 // The cost is calculated as -log(prob) * 500.
281 // We here assume that the wrong reading appear with 1/100 probability
282 // of the original (correct) reading.
283 const int kCostPenalty = 2302; // -log(1/100) * 500;
284 std::unique_ptr<Token> token(new Token);
285 token->key.assign(value_key.second.data(), value_key.second.size());
286 token->value = max_cost_token->value;
287 token->lid = max_cost_token->lid;
288 token->rid = max_cost_token->rid;
289 token->cost = max_cost_token->cost + kCostPenalty;
290 // We don't set SPELLING_CORRECTION. The entries in reading_correction
291 // data are also stored in rewriter/correction_rewriter.cc.
292 // reading_correction_rewriter annotates the spelling correction
293 // notations.
294 token->attributes = Token::NONE;
295 tokens->push_back(token.release());
296 ++reading_correction_size;
297 if (--*limit <= 0) {
298 break;
299 }
300 }
301 LOG(INFO) << reading_correction_size << " tokens from "
302 << reading_correction_filename;
303 }
304
// Deletes every token owned by this loader and empties the buffer
// (tokens_ stores raw owning pointers).
void TextDictionaryLoader::Clear() {
  STLDeleteElements(&tokens_);
}
308
CollectTokens(std::vector<Token * > * res) const309 void TextDictionaryLoader::CollectTokens(std::vector<Token *> *res) const {
310 DCHECK(res);
311 res->reserve(res->size() + tokens_.size());
312 res->insert(res->end(), tokens_.begin(), tokens_.end());
313 }
314
ParseTSVLine(StringPiece line) const315 Token *TextDictionaryLoader::ParseTSVLine(StringPiece line) const {
316 std::vector<StringPiece> columns;
317 Util::SplitStringUsing(line, "\t", &columns);
318 return ParseTSV(columns);
319 }
320
// Builds a Token from the split columns of one dictionary line. Expected
// layout: key, lid, rid, cost, value [, label]. Crashes (CHECK) on
// malformed input, which is acceptable because this loader runs only in
// the dictionary builder (see the note at the top of this file). Returns
// a newly allocated Token owned by the caller.
Token *TextDictionaryLoader::ParseTSV(
    const std::vector<StringPiece> &columns) const {
  CHECK_LE(5, columns.size()) << "Lack of columns: " << columns.size();

  std::unique_ptr<Token> token(new Token);

  // Parse key, lid, rid, cost, value. Keys and values are normalized so
  // that voiced sound marks are represented consistently.
  Util::NormalizeVoicedSoundMark(columns[0], &token->key);
  CHECK(SafeStrToInt(columns[1], &token->lid))
      << "Wrong lid: " << columns[1];
  CHECK(SafeStrToInt(columns[2], &token->rid))
      << "Wrong rid: " << columns[2];
  CHECK(SafeStrToInt(columns[3], &token->cost))
      << "Wrong cost: " << columns[3];
  Util::NormalizeVoicedSoundMark(columns[4], &token->value);

  // Optionally, label (SPELLING_CORRECTION, ZIP_CODE, etc.) may be provided in
  // column 6.
  if (columns.size() > 5) {
    CHECK(RewriteSpecialToken(token.get(), columns[5]))
        << "Invalid label: " << columns[5];
  }
  return token.release();
}
345
346 } // namespace dictionary
347 } // namespace mozc
348