1 // Copyright 2010-2018, Google Inc.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
6 // met:
7 //
8 //     * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 //     * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following disclaimer
12 // in the documentation and/or other materials provided with the
13 // distribution.
14 //     * Neither the name of Google Inc. nor the names of its
15 // contributors may be used to endorse or promote products derived from
16 // this software without specific prior written permission.
17 //
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 
30 // NOTE(tabata): This code is used mainly by louds trie builder to build
31 // dictionary. Please check error handling, if you want to include
32 // this to run within client.
33 
34 #include "dictionary/text_dictionary_loader.h"
35 
36 #include <algorithm>
37 #include <limits>
38 #include <memory>
39 #include <string>
40 #include <vector>
41 
42 #include "base/file_stream.h"
43 #include "base/flags.h"
44 #include "base/iterator_adapter.h"
45 #include "base/logging.h"
46 #include "base/multifile.h"
47 #include "base/number_util.h"
48 #include "base/stl_util.h"
49 #include "base/string_piece.h"
50 #include "base/util.h"
51 #include "dictionary/dictionary_token.h"
52 #include "dictionary/pos_matcher.h"
53 
54 DEFINE_int32(tokens_reserve_size, 1400000,
55              "Reserve the specified size of token buffer in advance.");
56 
57 namespace mozc {
58 namespace dictionary {
59 namespace {
60 
61 // Functor to sort a sequence of Tokens first by value and then by key.
62 struct OrderByValueThenByKey {
operator ()mozc::dictionary::__anon50b1a61b0111::OrderByValueThenByKey63   bool operator()(const Token *l, const Token *r) const {
64     const int comp = l->value.compare(r->value);
65     return comp == 0 ? (l->key < r->key) : (comp < 0);
66   }
67 };
68 
69 // Provides a view of Token iterator as a pair of value and key.  Used to look
70 // up tokens from a sorted range of Token pointers using value and key.
71 struct AsValueAndKey
72     : public AdapterBase<std::pair<StringPiece, StringPiece> > {
operator ()mozc::dictionary::__anon50b1a61b0111::AsValueAndKey73   value_type operator()(std::vector<Token *>::const_iterator iter) const {
74     const Token *token = *iter;
75     return std::pair<StringPiece, StringPiece>(token->value, token->key);
76   }
77 };
78 
79 // Provides a view of Token iterator as a value string.  Used to look up tokens
80 // from a sorted range of Tokens using value.
81 struct AsValue : public AdapterBase<StringPiece> {
operator ()mozc::dictionary::__anon50b1a61b0111::AsValue82   value_type operator()(std::vector<Token *>::const_iterator iter) const {
83     return StringPiece((*iter)->value);
84   }
85 };
86 
87 // Parses one line of reading correction file.  Since the result is returned as
88 // StringPieces, |line| needs to outlive |value_key|.
ParseReadingCorrectionTSV(const string & line,std::pair<StringPiece,StringPiece> * value_key)89 void ParseReadingCorrectionTSV(const string &line,
90                                std::pair<StringPiece, StringPiece> *value_key) {
91   // Format: value\terror\tcorrect
92   SplitIterator<SingleDelimiter> iter(line, "\t");
93   CHECK(!iter.Done());
94   value_key->first = iter.Get();
95   iter.Next();
96   CHECK(!iter.Done());
97   value_key->second = iter.Get();
98 }
99 
100 // Helper function to parse an integer from a string.
SafeStrToInt(StringPiece s,int * n)101 inline bool SafeStrToInt(StringPiece s, int *n) {
102   uint32 u32 = 0;
103   const bool ret = NumberUtil::SafeStrToUInt32(s, &u32);
104   *n = u32;
105   return ret;
106 }
107 
108 // Helper functions to get const iterators.
CBegin(const std::vector<Token * > & tokens)109 inline std::vector<Token *>::const_iterator CBegin(
110     const std::vector<Token *> &tokens) {
111   return tokens.begin();
112 }
113 
CEnd(const std::vector<Token * > & tokens)114 inline std::vector<Token *>::const_iterator CEnd(
115     const std::vector<Token *> &tokens) {
116   return tokens.end();
117 }
118 
119 }  // namespace
120 
TextDictionaryLoader(const POSMatcher & pos_matcher)121 TextDictionaryLoader::TextDictionaryLoader(const POSMatcher &pos_matcher)
122     : zipcode_id_(pos_matcher.GetZipcodeId()),
123       isolated_word_id_(pos_matcher.GetIsolatedWordId()) {}
124 
TextDictionaryLoader(uint16 zipcode_id,uint16 isolated_word_id)125 TextDictionaryLoader::TextDictionaryLoader(uint16 zipcode_id,
126                                            uint16 isolated_word_id)
127     : zipcode_id_(zipcode_id), isolated_word_id_(isolated_word_id) {}
128 
~TextDictionaryLoader()129 TextDictionaryLoader::~TextDictionaryLoader() {
130   Clear();
131 }
132 
RewriteSpecialToken(Token * token,StringPiece label) const133 bool TextDictionaryLoader::RewriteSpecialToken(Token *token,
134                                                StringPiece label) const {
135   CHECK(token);
136   if (label.empty()) {
137     return true;
138   }
139   if (Util::StartsWith(label, "SPELLING_CORRECTION")) {
140     token->attributes = Token::SPELLING_CORRECTION;
141     return true;
142   }
143   if (Util::StartsWith(label, "ZIP_CODE")) {
144     token->lid = zipcode_id_;
145     token->rid = zipcode_id_;
146     return true;
147   }
148   if (Util::StartsWith(label, "ENGLISH")) {
149     // TODO(noriyukit): Might be better to use special POS for english words.
150     token->lid = isolated_word_id_;
151     token->rid = isolated_word_id_;
152     return true;
153   }
154   LOG(ERROR) << "Unknown special label: " << label;
155   return false;
156 }
157 
Load(const string & dictionary_filename,const string & reading_correction_filename)158 void TextDictionaryLoader::Load(const string &dictionary_filename,
159                                 const string &reading_correction_filename) {
160   LoadWithLineLimit(dictionary_filename, reading_correction_filename, -1);
161 }
162 
LoadWithLineLimit(const string & dictionary_filename,const string & reading_correction_filename,int limit)163 void TextDictionaryLoader::LoadWithLineLimit(
164     const string &dictionary_filename,
165     const string &reading_correction_filename,
166     int limit) {
167   Clear();
168 
169   // Roughly allocate buffers for Token pointers.
170   if (limit < 0) {
171     tokens_.reserve(FLAGS_tokens_reserve_size);
172     limit = std::numeric_limits<int>::max();
173   } else {
174     tokens_.reserve(limit);
175   }
176 
177   // Read system dictionary.
178   {
179     InputMultiFile file(dictionary_filename);
180     string line;
181     while (limit > 0 && file.ReadLine(&line)) {
182       Util::ChopReturns(&line);
183       Token *token = ParseTSVLine(line);
184       if (token) {
185         tokens_.push_back(token);
186         --limit;
187       }
188     }
189     LOG(INFO) << tokens_.size() << " tokens from " << dictionary_filename;
190   }
191 
192   if (reading_correction_filename.empty() || limit <= 0) {
193     return;
194   }
195 
196   // Prepare for loading reading corrections. We sort |tokens_| first by value
197   // and then by key so that we can perform the following operations both in
198   // O(log(N)), where N is the size of tokens.
199   //   1. Checking the existence of any key-value pairs: This can be done by
200   //      binary-searching for a pair of value and key.
201   //   2. Accessing all the tokens that have the same value: Since tokens are
202   //      also sorted in order of value, this can be done by finding a range of
203   //      tokens that have the same value.
204   std::sort(tokens_.begin(), tokens_.end(), OrderByValueThenByKey());
205 
206   std::vector<Token *> reading_correction_tokens;
207   LoadReadingCorrectionTokens(reading_correction_filename, tokens_,
208                               &limit, &reading_correction_tokens);
209   const size_t tokens_size = tokens_.size();
210   tokens_.resize(tokens_size + reading_correction_tokens.size());
211   for (size_t i = 0; i < reading_correction_tokens.size(); ++i) {
212     // |tokens_| takes the ownership of each allocated token.
213     tokens_[tokens_size + i] = reading_correction_tokens[i];
214   }
215 }
216 
217 // Loads reading correction data into |tokens|.  The second argument is used to
218 // determine costs of reading correction tokens and must be sorted by
219 // OrderByValueThenByKey().  The output tokens are newly allocated and the
220 // caller is responsible to delete them.
LoadReadingCorrectionTokens(const string & reading_correction_filename,const std::vector<Token * > & ref_sorted_tokens,int * limit,std::vector<Token * > * tokens)221 void TextDictionaryLoader::LoadReadingCorrectionTokens(
222     const string &reading_correction_filename,
223     const std::vector<Token *> &ref_sorted_tokens,
224     int *limit,
225     std::vector<Token *> *tokens) {
226   // Load reading correction entries.
227   int reading_correction_size = 0;
228 
229   InputMultiFile file(reading_correction_filename);
230   string line;
231   while (file.ReadLine(&line)) {
232     if (line.empty() || line[0] == '#') {
233       continue;
234     }
235 
236     // Parse TSV line in a pair of value and key (Note: first element is value
237     // and the second key).
238     Util::ChopReturns(&line);
239     std::pair<StringPiece, StringPiece> value_key;
240     ParseReadingCorrectionTSV(line, &value_key);
241 
242     // Filter the entry if this key value pair already exists in the system
243     // dictionary.
244     if (std::binary_search(
245             MakeIteratorAdapter(ref_sorted_tokens.begin(), AsValueAndKey()),
246             MakeIteratorAdapter(ref_sorted_tokens.end(), AsValueAndKey()),
247             value_key)) {
248       VLOG(1) << "System dictionary has the same key-value: " << line;
249       continue;
250     }
251 
252     // Since reading correction entries lack POS and cost, we recover those
253     // fields from a token in the system dictionary that has the same value.
254     // Since multple tokens may have the same value, from such tokens, we
255     // select the one that has the maximum cost.
256     typedef std::vector<Token *>::const_iterator TokenIterator;
257     typedef IteratorAdapter<TokenIterator, AsValue> AsValueIterator;
258     typedef std::pair<AsValueIterator, AsValueIterator> Range;
259     Range range = std::equal_range(
260         MakeIteratorAdapter(CBegin(ref_sorted_tokens), AsValue()),
261         MakeIteratorAdapter(CEnd(ref_sorted_tokens), AsValue()),
262         value_key.first);
263     TokenIterator begin = range.first.base(), end = range.second.base();
264     if (begin == end) {
265       VLOG(1) << "Cannot find the value in system dicitonary - ignored:"
266               << line;
267       continue;
268     }
269     // Now [begin, end) contains all the tokens that have the same value as
270     // this reading correction entry.  Next, find the token that has the
271     // maximum cost in [begin, end).  Note that linear search is sufficiently
272     // fast here because the size of the range is small.
273     const Token *max_cost_token = *begin;
274     for (++begin; begin != end; ++begin) {
275       if ((*begin)->cost > max_cost_token->cost) {
276         max_cost_token = *begin;
277       }
278     }
279 
280     // The cost is calculated as -log(prob) * 500.
281     // We here assume that the wrong reading appear with 1/100 probability
282     // of the original (correct) reading.
283     const int kCostPenalty = 2302;      // -log(1/100) * 500;
284     std::unique_ptr<Token> token(new Token);
285     token->key.assign(value_key.second.data(), value_key.second.size());
286     token->value = max_cost_token->value;
287     token->lid = max_cost_token->lid;
288     token->rid = max_cost_token->rid;
289     token->cost = max_cost_token->cost + kCostPenalty;
290     // We don't set SPELLING_CORRECTION. The entries in reading_correction
291     // data are also stored in rewriter/correction_rewriter.cc.
292     // reading_correction_rewriter annotates the spelling correction
293     // notations.
294     token->attributes = Token::NONE;
295     tokens->push_back(token.release());
296     ++reading_correction_size;
297     if (--*limit <= 0) {
298       break;
299     }
300   }
301   LOG(INFO) << reading_correction_size << " tokens from "
302             << reading_correction_filename;
303 }
304 
Clear()305 void TextDictionaryLoader::Clear() {
306   STLDeleteElements(&tokens_);
307 }
308 
CollectTokens(std::vector<Token * > * res) const309 void TextDictionaryLoader::CollectTokens(std::vector<Token *> *res) const {
310   DCHECK(res);
311   res->reserve(res->size() + tokens_.size());
312   res->insert(res->end(), tokens_.begin(), tokens_.end());
313 }
314 
ParseTSVLine(StringPiece line) const315 Token *TextDictionaryLoader::ParseTSVLine(StringPiece line) const {
316   std::vector<StringPiece> columns;
317   Util::SplitStringUsing(line, "\t", &columns);
318   return ParseTSV(columns);
319 }
320 
ParseTSV(const std::vector<StringPiece> & columns) const321 Token *TextDictionaryLoader::ParseTSV(
322     const std::vector<StringPiece> &columns) const {
323   CHECK_LE(5, columns.size()) << "Lack of columns: " << columns.size();
324 
325   std::unique_ptr<Token> token(new Token);
326 
327   // Parse key, lid, rid, cost, value.
328   Util::NormalizeVoicedSoundMark(columns[0], &token->key);
329   CHECK(SafeStrToInt(columns[1], &token->lid))
330       << "Wrong lid: " << columns[1];
331   CHECK(SafeStrToInt(columns[2], &token->rid))
332       << "Wrong rid: " << columns[2];
333   CHECK(SafeStrToInt(columns[3], &token->cost))
334       << "Wrong cost: " << columns[3];
335   Util::NormalizeVoicedSoundMark(columns[4], &token->value);
336 
337   // Optionally, label (SPELLING_CORRECTION, ZIP_CODE, etc.) may be provided in
338   // column 6.
339   if (columns.size() > 5) {
340     CHECK(RewriteSpecialToken(token.get(), columns[5]))
341         << "Invalid label: " << columns[5];
342   }
343   return token.release();
344 }
345 
346 }  // namespace dictionary
347 }  // namespace mozc
348