1 // Copyright 2010-2018, Google Inc.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
6 // met:
7 //
8 //     * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 //     * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following disclaimer
12 // in the documentation and/or other materials provided with the
13 // distribution.
14 //     * Neither the name of Google Inc. nor the names of its
15 // contributors may be used to endorse or promote products derived from
16 // this software without specific prior written permission.
17 //
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 
30 #include "dictionary/user_pos.h"
31 
32 #include <algorithm>
33 #include <set>
34 
35 #include "base/logging.h"
36 #include "base/util.h"
37 
38 namespace mozc {
39 namespace dictionary {
40 
UserPOS(StringPiece token_array_data,StringPiece string_array_data)41 UserPOS::UserPOS(StringPiece token_array_data, StringPiece string_array_data)
42     : token_array_data_(token_array_data) {
43   DCHECK_EQ(token_array_data.size() % 8, 0);
44   DCHECK(SerializedStringArray::VerifyData(string_array_data));
45   string_array_.Set(string_array_data);
46 }
47 
48 UserPOS::~UserPOS() = default;
49 
GetPOSList(std::vector<string> * pos_list) const50 void UserPOS::GetPOSList(std::vector<string> *pos_list) const {
51   pos_list->clear();
52   std::set<uint16> seen;
53   for (auto iter = begin(); iter != end(); ++iter) {
54     if (!seen.insert(iter.pos_index()).second) {
55       continue;
56     }
57     const StringPiece pos = string_array_[iter.pos_index()];
58     pos_list->emplace_back(pos.data(), pos.size());
59   }
60 }
61 
IsValidPOS(const string & pos) const62 bool UserPOS::IsValidPOS(const string &pos) const {
63   const auto iter =
64       std::lower_bound(string_array_.begin(), string_array_.end(), pos);
65   if (iter == string_array_.end()) {
66     return false;
67   }
68   return std::binary_search(begin(), end(), iter.index());
69 }
70 
GetPOSIDs(const string & pos,uint16 * id) const71 bool UserPOS::GetPOSIDs(const string &pos, uint16 *id) const {
72   const auto str_iter =
73       std::lower_bound(string_array_.begin(), string_array_.end(), pos);
74   if (str_iter == string_array_.end() || *str_iter != pos) {
75     return false;
76   }
77   const auto token_iter = std::lower_bound(begin(), end(), str_iter.index());
78   if (token_iter == end() || token_iter.pos_index() != str_iter.index()) {
79     return false;
80   }
81   *id = token_iter.conjugation_id();
82   return true;
83 }
84 
GetTokens(const string & key,const string & value,const string & pos,std::vector<Token> * tokens) const85 bool UserPOS::GetTokens(const string &key, const string &value,
86                         const string &pos, std::vector<Token> *tokens) const {
87   if (key.empty() || value.empty() || pos.empty() || tokens == nullptr) {
88     return false;
89   }
90 
91   tokens->clear();
92   const auto str_iter =
93       std::lower_bound(string_array_.begin(), string_array_.end(), pos);
94   if (str_iter == string_array_.end() || *str_iter != pos) {
95     return false;
96   }
97   std::pair<iterator, iterator> range =
98       std::equal_range(begin(), end(), str_iter.index());
99   if (range.first == range.second) {
100     return false;
101   }
102   const size_t size = range.second - range.first;
103   CHECK_GE(size, 1);
104   tokens->resize(size);
105 
106   // TODO(taku)  Change the cost by seeing cost_type
107   const int16 kDefaultCost = 5000;
108 
109   // Set smaller cost for "短縮よみ" in order to make
110   // the rank of the word higher than others.
111   const int16 kIsolatedWordCost = 200;
112   const char kIsolatedWordPOS[] = "短縮よみ";
113 
114   if (size == 1) {  // no conjugation
115     const auto &token_iter = range.first;
116     (*tokens)[0].key = key;
117     (*tokens)[0].value = value;
118     (*tokens)[0].id = token_iter.conjugation_id();
119     if (pos == kIsolatedWordPOS) {
120       (*tokens)[0].cost = kIsolatedWordCost;
121     } else {
122       (*tokens)[0].cost = kDefaultCost;
123     }
124   } else {
125     const auto &base_form_token_iter = range.first;
126     // expand all other forms
127     string key_stem = key;
128     string value_stem = value;
129     // assume that conjugation_form[0] contains the suffix of "base form".
130     const StringPiece base_key_suffix =
131         string_array_[base_form_token_iter.key_suffix_index()];
132     const StringPiece base_value_suffix =
133         string_array_[base_form_token_iter.value_suffix_index()];
134 
135     if (base_key_suffix.size() < key.size() &&
136         base_value_suffix.size() < value.size() &&
137         Util::EndsWith(key, base_key_suffix) &&
138         Util::EndsWith(value, base_value_suffix)) {
139       key_stem.assign(key, 0, key.size() - base_key_suffix.size());
140       value_stem.assign(value, 0, value.size() - base_value_suffix.size());
141     }
142     for (size_t i = 0; i < size; ++i, ++range.first) {
143       const auto &token_iter = range.first;
144       const StringPiece key_suffix =
145           string_array_[token_iter.key_suffix_index()];
146       const StringPiece value_suffix =
147           string_array_[token_iter.value_suffix_index()];
148       Util::ConcatStrings(key_stem, key_suffix, &(*tokens)[i].key);
149       Util::ConcatStrings(value_stem, value_suffix, &(*tokens)[i].value);
150       (*tokens)[i].id = token_iter.conjugation_id();
151       (*tokens)[i].cost = kDefaultCost;
152     }
153     DCHECK(range.first == range.second);
154   }
155 
156   return true;
157 }
158 
CreateFromDataManager(const DataManagerInterface & manager)159 UserPOS *UserPOS::CreateFromDataManager(const DataManagerInterface &manager) {
160   StringPiece token_array_data, string_array_data;
161   manager.GetUserPOSData(&token_array_data, &string_array_data);
162   return new UserPOS(token_array_data, string_array_data);
163 }
164 
165 }  // namespace dictionary
166 }  // namespace mozc
167