1 // Copyright 2010-2018, Google Inc.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
6 // met:
7 //
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following disclaimer
12 // in the documentation and/or other materials provided with the
13 // distribution.
14 // * Neither the name of Google Inc. nor the names of its
15 // contributors may be used to endorse or promote products derived from
16 // this software without specific prior written permission.
17 //
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
30 #include "dictionary/user_pos.h"
31
32 #include <algorithm>
33 #include <set>
34
35 #include "base/logging.h"
36 #include "base/util.h"
37
38 namespace mozc {
39 namespace dictionary {
40
UserPOS(StringPiece token_array_data,StringPiece string_array_data)41 UserPOS::UserPOS(StringPiece token_array_data, StringPiece string_array_data)
42 : token_array_data_(token_array_data) {
43 DCHECK_EQ(token_array_data.size() % 8, 0);
44 DCHECK(SerializedStringArray::VerifyData(string_array_data));
45 string_array_.Set(string_array_data);
46 }
47
// Out-of-line destructor with compiler-generated behavior; this file performs
// no explicit cleanup for the members.
UserPOS::~UserPOS() = default;
49
GetPOSList(std::vector<string> * pos_list) const50 void UserPOS::GetPOSList(std::vector<string> *pos_list) const {
51 pos_list->clear();
52 std::set<uint16> seen;
53 for (auto iter = begin(); iter != end(); ++iter) {
54 if (!seen.insert(iter.pos_index()).second) {
55 continue;
56 }
57 const StringPiece pos = string_array_[iter.pos_index()];
58 pos_list->emplace_back(pos.data(), pos.size());
59 }
60 }
61
IsValidPOS(const string & pos) const62 bool UserPOS::IsValidPOS(const string &pos) const {
63 const auto iter =
64 std::lower_bound(string_array_.begin(), string_array_.end(), pos);
65 if (iter == string_array_.end()) {
66 return false;
67 }
68 return std::binary_search(begin(), end(), iter.index());
69 }
70
GetPOSIDs(const string & pos,uint16 * id) const71 bool UserPOS::GetPOSIDs(const string &pos, uint16 *id) const {
72 const auto str_iter =
73 std::lower_bound(string_array_.begin(), string_array_.end(), pos);
74 if (str_iter == string_array_.end() || *str_iter != pos) {
75 return false;
76 }
77 const auto token_iter = std::lower_bound(begin(), end(), str_iter.index());
78 if (token_iter == end() || token_iter.pos_index() != str_iter.index()) {
79 return false;
80 }
81 *id = token_iter.conjugation_id();
82 return true;
83 }
84
GetTokens(const string & key,const string & value,const string & pos,std::vector<Token> * tokens) const85 bool UserPOS::GetTokens(const string &key, const string &value,
86 const string &pos, std::vector<Token> *tokens) const {
87 if (key.empty() || value.empty() || pos.empty() || tokens == nullptr) {
88 return false;
89 }
90
91 tokens->clear();
92 const auto str_iter =
93 std::lower_bound(string_array_.begin(), string_array_.end(), pos);
94 if (str_iter == string_array_.end() || *str_iter != pos) {
95 return false;
96 }
97 std::pair<iterator, iterator> range =
98 std::equal_range(begin(), end(), str_iter.index());
99 if (range.first == range.second) {
100 return false;
101 }
102 const size_t size = range.second - range.first;
103 CHECK_GE(size, 1);
104 tokens->resize(size);
105
106 // TODO(taku) Change the cost by seeing cost_type
107 const int16 kDefaultCost = 5000;
108
109 // Set smaller cost for "短縮よみ" in order to make
110 // the rank of the word higher than others.
111 const int16 kIsolatedWordCost = 200;
112 const char kIsolatedWordPOS[] = "短縮よみ";
113
114 if (size == 1) { // no conjugation
115 const auto &token_iter = range.first;
116 (*tokens)[0].key = key;
117 (*tokens)[0].value = value;
118 (*tokens)[0].id = token_iter.conjugation_id();
119 if (pos == kIsolatedWordPOS) {
120 (*tokens)[0].cost = kIsolatedWordCost;
121 } else {
122 (*tokens)[0].cost = kDefaultCost;
123 }
124 } else {
125 const auto &base_form_token_iter = range.first;
126 // expand all other forms
127 string key_stem = key;
128 string value_stem = value;
129 // assume that conjugation_form[0] contains the suffix of "base form".
130 const StringPiece base_key_suffix =
131 string_array_[base_form_token_iter.key_suffix_index()];
132 const StringPiece base_value_suffix =
133 string_array_[base_form_token_iter.value_suffix_index()];
134
135 if (base_key_suffix.size() < key.size() &&
136 base_value_suffix.size() < value.size() &&
137 Util::EndsWith(key, base_key_suffix) &&
138 Util::EndsWith(value, base_value_suffix)) {
139 key_stem.assign(key, 0, key.size() - base_key_suffix.size());
140 value_stem.assign(value, 0, value.size() - base_value_suffix.size());
141 }
142 for (size_t i = 0; i < size; ++i, ++range.first) {
143 const auto &token_iter = range.first;
144 const StringPiece key_suffix =
145 string_array_[token_iter.key_suffix_index()];
146 const StringPiece value_suffix =
147 string_array_[token_iter.value_suffix_index()];
148 Util::ConcatStrings(key_stem, key_suffix, &(*tokens)[i].key);
149 Util::ConcatStrings(value_stem, value_suffix, &(*tokens)[i].value);
150 (*tokens)[i].id = token_iter.conjugation_id();
151 (*tokens)[i].cost = kDefaultCost;
152 }
153 DCHECK(range.first == range.second);
154 }
155
156 return true;
157 }
158
CreateFromDataManager(const DataManagerInterface & manager)159 UserPOS *UserPOS::CreateFromDataManager(const DataManagerInterface &manager) {
160 StringPiece token_array_data, string_array_data;
161 manager.GetUserPOSData(&token_array_data, &string_array_data);
162 return new UserPOS(token_array_data, string_array_data);
163 }
164
165 } // namespace dictionary
166 } // namespace mozc
167