1 // Copyright 2010-2018, Google Inc.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
6 // met:
7 //
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following disclaimer
12 // in the documentation and/or other materials provided with the
13 // distribution.
14 // * Neither the name of Google Inc. nor the names of its
15 // contributors may be used to endorse or promote products derived from
16 // this software without specific prior written permission.
17 //
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
30 // Generates:
31 // |kCompressedLSize|, |kCompressedRSize|,
32 // |kCompressedLIDTable|, |kCompressedRIDTable|,
33 // |kSegmenterBitArrayData_size|, |kSegmenterBitArrayData_data|
34
35 #include "converter/gen_segmenter_bitarray.h"
36
37 #include <map>
38 #include <string>
39 #include <vector>
40
41 #include "base/bitarray.h"
42 #include "base/file_stream.h"
43 #include "base/logging.h"
44 #include "base/port.h"
45 #include "base/util.h"
46 #include "protocol/segmenter_data.pb.h"
47
48 namespace mozc {
49
50 namespace {
51 class StateTable {
52 public:
StateTable(const size_t size)53 explicit StateTable(const size_t size) : compressed_size_(0) {
54 idarray_.resize(size);
55 }
56
57 // |str| is an 1-dimentional row (or column) represented in byte array.
Add(uint16 id,const string & str)58 void Add(uint16 id, const string &str) {
59 CHECK_LT(id, idarray_.size());
60 idarray_[id] = str;
61 }
62
Build()63 void Build() {
64 compressed_table_.resize(idarray_.size());
65 uint16 id = 0;
66 std::map<string, uint16> dup;
67 for (size_t i = 0; i < idarray_.size(); ++i) {
68 std::map<string, uint16>::const_iterator it = dup.find(idarray_[i]);
69 if (it != dup.end()) {
70 compressed_table_[i] = it->second;
71 } else {
72 compressed_table_[i] = id;
73 dup.insert(std::make_pair(idarray_[i], id));
74 ++id;
75 }
76 }
77
78 compressed_size_ = dup.size();
79
80 // verify
81 for (size_t i = 0; i < idarray_.size(); ++i) {
82 CHECK_LT(compressed_table_[i], compressed_size_);
83 CHECK_EQ(dup[idarray_[i]], compressed_table_[i]);
84 }
85
86 CHECK_LT(compressed_size_, idarray_.size());
87 }
88
id(uint16 id) const89 uint16 id(uint16 id) const {
90 CHECK_LT(id, idarray_.size());
91 return compressed_table_[id];
92 }
93
compressed_size() const94 size_t compressed_size() const { return compressed_size_; }
95
Output(std::ostream * os)96 void Output(std::ostream *os) {
97 const char* data = reinterpret_cast<const char*>(compressed_table_.data());
98 const size_t bytelen = compressed_table_.size() * sizeof(uint16);
99 os->write(data, bytelen);
100 }
101
102 private:
103 std::vector<string> idarray_;
104 std::vector<uint16> compressed_table_;
105 size_t compressed_size_;
106
107 DISALLOW_COPY_AND_ASSIGN(StateTable);
108 };
109 } // namespace
110
GenerateBitarray(int lsize,int rsize,IsBoundaryFunc func,const string & output_size_info,const string & output_ltable,const string & output_rtable,const string & output_bitarray)111 void SegmenterBitarrayGenerator::GenerateBitarray(
112 int lsize, int rsize, IsBoundaryFunc func, const string &output_size_info,
113 const string &output_ltable, const string &output_rtable,
114 const string &output_bitarray) {
115 // Load the original matrix into an array
116 std::vector<uint8> array((lsize + 1) * (rsize + 1));
117
118 for (size_t rid = 0; rid <= lsize; ++rid) {
119 for (size_t lid = 0; lid <= rsize; ++lid) {
120 const uint32 index = rid + lsize * lid;
121 CHECK_LT(index, array.size());
122 if (rid == lsize || lid == rsize) {
123 array[index] = 1;
124 continue;
125 }
126 if ((*func)(rid, lid)) {
127 array[index] = 1;
128 } else {
129 array[index] = 0;
130 }
131 }
132 }
133
134 // Reduce left states (remove dupliacate rows)
135 StateTable ltable(lsize + 1);
136 for (size_t rid = 0; rid <= lsize; ++rid) {
137 string buf;
138 for (size_t lid = 0; lid <= rsize; ++lid) {
139 const uint32 index = rid + lsize * lid;
140 buf += array[index];
141 }
142 ltable.Add(rid, buf);
143 }
144
145 // Reduce right states (remove dupliacate columns)
146 StateTable rtable(rsize + 1);
147 for (size_t lid = 0; lid <= rsize; ++lid) {
148 string buf;
149 for (size_t rid = 0; rid <= lsize; ++rid) {
150 const uint32 index = rid + lsize * lid;
151 buf += array[index];
152 }
153 rtable.Add(lid, buf);
154 }
155
156 // make lookup table
157 rtable.Build();
158 ltable.Build();
159
160 const size_t kCompressedLSize = ltable.compressed_size();
161 const size_t kCompressedRSize = rtable.compressed_size();
162
163 CHECK_GT(kCompressedLSize, 0);
164 CHECK_GT(kCompressedRSize, 0);
165
166 // make bitarray
167 mozc::BitArray barray(kCompressedLSize * kCompressedRSize);
168 for (size_t rid = 0; rid <= lsize; ++rid) {
169 for (size_t lid = 0; lid <= rsize; ++lid) {
170 const int index = rid + lsize * lid;
171 const uint32 cindex = ltable.id(rid) + kCompressedLSize * rtable.id(lid);
172 if (array[index] > 0) {
173 barray.set(cindex);
174 } else {
175 barray.clear(cindex);
176 }
177 }
178 }
179
180 // verify the table
181 for (size_t rid = 0; rid <= lsize; ++rid) {
182 for (size_t lid = 0; lid <= rsize; ++lid) {
183 const int index = rid + lsize * lid;
184 const uint32 cindex = ltable.id(rid) + kCompressedLSize * rtable.id(lid);
185 CHECK_EQ(barray.get(cindex), (array[index] != 0));
186 }
187 }
188
189 CHECK(barray.array());
190 CHECK_GT(barray.size(), 0);
191
192 CHECK(Util::IsLittleEndian())
193 << "Architecture must be little endian";
194 {
195 mozc::converter::SegmenterDataSizeInfo pb;
196 pb.set_compressed_lsize(kCompressedLSize);
197 pb.set_compressed_rsize(kCompressedRSize);
198 mozc::OutputFileStream ofs(output_size_info.c_str(),
199 std::ios_base::out | std::ios_base::binary);
200 CHECK(ofs);
201 CHECK(pb.SerializeToOstream(&ofs));
202 ofs.close();
203 }
204 {
205 mozc::OutputFileStream ofs(output_ltable.c_str(),
206 std::ios_base::out | std::ios_base::binary);
207 CHECK(ofs);
208 ltable.Output(&ofs);
209 ofs.close();
210 }
211 {
212 mozc::OutputFileStream ofs(output_rtable.c_str(),
213 std::ios_base::out | std::ios_base::binary);
214 CHECK(ofs);
215 rtable.Output(&ofs);
216 ofs.close();
217 }
218 {
219 mozc::OutputFileStream ofs(output_bitarray.c_str(),
220 std::ios_base::out | std::ios_base::binary);
221 CHECK(ofs);
222 ofs.write(barray.array(), barray.array_size());
223 ofs.close();
224 }
225 }
226
227 } // namespace mozc
228