1 // Copyright 2010-2018, Google Inc.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
6 // met:
7 //
8 //     * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 //     * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following disclaimer
12 // in the documentation and/or other materials provided with the
13 // distribution.
14 //     * Neither the name of Google Inc. nor the names of its
15 // contributors may be used to endorse or promote products derived from
16 // this software without specific prior written permission.
17 //
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 
30 // Generates:
31 // |kCompressedLSize|, |kCompressedRSize|,
32 // |kCompressedLIDTable|, |kCompressedRIDTable|,
33 // |kSegmenterBitArrayData_size|, |kSegmenterBitArrayData_data|
34 
35 #include "converter/gen_segmenter_bitarray.h"
36 
37 #include <map>
38 #include <string>
39 #include <vector>
40 
41 #include "base/bitarray.h"
42 #include "base/file_stream.h"
43 #include "base/logging.h"
44 #include "base/port.h"
45 #include "base/util.h"
46 #include "protocol/segmenter_data.pb.h"
47 
48 namespace mozc {
49 
50 namespace {
51 class StateTable {
52  public:
StateTable(const size_t size)53   explicit StateTable(const size_t size) : compressed_size_(0) {
54     idarray_.resize(size);
55   }
56 
57   // |str| is an 1-dimentional row (or column) represented in byte array.
Add(uint16 id,const string & str)58   void Add(uint16 id, const string &str) {
59     CHECK_LT(id, idarray_.size());
60     idarray_[id] = str;
61   }
62 
Build()63   void Build() {
64     compressed_table_.resize(idarray_.size());
65     uint16 id = 0;
66     std::map<string, uint16> dup;
67     for (size_t i = 0; i < idarray_.size(); ++i) {
68       std::map<string, uint16>::const_iterator it = dup.find(idarray_[i]);
69       if (it != dup.end()) {
70         compressed_table_[i] = it->second;
71       } else {
72         compressed_table_[i] = id;
73         dup.insert(std::make_pair(idarray_[i], id));
74         ++id;
75       }
76     }
77 
78     compressed_size_ = dup.size();
79 
80     // verify
81     for (size_t i = 0; i < idarray_.size(); ++i) {
82       CHECK_LT(compressed_table_[i], compressed_size_);
83       CHECK_EQ(dup[idarray_[i]], compressed_table_[i]);
84     }
85 
86     CHECK_LT(compressed_size_, idarray_.size());
87   }
88 
id(uint16 id) const89   uint16 id(uint16 id) const {
90     CHECK_LT(id, idarray_.size());
91     return compressed_table_[id];
92   }
93 
compressed_size() const94   size_t compressed_size() const { return compressed_size_; }
95 
Output(std::ostream * os)96   void Output(std::ostream *os) {
97     const char* data = reinterpret_cast<const char*>(compressed_table_.data());
98     const size_t bytelen = compressed_table_.size() * sizeof(uint16);
99     os->write(data, bytelen);
100   }
101 
102  private:
103   std::vector<string> idarray_;
104   std::vector<uint16> compressed_table_;
105   size_t compressed_size_;
106 
107   DISALLOW_COPY_AND_ASSIGN(StateTable);
108 };
109 }  // namespace
110 
GenerateBitarray(int lsize,int rsize,IsBoundaryFunc func,const string & output_size_info,const string & output_ltable,const string & output_rtable,const string & output_bitarray)111 void SegmenterBitarrayGenerator::GenerateBitarray(
112     int lsize, int rsize, IsBoundaryFunc func, const string &output_size_info,
113     const string &output_ltable, const string &output_rtable,
114     const string &output_bitarray) {
115   // Load the original matrix into an array
116   std::vector<uint8> array((lsize + 1) * (rsize + 1));
117 
118   for (size_t rid = 0; rid <= lsize; ++rid) {
119     for (size_t lid = 0; lid <= rsize; ++lid) {
120       const uint32 index = rid + lsize * lid;
121       CHECK_LT(index, array.size());
122       if (rid == lsize || lid == rsize) {
123         array[index] = 1;
124         continue;
125       }
126       if ((*func)(rid, lid)) {
127         array[index] = 1;
128       } else {
129         array[index] = 0;
130       }
131     }
132   }
133 
134   // Reduce left states (remove dupliacate rows)
135   StateTable ltable(lsize + 1);
136   for (size_t rid = 0; rid <= lsize; ++rid) {
137     string buf;
138     for (size_t lid = 0; lid <= rsize; ++lid) {
139       const uint32 index = rid + lsize * lid;
140       buf += array[index];
141     }
142     ltable.Add(rid, buf);
143   }
144 
145   // Reduce right states (remove dupliacate columns)
146   StateTable rtable(rsize + 1);
147   for (size_t lid = 0; lid <= rsize; ++lid) {
148     string buf;
149     for (size_t rid = 0; rid <= lsize; ++rid) {
150       const uint32 index = rid + lsize * lid;
151       buf += array[index];
152     }
153     rtable.Add(lid, buf);
154   }
155 
156   // make lookup table
157   rtable.Build();
158   ltable.Build();
159 
160   const size_t kCompressedLSize = ltable.compressed_size();
161   const size_t kCompressedRSize = rtable.compressed_size();
162 
163   CHECK_GT(kCompressedLSize, 0);
164   CHECK_GT(kCompressedRSize, 0);
165 
166   // make bitarray
167   mozc::BitArray barray(kCompressedLSize * kCompressedRSize);
168   for (size_t rid = 0; rid <= lsize; ++rid) {
169     for (size_t lid = 0; lid <= rsize; ++lid) {
170       const int index = rid + lsize * lid;
171       const uint32 cindex = ltable.id(rid) + kCompressedLSize * rtable.id(lid);
172       if (array[index] > 0) {
173         barray.set(cindex);
174       } else {
175         barray.clear(cindex);
176       }
177     }
178   }
179 
180   // verify the table
181   for (size_t rid = 0; rid <= lsize; ++rid) {
182     for (size_t lid = 0; lid <= rsize; ++lid) {
183       const int index = rid + lsize * lid;
184       const uint32 cindex = ltable.id(rid) + kCompressedLSize * rtable.id(lid);
185       CHECK_EQ(barray.get(cindex), (array[index] != 0));
186     }
187   }
188 
189   CHECK(barray.array());
190   CHECK_GT(barray.size(), 0);
191 
192   CHECK(Util::IsLittleEndian())
193       << "Architecture must be little endian";
194   {
195     mozc::converter::SegmenterDataSizeInfo pb;
196     pb.set_compressed_lsize(kCompressedLSize);
197     pb.set_compressed_rsize(kCompressedRSize);
198     mozc::OutputFileStream ofs(output_size_info.c_str(),
199                                std::ios_base::out | std::ios_base::binary);
200     CHECK(ofs);
201     CHECK(pb.SerializeToOstream(&ofs));
202     ofs.close();
203   }
204   {
205     mozc::OutputFileStream ofs(output_ltable.c_str(),
206                                std::ios_base::out | std::ios_base::binary);
207     CHECK(ofs);
208     ltable.Output(&ofs);
209     ofs.close();
210   }
211   {
212     mozc::OutputFileStream ofs(output_rtable.c_str(),
213                                std::ios_base::out | std::ios_base::binary);
214     CHECK(ofs);
215     rtable.Output(&ofs);
216     ofs.close();
217   }
218   {
219     mozc::OutputFileStream ofs(output_bitarray.c_str(),
220                                std::ios_base::out | std::ios_base::binary);
221     CHECK(ofs);
222     ofs.write(barray.array(), barray.array_size());
223     ofs.close();
224   }
225 }
226 
227 }  // namespace mozc
228