1 // Copyright 2010-2018, Google Inc.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
6 // met:
7 //
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following disclaimer
12 // in the documentation and/or other materials provided with the
13 // distribution.
14 // * Neither the name of Google Inc. nor the names of its
15 // contributors may be used to endorse or promote products derived from
16 // this software without specific prior written permission.
17 //
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
30 #include "dictionary/file/codec.h"
31
32 #include "base/hash.h"
33 #include "base/logging.h"
34 #include "base/port.h"
35 #include "base/util.h"
36 #include "dictionary/file/codec_interface.h"
37 #include "dictionary/file/codec_util.h"
38 #include "dictionary/file/section.h"
39
40 namespace mozc {
41 namespace dictionary {
42
DictionaryFileCodec()43 DictionaryFileCodec::DictionaryFileCodec()
44 : seed_(2135654146), filemagic_(20110701) {}
45
46 DictionaryFileCodec::~DictionaryFileCodec() = default;
47
WriteSections(const std::vector<DictionaryFileSection> & sections,std::ostream * ofs) const48 void DictionaryFileCodec::WriteSections(
49 const std::vector<DictionaryFileSection> §ions,
50 std::ostream *ofs) const {
51 DCHECK(ofs);
52 WriteHeader(ofs);
53
54 if (sections.size() == 4) {
55 // In production, the number of sections equals 4. In this case, write the
56 // sections in the following deterministic order. This order was determined
57 // by random shuffle for engine version 24 but it's now made deterministic
58 // to obsolte DictionaryFileCodec.
59 for (size_t i : {0, 2, 1, 3}) {
60 WriteSection(sections[i], ofs);
61 }
62 } else {
63 // Some tests don't have four sections. In this case, simply write sections
64 // in given order.
65 for (const auto §ion : sections) {
66 WriteSection(section, ofs);
67 }
68 }
69
70 filecodec_util::WriteInt(0, ofs);
71 }
72
WriteHeader(std::ostream * ofs) const73 void DictionaryFileCodec::WriteHeader(std::ostream *ofs) const {
74 DCHECK(ofs);
75 filecodec_util::WriteInt(filemagic_, ofs);
76 filecodec_util::WriteInt(seed_, ofs);
77 }
78
WriteSection(const DictionaryFileSection & section,std::ostream * ofs) const79 void DictionaryFileCodec::WriteSection(const DictionaryFileSection §ion,
80 std::ostream *ofs) const {
81 DCHECK(ofs);
82 const string &name = section.name;
83 // name should be encoded
84 // uint64 needs just 8 bytes.
85 DCHECK_EQ(8, name.size());
86 string escaped;
87 Util::Escape(name, &escaped);
88 VLOG(1) << "section=" << escaped << " length=" << section.len;
89 filecodec_util::WriteInt(section.len, ofs);
90 ofs->write(name.data(), name.size());
91
92 ofs->write(section.ptr, section.len);
93 Pad4(section.len, ofs);
94 }
95
Pad4(int length,std::ostream * ofs)96 void DictionaryFileCodec::Pad4(int length, std::ostream *ofs) {
97 DCHECK(ofs);
98 for (int i = length; (i % 4) != 0; ++i) {
99 (*ofs) << '\0';
100 }
101 }
102
// Returns the on-disk name for the section called |name|: the raw bytes of
// the uint64 fingerprint of |name| computed with the current seed_.  The
// result is binary data, not printable text.
string DictionaryFileCodec::GetSectionName(const string &name) const {
  VLOG(1) << "seed\t" << seed_;

  const uint64 fingerprint = Hash::FingerprintWithSeed(name, seed_);
  const char *const raw_bytes = reinterpret_cast<const char *>(&fingerprint);
  const string section_name(raw_bytes, raw_bytes + sizeof(fingerprint));

  // Escape the binary name so that it can be logged safely.
  string printable;
  Util::Escape(section_name, &printable);
  VLOG(1) << "Section name for " << name << ": " << printable;

  return section_name;
}
113
ReadSections(const char * image,int length,std::vector<DictionaryFileSection> * sections) const114 bool DictionaryFileCodec::ReadSections(
115 const char *image, int length,
116 std::vector<DictionaryFileSection> *sections) const {
117 DCHECK(sections);
118 const char *ptr = image;
119 const int filemagic = filecodec_util::ReadInt(ptr);
120 CHECK(filemagic == filemagic_)
121 << "invalid dictionary file magic (recompile dictionary?)";
122 ptr += sizeof(filemagic);
123 seed_ = filecodec_util::ReadInt(ptr);
124 ptr += sizeof(seed_);
125 int size;
126 while ((size = filecodec_util::ReadInt(ptr))) {
127 ptr += sizeof(size);
128 // finger print name
129 const string name(ptr, sizeof(uint64));
130 ptr += sizeof(uint64);
131
132 string escaped;
133 Util::Escape(name, &escaped);
134 VLOG(1) << "section=" << escaped << " length=" << size;
135
136 sections->push_back(DictionaryFileSection(ptr, size, name));
137
138 ptr += size;
139 ptr += filecodec_util::Rup4(size);
140 if (image + length < ptr) {
141 return false;
142 }
143 }
144 return true;
145 }
146
147 } // namespace dictionary
148 } // namespace mozc
149