1 // Copyright 2010-2018, Google Inc.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
6 // met:
7 //
8 //     * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 //     * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following disclaimer
12 // in the documentation and/or other materials provided with the
13 // distribution.
14 //     * Neither the name of Google Inc. nor the names of its
15 // contributors may be used to endorse or promote products derived from
16 // this software without specific prior written permission.
17 //
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 
30 #include "dictionary/file/codec.h"
31 
32 #include "base/hash.h"
33 #include "base/logging.h"
34 #include "base/port.h"
35 #include "base/util.h"
36 #include "dictionary/file/codec_interface.h"
37 #include "dictionary/file/codec_util.h"
38 #include "dictionary/file/section.h"
39 
40 namespace mozc {
41 namespace dictionary {
42 
DictionaryFileCodec()43 DictionaryFileCodec::DictionaryFileCodec()
44     : seed_(2135654146), filemagic_(20110701) {}
45 
// Nothing to release explicitly; the compiler-generated destructor is used.
DictionaryFileCodec::~DictionaryFileCodec() = default;
47 
WriteSections(const std::vector<DictionaryFileSection> & sections,std::ostream * ofs) const48 void DictionaryFileCodec::WriteSections(
49     const std::vector<DictionaryFileSection> &sections,
50     std::ostream *ofs) const {
51   DCHECK(ofs);
52   WriteHeader(ofs);
53 
54   if (sections.size() == 4) {
55     // In production, the number of sections equals 4.  In this case, write the
56     // sections in the following deterministic order.  This order was determined
57     // by random shuffle for engine version 24 but it's now made deterministic
58     // to obsolte DictionaryFileCodec.
59     for (size_t i : {0, 2, 1, 3}) {
60       WriteSection(sections[i], ofs);
61     }
62   } else {
63     // Some tests don't have four sections.  In this case, simply write sections
64     // in given order.
65     for (const auto &section : sections) {
66       WriteSection(section, ofs);
67     }
68   }
69 
70   filecodec_util::WriteInt(0, ofs);
71 }
72 
WriteHeader(std::ostream * ofs) const73 void DictionaryFileCodec::WriteHeader(std::ostream *ofs) const {
74   DCHECK(ofs);
75   filecodec_util::WriteInt(filemagic_, ofs);
76   filecodec_util::WriteInt(seed_, ofs);
77 }
78 
WriteSection(const DictionaryFileSection & section,std::ostream * ofs) const79 void DictionaryFileCodec::WriteSection(const DictionaryFileSection &section,
80                                        std::ostream *ofs) const {
81   DCHECK(ofs);
82   const string &name = section.name;
83   // name should be encoded
84   // uint64 needs just 8 bytes.
85   DCHECK_EQ(8, name.size());
86   string escaped;
87   Util::Escape(name, &escaped);
88   VLOG(1) << "section=" << escaped << " length=" << section.len;
89   filecodec_util::WriteInt(section.len, ofs);
90   ofs->write(name.data(), name.size());
91 
92   ofs->write(section.ptr, section.len);
93   Pad4(section.len, ofs);
94 }
95 
Pad4(int length,std::ostream * ofs)96 void DictionaryFileCodec::Pad4(int length, std::ostream *ofs) {
97   DCHECK(ofs);
98   for (int i = length; (i % 4) != 0; ++i) {
99     (*ofs) << '\0';
100   }
101 }
102 
// Maps a human-readable section |name| to the name stored in the file: the
// raw in-memory bytes (sizeof(uint64) of them) of the fingerprint of |name|
// computed with the current seed.
string DictionaryFileCodec::GetSectionName(const string &name) const {
  VLOG(1) << "seed\t" << seed_;

  const uint64 fingerprint = Hash::FingerprintWithSeed(name, seed_);
  const char *const fingerprint_bytes =
      reinterpret_cast<const char *>(&fingerprint);
  const string encoded_name(fingerprint_bytes, sizeof(fingerprint));

  // The encoded name is binary, so it is escaped for logging only; the value
  // returned is the unescaped one.
  string printable_name;
  Util::Escape(encoded_name, &printable_name);
  VLOG(1) << "Section name for " << name << ": " << printable_name;

  return encoded_name;
}
113 
ReadSections(const char * image,int length,std::vector<DictionaryFileSection> * sections) const114 bool DictionaryFileCodec::ReadSections(
115     const char *image, int length,
116     std::vector<DictionaryFileSection> *sections) const {
117   DCHECK(sections);
118   const char *ptr = image;
119   const int filemagic = filecodec_util::ReadInt(ptr);
120   CHECK(filemagic == filemagic_)
121       << "invalid dictionary file magic (recompile dictionary?)";
122   ptr += sizeof(filemagic);
123   seed_ = filecodec_util::ReadInt(ptr);
124   ptr += sizeof(seed_);
125   int size;
126   while ((size = filecodec_util::ReadInt(ptr))) {
127     ptr += sizeof(size);
128     // finger print name
129     const string name(ptr, sizeof(uint64));
130     ptr += sizeof(uint64);
131 
132     string escaped;
133     Util::Escape(name, &escaped);
134     VLOG(1) << "section=" << escaped << " length=" << size;
135 
136     sections->push_back(DictionaryFileSection(ptr, size, name));
137 
138     ptr += size;
139     ptr += filecodec_util::Rup4(size);
140     if (image + length < ptr) {
141       return false;
142     }
143   }
144   return true;
145 }
146 
147 }  // namespace dictionary
148 }  // namespace mozc
149