1 // Copyright 2010-2018, Google Inc.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
6 // met:
7 //
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following disclaimer
12 // in the documentation and/or other materials provided with the
13 // distribution.
14 // * Neither the name of Google Inc. nor the names of its
15 // contributors may be used to endorse or promote products derived from
16 // this software without specific prior written permission.
17 //
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
30 #include "storage/lru_storage.h"
31
32 #include <algorithm>
33 #include <cstdlib>
34 #include <cstring>
35 #include <ctime>
36 #include <map>
37 #include <memory>
38 #include <set>
39 #include <string>
40 #include <vector>
41
42 #include "base/clock.h"
43 #include "base/file_stream.h"
44 #include "base/file_util.h"
45 #include "base/hash.h"
46 #include "base/logging.h"
47 #include "base/mmap.h"
48 #include "base/port.h"
49 #include "base/util.h"
50
51 namespace mozc {
52 namespace storage {
53
54 namespace {
55 const size_t kMaxLRUSize = 1000000; // 1M
56 const size_t kMaxValueSize = 1024; // 1024 byte
57
58 template <class T>
ReadValue(char ** ptr,T * value)59 inline void ReadValue(char **ptr, T *value) {
60 memcpy(value, *ptr, sizeof(*value));
61 *ptr += sizeof(*value);
62 }
63
GetFP(const char * ptr)64 uint64 GetFP(const char *ptr) {
65 return *reinterpret_cast<const uint64 *>(ptr);
66 }
67
GetTimeStamp(const char * ptr)68 uint32 GetTimeStamp(const char *ptr) {
69 return *reinterpret_cast<const uint32 *>(ptr + 8);
70 }
71
GetValue(const char * ptr)72 const char* GetValue(const char *ptr) {
73 return ptr + 12;
74 }
75
Update(char * ptr)76 void Update(char *ptr) {
77 const uint32 last_access_time = static_cast<uint32>(Clock::GetTime());
78 memcpy(ptr + 8, reinterpret_cast<const char *>(&last_access_time), 4);
79 }
80
Update(char * ptr,uint64 fp,const char * value,size_t value_size)81 void Update(char *ptr, uint64 fp, const char *value, size_t value_size) {
82 const uint32 last_access_time = static_cast<uint32>(Clock::GetTime());
83 memcpy(ptr, reinterpret_cast<const char *>(&fp), 8);
84 memcpy(ptr + 8, reinterpret_cast<const char *>(&last_access_time), 4);
85 memcpy(ptr + 12, value, value_size);
86 }
87
88 class CompareByTimeStamp {
89 public:
operator ()(const char * a,const char * b) const90 bool operator()(const char *a, const char *b) const {
91 return GetTimeStamp(a) > GetTimeStamp(b);
92 }
93 };
94 } // namespace
95
96 class LRUStorage::Node {
97 public:
Node()98 Node(): next(NULL), prev(NULL), value(NULL) {
99 }
100
101 Node *next;
102 Node *prev;
103 char *value;
104
105 private:
106 DISALLOW_COPY_AND_ASSIGN(Node);
107 };
108
109 class LRUStorage::LRUList {
110 public:
LRUList(size_t max_size)111 explicit LRUList(size_t max_size)
112 : max_size_(max_size), size_(0), last_(NULL), top_(NULL) {
113 }
114
~LRUList()115 ~LRUList() {
116 Clear();
117 }
118
Clear()119 void Clear() {
120 Node *node = top_;
121 while (node != NULL) {
122 Node *next = node->next;
123 delete node;
124 node = next;
125 }
126 size_ = 0;
127 top_ = last_ = NULL;
128 }
129
Add(char * value)130 Node *Add(char *value) {
131 if (size_ < max_size_) {
132 Node *node = new Node;
133 node->value = value;
134 if (last_ == NULL) {
135 node->prev = NULL;
136 top_ = node;
137 } else {
138 last_->next = node;
139 }
140 node->next = NULL;
141 node->prev = last_;
142 last_ = node;
143 ++size_;
144 return node;
145 }
146 LOG(WARNING) << "LRUList is full";
147 return NULL;
148 }
149
empty() const150 bool empty() const {
151 return (top_ == NULL);
152 }
153
size() const154 size_t size() const {
155 return size_;
156 }
157
GetLastNode()158 Node *GetLastNode() {
159 return last_;
160 }
161
MoveToTop(Node * node)162 void MoveToTop(Node *node) {
163 if (node->prev != NULL) { // this is top
164 Node *prev = node->prev;
165 Node *next = node->next;
166 prev->next = next;
167 if (next == NULL) {
168 last_ = prev;
169 } else {
170 next->prev = prev;
171 }
172 node->next = top_;
173 top_->prev = node;
174 top_ = node;
175 top_->prev = NULL;
176 }
177 }
178
179 private:
180 size_t max_size_;
181 size_t size_;
182 Node *last_;
183 Node *top_;
184
185 DISALLOW_COPY_AND_ASSIGN(LRUList);
186 };
187
Create(const char * filename)188 LRUStorage *LRUStorage::Create(const char *filename) {
189 std::unique_ptr<LRUStorage> n(new LRUStorage);
190 if (!n->Open(filename)) {
191 LOG(ERROR) << "could not open LRUStorage";
192 return NULL;
193 }
194 return n.release();
195 }
196
Create(const char * filename,size_t value_size,size_t size,uint32 seed)197 LRUStorage *LRUStorage::Create(const char *filename,
198 size_t value_size,
199 size_t size,
200 uint32 seed) {
201 std::unique_ptr<LRUStorage> n(new LRUStorage);
202 if (!n->OpenOrCreate(filename, value_size, size, seed)) {
203 LOG(ERROR) << "could not open LRUStorage";
204 return NULL;
205 }
206 return n.release();
207 }
208
CreateStorageFile(const char * filename,size_t value_size,size_t size,uint32 seed)209 bool LRUStorage::CreateStorageFile(const char *filename,
210 size_t value_size,
211 size_t size,
212 uint32 seed) {
213 if (value_size == 0 || value_size > kMaxValueSize) {
214 LOG(ERROR) << "value_size is out of range";
215 return false;
216 }
217
218 if (size == 0 || size > kMaxLRUSize) {
219 LOG(ERROR) << "size is out of range";
220 return false;
221 }
222
223 if (value_size % 4 != 0) {
224 LOG(ERROR) << "value_size_ must be 4 byte alignment";
225 return false;
226 }
227
228 OutputFileStream ofs(filename, std::ios::binary | std::ios::out);
229 if (!ofs) {
230 LOG(ERROR) << "cannot open " << filename;
231 return false;
232 }
233
234 const uint32 value_size_uint32 = static_cast<uint32>(value_size);
235 const uint32 size_uint32 = static_cast<uint32>(size);
236
237 ofs.write(reinterpret_cast<const char *>(&value_size_uint32),
238 sizeof(value_size_uint32));
239 ofs.write(reinterpret_cast<const char *>(&size_uint32),
240 sizeof(size_uint32));
241 ofs.write(reinterpret_cast<const char *>(&seed),
242 sizeof(seed));
243 std::vector<char> ary(value_size, '\0');
244 const uint32 last_access_time = 0;
245 const uint64 fp = 0;
246
247 for (size_t i = 0; i < size; ++i) {
248 ofs.write(reinterpret_cast<const char *>(&fp), sizeof(fp));
249 ofs.write(reinterpret_cast<const char *>(&last_access_time),
250 sizeof(last_access_time));
251 ofs.write(reinterpret_cast<const char *>(&ary[0]),
252 static_cast<std::streamsize>(ary.size() * sizeof(ary[0])));
253 }
254
255 return true;
256 }
257
258 // Reopen file after initializing mapped page.
Clear()259 bool LRUStorage::Clear() {
260 // Don't need to clear the page if the lru list is empty
261 if (mmap_.get() == NULL || lru_list_.get() == NULL ||
262 lru_list_->size() == 0) {
263 return true;
264 }
265 const size_t offset =
266 sizeof(value_size_) + sizeof(size_) + sizeof(seed_);
267 if (offset >= mmap_->size()) { // should not happen
268 return false;
269 }
270 memset(mmap_->begin() + offset, '\0', mmap_->size() - offset);
271 lru_list_.reset();
272 map_.clear();
273 Open(mmap_->begin(), mmap_->size());
274 return true;
275 }
276
Merge(const char * filename)277 bool LRUStorage::Merge(const char *filename) {
278 LRUStorage target_storage;
279 if (!target_storage.Open(filename)) {
280 return false;
281 }
282 return Merge(target_storage);
283 }
284
Merge(const LRUStorage & storage)285 bool LRUStorage::Merge(const LRUStorage &storage) {
286 if (storage.value_size() != value_size()) {
287 return false;
288 }
289
290 if (seed_ != storage.seed_) {
291 return false;
292 }
293
294 std::vector<const char *> ary;
295
296 // this file
297 {
298 const char *begin = begin_;
299 const char *end = end_;
300 while (begin < end) {
301 ary.push_back(begin);
302 begin += (value_size_ + 12);
303 }
304 }
305
306 // target file
307 {
308 const char *begin = storage.begin_;
309 const char *end = storage.end_;
310 while (begin < end) {
311 ary.push_back(begin);
312 begin += (value_size_ + 12);
313 }
314 }
315
316 std::stable_sort(ary.begin(), ary.end(), CompareByTimeStamp());
317
318 string buf;
319 std::set<uint64> seen; // remove duplicated entries.
320 for (size_t i = 0; i < ary.size(); ++i) {
321 if (!seen.insert(GetFP(ary[i])).second) {
322 continue;
323 }
324 buf.append(const_cast<const char *>(ary[i]), value_size_ + 12);
325 }
326
327 const size_t old_size = static_cast<size_t>(end_ - begin_);
328 const size_t new_size = std::min(buf.size(), old_size);
329
330 // TODO(taku): this part is not atomic.
331 // If the converter process is killed while memcpy or memset is running,
332 // the storage data will be broken.
333 memcpy(begin_, buf.data(), new_size);
334 if (new_size < old_size) {
335 memset(begin_ + new_size, '\0', old_size - new_size);
336 }
337
338 return Open(mmap_->begin(), mmap_->size());
339 }
340
LRUStorage()341 LRUStorage::LRUStorage()
342 : value_size_(0),
343 size_(0),
344 seed_(0),
345 last_item_(NULL),
346 begin_(NULL), end_(NULL) {}
347
~LRUStorage()348 LRUStorage::~LRUStorage() {
349 Close();
350 }
351
OpenOrCreate(const char * filename,size_t new_value_size,size_t new_size,uint32 new_seed)352 bool LRUStorage::OpenOrCreate(const char *filename,
353 size_t new_value_size,
354 size_t new_size,
355 uint32 new_seed) {
356 if (!FileUtil::FileExists(filename)) {
357 // This is also an expected scenario. Let's create a new data file.
358 VLOG(1) << filename << " does not exist. Creating a new one.";
359 if (!LRUStorage::CreateStorageFile(filename,
360 new_value_size,
361 new_size, new_seed)) {
362 LOG(ERROR) << "CreateStorageFile failed against " << filename;
363 return false;
364 }
365 }
366
367 if (!Open(filename)) {
368 Close();
369 LOG(ERROR) << "Failed to open the file or the data is corrupted. "
370 "So try to recreate new file. filename: " << filename;
371 // If the file exists but is corrupted, the following operation may
372 // may fix some problem. However, if the file was temporarily locked
373 // by some processes and now no longer locked, the following operation
374 // is likely to result in a simple permanent data loss.
375 // TODO(yukawa, team): Do not clear the data whenever we can open the
376 // data file and the content is actually valid.
377 if (!LRUStorage::CreateStorageFile(filename,
378 new_value_size,
379 new_size, new_seed)) {
380 LOG(ERROR) << "CreateStorageFile failed";
381 return false;
382 }
383 if (!Open(filename)) {
384 Close();
385 LOG(ERROR) << "Open failed after CreateStorageFile. Give up...";
386 return false;
387 }
388 }
389
390 // File format has changed
391 if (new_value_size != value_size() || new_size != size()) {
392 Close();
393 if (!LRUStorage::CreateStorageFile(filename, new_value_size,
394 new_size, new_seed)) {
395 LOG(ERROR) << "CreateStorageFile failed";
396 return false;
397 }
398 if (!Open(filename)) {
399 Close();
400 LOG(ERROR) << "Open failed after CreateStorageFile";
401 return false;
402 }
403 }
404
405 if (new_value_size != value_size() || new_size != size()) {
406 Close();
407 LOG(ERROR) << "file is broken";
408 return false;
409 }
410
411 return true;
412 }
413
Open(const char * filename)414 bool LRUStorage::Open(const char *filename) {
415 mmap_.reset(new Mmap);
416
417 if (mmap_.get() == NULL) {
418 LOG(ERROR) << "cannot make Mmap object";
419 return false;
420 }
421
422 if (!mmap_->Open(filename, "r+")) {
423 LOG(ERROR) << "cannot open " << filename
424 << " with read+write mode";
425 return false;
426 }
427
428 if (mmap_->size() < 8) {
429 LOG(ERROR) << "file size is too small";
430 return false;
431 }
432
433 filename_ = filename;
434 return Open(mmap_->begin(), mmap_->size());
435 }
436
Open(char * ptr,size_t ptr_size)437 bool LRUStorage::Open(char *ptr, size_t ptr_size) {
438 begin_ = ptr;
439 end_ = ptr + ptr_size;
440
441 uint32 value_size_uint32 = 0;
442 uint32 size_uint32 = 0;
443
444 ReadValue<uint32>(&begin_, &value_size_uint32);
445 ReadValue<uint32>(&begin_, &size_uint32);
446 ReadValue<uint32>(&begin_, &seed_);
447
448 value_size_ = static_cast<size_t>(value_size_uint32);
449 size_ = static_cast<size_t>(size_uint32);
450
451 if (value_size_ % 4 != 0) {
452 LOG(ERROR) << "value_size_ must be 4 byte alignment";
453 return false;
454 }
455
456 if (size_ == 0 || size_ > kMaxLRUSize) {
457 LOG(ERROR) << "LRU size is invalid: " << size_;
458 return false;
459 }
460
461 if (value_size_ == 0 || value_size_ > kMaxValueSize) {
462 LOG(ERROR) << "value_size is invalid: " << value_size_;
463 return false;
464 }
465
466 const size_t file_size = mmap_->size() - 12;
467 if ((value_size_ + 12) * size_ != file_size) {
468 LOG(ERROR) << "LRU file is broken";
469 return false;
470 }
471
472 std::vector<char *> ary;
473 char *begin = begin_;
474 char *end = end_;
475 while (begin < end) {
476 ary.push_back(begin);
477 begin += (value_size_ + 12);
478 }
479 std::stable_sort(ary.begin(), ary.end(), CompareByTimeStamp());
480
481 lru_list_.reset(new LRUList(size_));
482 map_.clear();
483 last_item_ = NULL;
484 for (size_t i = 0; i < ary.size(); ++i) {
485 if (GetTimeStamp(ary[i]) != 0) {
486 Node *node = lru_list_->Add(ary[i]);
487 map_.insert(std::make_pair(GetFP(ary[i]), node));
488 } else if (last_item_ == NULL) {
489 last_item_ = ary[i];
490 }
491 }
492
493 return true;
494 }
495
Close()496 void LRUStorage::Close() {
497 filename_.clear();
498 mmap_.reset();
499 lru_list_.reset();
500 map_.clear();
501 }
502
Lookup(const string & key) const503 const char* LRUStorage::Lookup(const string &key) const {
504 uint32 last_access_time = 0;
505 return Lookup(key, &last_access_time);
506 }
507
Lookup(const string & key,uint32 * last_access_time) const508 const char* LRUStorage::Lookup(const string &key,
509 uint32 *last_access_time) const {
510 const uint64 fp = Hash::FingerprintWithSeed(key, seed_);
511 std::map<uint64, Node *>::const_iterator it = map_.find(fp);
512 if (it == map_.end()) {
513 return NULL;
514 }
515 *last_access_time = GetTimeStamp(it->second->value);
516 return GetValue(it->second->value);
517 }
518
GetAllValues(std::vector<string> * values) const519 bool LRUStorage::GetAllValues(std::vector<string> *values) const {
520 if (lru_list_.get() == NULL) {
521 return false;
522 }
523 DCHECK(values);
524 values->clear();
525 for (const Node *node = lru_list_->GetLastNode();
526 node != NULL;
527 node = node->prev) {
528 // Default constructor of string is not applicable
529 // because value's size() must return value_size_.
530 DCHECK(node->value);
531 values->push_back(string(GetValue(node->value), value_size_));
532 }
533 std::reverse(values->begin(), values->end());
534 return true;
535 }
536
Touch(const string & key)537 bool LRUStorage::Touch(const string &key) {
538 if (lru_list_.get() == NULL) {
539 return false;
540 }
541
542 const uint64 fp = Hash::FingerprintWithSeed(key, seed_);
543 std::map<uint64, Node *>::iterator it = map_.find(fp);
544 if (it != map_.end()) { // find in the cache
545 Update(it->second->value);
546 lru_list_->MoveToTop(it->second);
547 return true;
548 }
549 return false;
550 }
551
Insert(const string & key,const char * value)552 bool LRUStorage::Insert(const string &key, const char *value) {
553 if (lru_list_.get() == NULL) {
554 return false;
555 }
556
557 const uint64 fp = Hash::FingerprintWithSeed(key, seed_);
558 std::map<uint64, Node *>::iterator it = map_.find(fp);
559 if (it != map_.end()) { // find in the cache
560 Update(it->second->value, fp, value, value_size_);
561 lru_list_->MoveToTop(it->second);
562 } else if (lru_list_->size() >= size_ ||
563 last_item_ == NULL) { // not found, but cache is FULL
564 Node *node = lru_list_->GetLastNode();
565 const uint64 old_fp = GetFP(node->value); // remove oldest item
566 std::map<uint64, Node *>::iterator old_it = map_.find(old_fp);
567 if (old_it != map_.end()) {
568 map_.erase(old_it);
569 }
570 lru_list_->MoveToTop(node);
571 Update(node->value, fp, value, value_size_);
572 map_.insert(std::make_pair(fp, node));
573 } else if (last_item_ < mmap_->end()) { // not found, cahce is not FULL
574 Node *node = lru_list_->Add(last_item_);
575 lru_list_->MoveToTop(node);
576 Update(node->value, fp, value, value_size_);
577 map_.insert(std::make_pair(fp, node));
578 last_item_ += (value_size_ + 12);
579 if (last_item_ >= mmap_->end()) {
580 last_item_ = NULL;
581 }
582 } else {
583 LOG(ERROR) << "insertion failed";
584 return false;
585 }
586
587 return true;
588 }
589
TryInsert(const string & key,const char * value)590 bool LRUStorage::TryInsert(const string &key, const char *value) {
591 if (lru_list_.get() == NULL) {
592 return false;
593 }
594
595 const uint64 fp = Hash::FingerprintWithSeed(key, seed_);
596 std::map<uint64, Node *>::iterator it = map_.find(fp);
597 if (it != map_.end()) { // find in the cache
598 Update(it->second->value, fp, value, value_size_);
599 lru_list_->MoveToTop(it->second);
600 }
601
602 return true;
603 }
604
value_size() const605 size_t LRUStorage::value_size() const {
606 return value_size_;
607 }
608
size() const609 size_t LRUStorage::size() const {
610 return size_;
611 }
612
used_size() const613 size_t LRUStorage::used_size() const {
614 return lru_list_.get() == NULL ? 0 : lru_list_->size();
615 }
616
seed() const617 uint32 LRUStorage::seed() const {
618 return seed_;
619 }
620
filename() const621 const string &LRUStorage::filename() const {
622 return filename_;
623 }
624
Write(size_t i,uint64 fp,const string & value,uint32 last_access_time)625 void LRUStorage::Write(size_t i,
626 uint64 fp,
627 const string &value,
628 uint32 last_access_time) {
629 DCHECK_LT(i, size_);
630 char *ptr = begin_ + (i * (value_size_ + 12));
631 memcpy(ptr, reinterpret_cast<const char *>(&fp), 8);
632 memcpy(ptr + 8, reinterpret_cast<const char *>(&last_access_time), 4);
633 if (value.size() == value_size_) {
634 memcpy(ptr + 12, value.data(), value_size_);
635 } else {
636 LOG(ERROR) << "value size is not " << value_size_ << " byte.";
637 }
638 }
639
Read(size_t i,uint64 * fp,string * value,uint32 * last_access_time) const640 void LRUStorage::Read(size_t i,
641 uint64 *fp,
642 string *value,
643 uint32 *last_access_time) const {
644 DCHECK_LT(i, size_);
645 const char *ptr = begin_ + (i * (value_size_ + 12));
646 *fp = GetFP(ptr);
647 value->assign(GetValue(ptr), value_size_);
648 *last_access_time = GetTimeStamp(ptr);
649 }
650
651 } // namespace storage
652 } // namespace mozc
653