1 // Copyright 2010-2018, Google Inc.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
6 // met:
7 //
8 //     * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 //     * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following disclaimer
12 // in the documentation and/or other materials provided with the
13 // distribution.
14 //     * Neither the name of Google Inc. nor the names of its
15 // contributors may be used to endorse or promote products derived from
16 // this software without specific prior written permission.
17 //
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 
30 #include "storage/lru_storage.h"
31 
32 #include <algorithm>
33 #include <cstdlib>
34 #include <cstring>
35 #include <ctime>
36 #include <map>
37 #include <memory>
38 #include <set>
39 #include <string>
40 #include <vector>
41 
42 #include "base/clock.h"
43 #include "base/file_stream.h"
44 #include "base/file_util.h"
45 #include "base/hash.h"
46 #include "base/logging.h"
47 #include "base/mmap.h"
48 #include "base/port.h"
49 #include "base/util.h"
50 
51 namespace mozc {
52 namespace storage {
53 
54 namespace {
55 const size_t kMaxLRUSize   = 1000000;  // 1M
56 const size_t kMaxValueSize = 1024;     // 1024 byte
57 
58 template <class T>
ReadValue(char ** ptr,T * value)59 inline void ReadValue(char **ptr, T *value) {
60   memcpy(value, *ptr, sizeof(*value));
61   *ptr += sizeof(*value);
62 }
63 
GetFP(const char * ptr)64 uint64 GetFP(const char *ptr) {
65   return *reinterpret_cast<const uint64 *>(ptr);
66 }
67 
GetTimeStamp(const char * ptr)68 uint32 GetTimeStamp(const char *ptr) {
69   return *reinterpret_cast<const uint32 *>(ptr + 8);
70 }
71 
GetValue(const char * ptr)72 const char* GetValue(const char *ptr) {
73   return ptr + 12;
74 }
75 
Update(char * ptr)76 void Update(char *ptr) {
77   const uint32 last_access_time = static_cast<uint32>(Clock::GetTime());
78   memcpy(ptr + 8, reinterpret_cast<const char *>(&last_access_time), 4);
79 }
80 
Update(char * ptr,uint64 fp,const char * value,size_t value_size)81 void Update(char *ptr, uint64 fp, const char *value, size_t value_size) {
82   const uint32 last_access_time = static_cast<uint32>(Clock::GetTime());
83   memcpy(ptr,     reinterpret_cast<const char *>(&fp), 8);
84   memcpy(ptr + 8, reinterpret_cast<const char *>(&last_access_time), 4);
85   memcpy(ptr + 12, value, value_size);
86 }
87 
88 class CompareByTimeStamp {
89  public:
operator ()(const char * a,const char * b) const90   bool operator()(const char *a, const char *b) const {
91     return GetTimeStamp(a) > GetTimeStamp(b);
92   }
93 };
94 }  // namespace
95 
96 class LRUStorage::Node {
97  public:
Node()98   Node(): next(NULL), prev(NULL), value(NULL) {
99   }
100 
101   Node *next;
102   Node *prev;
103   char *value;
104 
105  private:
106   DISALLOW_COPY_AND_ASSIGN(Node);
107 };
108 
109 class LRUStorage::LRUList {
110  public:
LRUList(size_t max_size)111   explicit LRUList(size_t max_size)
112       : max_size_(max_size), size_(0), last_(NULL), top_(NULL) {
113   }
114 
~LRUList()115   ~LRUList() {
116     Clear();
117   }
118 
Clear()119   void Clear() {
120     Node *node = top_;
121     while (node != NULL) {
122       Node *next = node->next;
123       delete node;
124       node = next;
125     }
126     size_ = 0;
127     top_ = last_ = NULL;
128   }
129 
Add(char * value)130   Node *Add(char *value) {
131     if (size_ < max_size_) {
132       Node *node = new Node;
133       node->value = value;
134       if (last_ == NULL) {
135         node->prev = NULL;
136         top_ = node;
137       } else {
138         last_->next = node;
139       }
140       node->next = NULL;
141       node->prev = last_;
142       last_ = node;
143       ++size_;
144       return node;
145     }
146     LOG(WARNING) << "LRUList is full";
147     return NULL;
148   }
149 
empty() const150   bool empty() const {
151     return (top_ == NULL);
152   }
153 
size() const154   size_t size() const {
155     return size_;
156   }
157 
GetLastNode()158   Node *GetLastNode() {
159     return last_;
160   }
161 
MoveToTop(Node * node)162   void MoveToTop(Node *node) {
163     if (node->prev != NULL) {  // this is top
164       Node *prev = node->prev;
165       Node *next = node->next;
166       prev->next = next;
167       if (next == NULL) {
168         last_ = prev;
169       } else {
170         next->prev = prev;
171       }
172       node->next = top_;
173       top_->prev = node;
174       top_ = node;
175       top_->prev = NULL;
176     }
177   }
178 
179  private:
180   size_t max_size_;
181   size_t size_;
182   Node *last_;
183   Node *top_;
184 
185   DISALLOW_COPY_AND_ASSIGN(LRUList);
186 };
187 
Create(const char * filename)188 LRUStorage *LRUStorage::Create(const char *filename) {
189   std::unique_ptr<LRUStorage> n(new LRUStorage);
190   if (!n->Open(filename)) {
191     LOG(ERROR) << "could not open LRUStorage";
192     return NULL;
193   }
194   return n.release();
195 }
196 
Create(const char * filename,size_t value_size,size_t size,uint32 seed)197 LRUStorage *LRUStorage::Create(const char *filename,
198                                size_t value_size,
199                                size_t size,
200                                uint32 seed) {
201   std::unique_ptr<LRUStorage> n(new LRUStorage);
202   if (!n->OpenOrCreate(filename, value_size, size, seed)) {
203     LOG(ERROR) << "could not open LRUStorage";
204     return NULL;
205   }
206   return n.release();
207 }
208 
CreateStorageFile(const char * filename,size_t value_size,size_t size,uint32 seed)209 bool LRUStorage::CreateStorageFile(const char *filename,
210                                    size_t value_size,
211                                    size_t size,
212                                    uint32 seed) {
213   if (value_size == 0 || value_size > kMaxValueSize) {
214     LOG(ERROR) << "value_size is out of range";
215     return false;
216   }
217 
218   if (size == 0 || size > kMaxLRUSize) {
219     LOG(ERROR) << "size is out of range";
220     return false;
221   }
222 
223   if (value_size % 4 != 0) {
224     LOG(ERROR) << "value_size_ must be 4 byte alignment";
225     return false;
226   }
227 
228   OutputFileStream ofs(filename, std::ios::binary | std::ios::out);
229   if (!ofs) {
230     LOG(ERROR) << "cannot open " << filename;
231     return false;
232   }
233 
234   const uint32 value_size_uint32 = static_cast<uint32>(value_size);
235   const uint32 size_uint32 = static_cast<uint32>(size);
236 
237   ofs.write(reinterpret_cast<const char *>(&value_size_uint32),
238             sizeof(value_size_uint32));
239   ofs.write(reinterpret_cast<const char *>(&size_uint32),
240             sizeof(size_uint32));
241   ofs.write(reinterpret_cast<const char *>(&seed),
242             sizeof(seed));
243   std::vector<char> ary(value_size, '\0');
244   const uint32 last_access_time = 0;
245   const uint64 fp = 0;
246 
247   for (size_t i = 0; i < size; ++i) {
248     ofs.write(reinterpret_cast<const char *>(&fp), sizeof(fp));
249     ofs.write(reinterpret_cast<const char *>(&last_access_time),
250               sizeof(last_access_time));
251     ofs.write(reinterpret_cast<const char *>(&ary[0]),
252               static_cast<std::streamsize>(ary.size() * sizeof(ary[0])));
253   }
254 
255   return true;
256 }
257 
258 // Reopen file after initializing mapped page.
Clear()259 bool LRUStorage::Clear() {
260   // Don't need to clear the page if the lru list is empty
261   if (mmap_.get() == NULL || lru_list_.get() == NULL ||
262       lru_list_->size() == 0) {
263     return true;
264   }
265   const size_t offset =
266       sizeof(value_size_) + sizeof(size_) + sizeof(seed_);
267   if (offset >= mmap_->size()) {   // should not happen
268     return false;
269   }
270   memset(mmap_->begin() + offset, '\0', mmap_->size() - offset);
271   lru_list_.reset();
272   map_.clear();
273   Open(mmap_->begin(), mmap_->size());
274   return true;
275 }
276 
Merge(const char * filename)277 bool LRUStorage::Merge(const char *filename) {
278   LRUStorage target_storage;
279   if (!target_storage.Open(filename)) {
280     return false;
281   }
282   return Merge(target_storage);
283 }
284 
Merge(const LRUStorage & storage)285 bool LRUStorage::Merge(const LRUStorage &storage) {
286   if (storage.value_size() !=  value_size()) {
287     return false;
288   }
289 
290   if (seed_ != storage.seed_) {
291     return false;
292   }
293 
294   std::vector<const char *> ary;
295 
296   // this file
297   {
298     const char *begin = begin_;
299     const char *end = end_;
300     while (begin < end) {
301       ary.push_back(begin);
302       begin += (value_size_ + 12);
303     }
304   }
305 
306   // target file
307   {
308     const char *begin = storage.begin_;
309     const char *end = storage.end_;
310     while (begin < end) {
311       ary.push_back(begin);
312       begin += (value_size_ + 12);
313     }
314   }
315 
316   std::stable_sort(ary.begin(), ary.end(), CompareByTimeStamp());
317 
318   string buf;
319   std::set<uint64> seen;   // remove duplicated entries.
320   for (size_t i = 0; i < ary.size(); ++i) {
321     if (!seen.insert(GetFP(ary[i])).second) {
322       continue;
323     }
324     buf.append(const_cast<const char *>(ary[i]), value_size_ + 12);
325   }
326 
327   const size_t old_size = static_cast<size_t>(end_ - begin_);
328   const size_t new_size = std::min(buf.size(), old_size);
329 
330   // TODO(taku): this part is not atomic.
331   // If the converter process is killed while memcpy or memset is running,
332   // the storage data will be broken.
333   memcpy(begin_, buf.data(), new_size);
334   if (new_size < old_size) {
335     memset(begin_ + new_size, '\0', old_size - new_size);
336   }
337 
338   return Open(mmap_->begin(), mmap_->size());
339 }
340 
LRUStorage()341 LRUStorage::LRUStorage()
342     : value_size_(0),
343       size_(0),
344       seed_(0),
345       last_item_(NULL),
346       begin_(NULL), end_(NULL) {}
347 
~LRUStorage()348 LRUStorage::~LRUStorage() {
349   Close();
350 }
351 
OpenOrCreate(const char * filename,size_t new_value_size,size_t new_size,uint32 new_seed)352 bool LRUStorage::OpenOrCreate(const char *filename,
353                               size_t new_value_size,
354                               size_t new_size,
355                               uint32 new_seed) {
356   if (!FileUtil::FileExists(filename)) {
357     // This is also an expected scenario. Let's create a new data file.
358     VLOG(1) << filename << " does not exist. Creating a new one.";
359     if (!LRUStorage::CreateStorageFile(filename,
360                                        new_value_size,
361                                        new_size, new_seed)) {
362       LOG(ERROR) << "CreateStorageFile failed against " << filename;
363       return false;
364     }
365   }
366 
367   if (!Open(filename)) {
368     Close();
369     LOG(ERROR) << "Failed to open the file or the data is corrupted. "
370                   "So try to recreate new file. filename: " << filename;
371     // If the file exists but is corrupted, the following operation may
372     // may fix some problem. However, if the file was temporarily locked
373     // by some processes and now no longer locked, the following operation
374     // is likely to result in a simple permanent data loss.
375     // TODO(yukawa, team): Do not clear the data whenever we can open the
376     //     data file and the content is actually valid.
377     if (!LRUStorage::CreateStorageFile(filename,
378                                        new_value_size,
379                                        new_size, new_seed)) {
380       LOG(ERROR) << "CreateStorageFile failed";
381       return false;
382     }
383     if (!Open(filename)) {
384       Close();
385       LOG(ERROR) << "Open failed after CreateStorageFile. Give up...";
386       return false;
387     }
388   }
389 
390   // File format has changed
391   if (new_value_size != value_size() || new_size != size()) {
392     Close();
393     if (!LRUStorage::CreateStorageFile(filename, new_value_size,
394                                        new_size, new_seed)) {
395       LOG(ERROR) << "CreateStorageFile failed";
396       return false;
397     }
398     if (!Open(filename)) {
399       Close();
400       LOG(ERROR) << "Open failed after CreateStorageFile";
401       return false;
402     }
403   }
404 
405   if (new_value_size != value_size() || new_size != size()) {
406     Close();
407     LOG(ERROR) << "file is broken";
408     return false;
409   }
410 
411   return true;
412 }
413 
Open(const char * filename)414 bool LRUStorage::Open(const char *filename) {
415   mmap_.reset(new Mmap);
416 
417   if (mmap_.get() == NULL) {
418     LOG(ERROR) << "cannot make Mmap object";
419     return false;
420   }
421 
422   if (!mmap_->Open(filename, "r+")) {
423     LOG(ERROR) << "cannot open " << filename
424                << " with read+write mode";
425     return false;
426   }
427 
428   if (mmap_->size() < 8) {
429     LOG(ERROR) << "file size is too small";
430     return false;
431   }
432 
433   filename_ = filename;
434   return Open(mmap_->begin(), mmap_->size());
435 }
436 
Open(char * ptr,size_t ptr_size)437 bool LRUStorage::Open(char *ptr, size_t ptr_size) {
438   begin_ = ptr;
439   end_ = ptr + ptr_size;
440 
441   uint32 value_size_uint32 = 0;
442   uint32 size_uint32 = 0;
443 
444   ReadValue<uint32>(&begin_, &value_size_uint32);
445   ReadValue<uint32>(&begin_, &size_uint32);
446   ReadValue<uint32>(&begin_, &seed_);
447 
448   value_size_ = static_cast<size_t>(value_size_uint32);
449   size_ = static_cast<size_t>(size_uint32);
450 
451   if (value_size_ % 4 != 0) {
452     LOG(ERROR) << "value_size_ must be 4 byte alignment";
453     return false;
454   }
455 
456   if (size_ == 0 || size_ > kMaxLRUSize) {
457     LOG(ERROR) << "LRU size is invalid: " << size_;
458     return false;
459   }
460 
461   if (value_size_ == 0 || value_size_ > kMaxValueSize) {
462     LOG(ERROR) << "value_size is invalid: " << value_size_;
463     return false;
464   }
465 
466   const size_t file_size = mmap_->size() - 12;
467   if ((value_size_ + 12) * size_ != file_size) {
468     LOG(ERROR) << "LRU file is broken";
469     return false;
470   }
471 
472   std::vector<char *> ary;
473   char *begin = begin_;
474   char *end = end_;
475   while (begin < end) {
476     ary.push_back(begin);
477     begin += (value_size_ + 12);
478   }
479   std::stable_sort(ary.begin(), ary.end(), CompareByTimeStamp());
480 
481   lru_list_.reset(new LRUList(size_));
482   map_.clear();
483   last_item_ = NULL;
484   for (size_t i = 0; i < ary.size(); ++i) {
485     if (GetTimeStamp(ary[i]) != 0) {
486       Node *node = lru_list_->Add(ary[i]);
487       map_.insert(std::make_pair(GetFP(ary[i]), node));
488     } else if (last_item_ == NULL) {
489       last_item_ = ary[i];
490     }
491   }
492 
493   return true;
494 }
495 
Close()496 void LRUStorage::Close() {
497   filename_.clear();
498   mmap_.reset();
499   lru_list_.reset();
500   map_.clear();
501 }
502 
Lookup(const string & key) const503 const char* LRUStorage::Lookup(const string &key) const {
504   uint32 last_access_time = 0;
505   return Lookup(key, &last_access_time);
506 }
507 
Lookup(const string & key,uint32 * last_access_time) const508 const char* LRUStorage::Lookup(const string &key,
509                                uint32 *last_access_time) const {
510   const uint64 fp = Hash::FingerprintWithSeed(key, seed_);
511   std::map<uint64, Node *>::const_iterator it = map_.find(fp);
512   if (it == map_.end()) {
513     return NULL;
514   }
515   *last_access_time = GetTimeStamp(it->second->value);
516   return GetValue(it->second->value);
517 }
518 
GetAllValues(std::vector<string> * values) const519 bool LRUStorage::GetAllValues(std::vector<string> *values) const {
520   if (lru_list_.get() == NULL) {
521     return false;
522   }
523   DCHECK(values);
524   values->clear();
525   for (const Node *node = lru_list_->GetLastNode();
526        node != NULL;
527        node = node->prev) {
528     // Default constructor of string is not applicable
529     // because value's size() must return value_size_.
530     DCHECK(node->value);
531     values->push_back(string(GetValue(node->value), value_size_));
532   }
533   std::reverse(values->begin(), values->end());
534   return true;
535 }
536 
Touch(const string & key)537 bool LRUStorage::Touch(const string &key) {
538   if (lru_list_.get() == NULL) {
539     return false;
540   }
541 
542   const uint64 fp = Hash::FingerprintWithSeed(key, seed_);
543   std::map<uint64, Node *>::iterator it = map_.find(fp);
544   if (it != map_.end()) {     // find in the cache
545     Update(it->second->value);
546     lru_list_->MoveToTop(it->second);
547     return true;
548   }
549   return false;
550 }
551 
Insert(const string & key,const char * value)552 bool LRUStorage::Insert(const string &key, const char *value) {
553   if (lru_list_.get() == NULL) {
554     return false;
555   }
556 
557   const uint64 fp = Hash::FingerprintWithSeed(key, seed_);
558   std::map<uint64, Node *>::iterator it = map_.find(fp);
559   if (it != map_.end()) {     // find in the cache
560     Update(it->second->value, fp, value, value_size_);
561     lru_list_->MoveToTop(it->second);
562   } else if (lru_list_->size() >= size_ ||
563              last_item_ == NULL) {  // not found, but cache is FULL
564     Node *node = lru_list_->GetLastNode();
565     const uint64 old_fp = GetFP(node->value);  // remove oldest item
566     std::map<uint64, Node *>::iterator old_it = map_.find(old_fp);
567     if (old_it != map_.end()) {
568       map_.erase(old_it);
569     }
570     lru_list_->MoveToTop(node);
571     Update(node->value, fp, value, value_size_);
572     map_.insert(std::make_pair(fp, node));
573   } else if (last_item_ < mmap_->end()) {  // not found, cahce is not FULL
574     Node *node = lru_list_->Add(last_item_);
575     lru_list_->MoveToTop(node);
576     Update(node->value, fp, value, value_size_);
577     map_.insert(std::make_pair(fp, node));
578     last_item_ += (value_size_ + 12);
579     if (last_item_ >= mmap_->end()) {
580       last_item_ = NULL;
581     }
582   } else {
583     LOG(ERROR) << "insertion failed";
584     return false;
585   }
586 
587   return true;
588 }
589 
TryInsert(const string & key,const char * value)590 bool LRUStorage::TryInsert(const string &key, const char *value) {
591   if (lru_list_.get() == NULL) {
592     return false;
593   }
594 
595   const uint64 fp = Hash::FingerprintWithSeed(key, seed_);
596   std::map<uint64, Node *>::iterator it = map_.find(fp);
597   if (it != map_.end()) {     // find in the cache
598     Update(it->second->value, fp, value, value_size_);
599     lru_list_->MoveToTop(it->second);
600   }
601 
602   return true;
603 }
604 
value_size() const605 size_t LRUStorage::value_size() const {
606   return value_size_;
607 }
608 
size() const609 size_t LRUStorage::size() const {
610   return size_;
611 }
612 
used_size() const613 size_t LRUStorage::used_size() const {
614   return lru_list_.get() == NULL ? 0 : lru_list_->size();
615 }
616 
seed() const617 uint32 LRUStorage::seed() const {
618   return seed_;
619 }
620 
filename() const621 const string &LRUStorage::filename() const {
622   return filename_;
623 }
624 
Write(size_t i,uint64 fp,const string & value,uint32 last_access_time)625 void LRUStorage::Write(size_t i,
626                        uint64 fp,
627                        const string &value,
628                        uint32 last_access_time) {
629   DCHECK_LT(i, size_);
630   char *ptr = begin_ + (i * (value_size_ + 12));
631   memcpy(ptr,     reinterpret_cast<const char *>(&fp), 8);
632   memcpy(ptr + 8, reinterpret_cast<const char *>(&last_access_time), 4);
633   if (value.size() == value_size_) {
634     memcpy(ptr + 12, value.data(), value_size_);
635   } else {
636     LOG(ERROR) << "value size is not " << value_size_ << " byte.";
637   }
638 }
639 
Read(size_t i,uint64 * fp,string * value,uint32 * last_access_time) const640 void LRUStorage::Read(size_t i,
641                       uint64 *fp,
642                       string *value,
643                       uint32 *last_access_time) const {
644   DCHECK_LT(i, size_);
645   const char *ptr = begin_ + (i * (value_size_ + 12));
646   *fp = GetFP(ptr);
647   value->assign(GetValue(ptr), value_size_);
648   *last_access_time = GetTimeStamp(ptr);
649 }
650 
651 }  // namespace storage
652 }  // namespace mozc
653