1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "chrome/browser/devtools/devtools_file_system_indexer.h"
6 
7 #include <stddef.h>
8 
9 #include <iterator>
10 
11 #include "base/bind.h"
12 #include "base/callback.h"
13 #include "base/check_op.h"
14 #include "base/files/file_enumerator.h"
15 #include "base/files/file_util.h"
16 #include "base/lazy_instance.h"
17 #include "base/macros.h"
18 #include "base/sequence_checker.h"
19 #include "base/stl_util.h"
20 #include "base/strings/string_util.h"
21 #include "base/strings/utf_string_conversions.h"
22 #include "base/task/lazy_thread_pool_task_runner.h"
23 #include "content/public/browser/browser_task_traits.h"
24 
25 #include "content/public/browser/browser_thread.h"
26 
27 using base::Bind;
28 using base::Callback;
29 using base::FileEnumerator;
30 using base::FilePath;
31 using base::Time;
32 using base::TimeDelta;
33 using base::TimeTicks;
34 using content::BrowserThread;
35 using std::map;
36 using std::string;
37 using std::vector;
38 
39 namespace {
40 
41 using std::set;
42 
impl_task_runner()43 base::SequencedTaskRunner* impl_task_runner() {
44   constexpr base::TaskTraits kBlockingTraits = {
45       base::MayBlock(), base::TaskPriority::BEST_EFFORT};
46   static base::LazyThreadPoolSequencedTaskRunner s_sequenced_task_task_runner =
47       LAZY_THREAD_POOL_SEQUENCED_TASK_RUNNER_INITIALIZER(kBlockingTraits);
48   return s_sequenced_task_task_runner.Get().get();
49 }
50 
51 typedef int32_t Trigram;
52 typedef char TrigramChar;
53 typedef uint16_t FileId;
54 
55 const int kMinTimeoutBetweenWorkedNitification = 200;
56 // Trigram characters include all ASCII printable characters (32-126) except for
57 // the capital letters, because the index is case insensitive.
58 const size_t kTrigramCharacterCount = 126 - 'Z' - 1 + 'A' - ' ' + 1;
59 const size_t kTrigramCount =
60     kTrigramCharacterCount * kTrigramCharacterCount * kTrigramCharacterCount;
61 const int kMaxReadLength = 10 * 1024;
62 const TrigramChar kUndefinedTrigramChar = -1;
63 const TrigramChar kBinaryTrigramChar = -2;
64 const Trigram kUndefinedTrigram = -1;
65 
66 class Index {
67  public:
68   Index();
69   // Index is only instantiated as a leak LazyInstance, so the destructor is
70   // never called.
71   ~Index() = delete;
72 
73   Time LastModifiedTimeForFile(const FilePath& file_path);
74   void SetTrigramsForFile(const FilePath& file_path,
75                           const vector<Trigram>& index,
76                           const Time& time);
77   vector<FilePath> Search(const string& query);
78   void NormalizeVectors();
79   void Reset();
80   void EnsureInitialized();
81 
82  private:
83   FileId GetFileId(const FilePath& file_path);
84 
85   typedef map<FilePath, FileId> FileIdsMap;
86   FileIdsMap file_ids_;
87   FileId last_file_id_;
88   // The index in this vector is the trigram id.
89   vector<vector<FileId> > index_;
90   typedef map<FilePath, Time> IndexedFilesMap;
91   IndexedFilesMap index_times_;
92   vector<bool> is_normalized_;
93   SEQUENCE_CHECKER(sequence_checker_);
94 
95   DISALLOW_COPY_AND_ASSIGN(Index);
96 };
97 
98 base::LazyInstance<Index>::Leaky g_trigram_index = LAZY_INSTANCE_INITIALIZER;
99 
TrigramCharForChar(char c)100 TrigramChar TrigramCharForChar(char c) {
101   static TrigramChar* trigram_chars = nullptr;
102   if (!trigram_chars) {
103     trigram_chars = new TrigramChar[256];
104     for (size_t i = 0; i < 256; ++i) {
105       if (i > 127) {
106         trigram_chars[i] = kUndefinedTrigramChar;
107         continue;
108       }
109       char ch = static_cast<char>(i);
110       if (ch == '\t')
111         ch = ' ';
112       if (base::IsAsciiUpper(ch))
113         ch = ch - 'A' + 'a';
114 
115       bool is_binary_char = ch < 9 || (ch >= 14 && ch < 32) || ch == 127;
116       if (is_binary_char) {
117         trigram_chars[i] = kBinaryTrigramChar;
118         continue;
119       }
120 
121       if (ch < ' ') {
122         trigram_chars[i] = kUndefinedTrigramChar;
123         continue;
124       }
125 
126       if (ch >= 'Z')
127         ch = ch - 'Z' - 1 + 'A';
128       ch -= ' ';
129       char signed_trigram_count = static_cast<char>(kTrigramCharacterCount);
130       CHECK(ch >= 0 && ch < signed_trigram_count);
131       trigram_chars[i] = ch;
132     }
133   }
134   unsigned char uc = static_cast<unsigned char>(c);
135   return trigram_chars[uc];
136 }
137 
TrigramAtIndex(const vector<TrigramChar> & trigram_chars,size_t index)138 Trigram TrigramAtIndex(const vector<TrigramChar>& trigram_chars, size_t index) {
139   static int kTrigramCharacterCountSquared =
140       kTrigramCharacterCount * kTrigramCharacterCount;
141   if (trigram_chars[index] == kUndefinedTrigramChar ||
142       trigram_chars[index + 1] == kUndefinedTrigramChar ||
143       trigram_chars[index + 2] == kUndefinedTrigramChar)
144     return kUndefinedTrigram;
145   Trigram trigram = kTrigramCharacterCountSquared * trigram_chars[index] +
146                     kTrigramCharacterCount * trigram_chars[index + 1] +
147                     trigram_chars[index + 2];
148   return trigram;
149 }
150 
Index()151 Index::Index() : last_file_id_(0) {
152   Reset();
153 }
154 
Reset()155 void Index::Reset() {
156   file_ids_.clear();
157   index_.clear();
158   index_times_.clear();
159   is_normalized_.clear();
160   last_file_id_ = 0;
161 }
162 
EnsureInitialized()163 void Index::EnsureInitialized() {
164   if (!index_.empty())
165     return;
166   index_.resize(kTrigramCount);
167   is_normalized_.resize(kTrigramCount);
168   std::fill(is_normalized_.begin(), is_normalized_.end(), true);
169 }
170 
LastModifiedTimeForFile(const FilePath & file_path)171 Time Index::LastModifiedTimeForFile(const FilePath& file_path) {
172   DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
173   EnsureInitialized();
174   Time last_modified_time;
175   if (index_times_.find(file_path) != index_times_.end())
176     last_modified_time = index_times_[file_path];
177   return last_modified_time;
178 }
179 
SetTrigramsForFile(const FilePath & file_path,const vector<Trigram> & index,const Time & time)180 void Index::SetTrigramsForFile(const FilePath& file_path,
181                                const vector<Trigram>& index,
182                                const Time& time) {
183   DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
184   EnsureInitialized();
185   FileId file_id = GetFileId(file_path);
186   auto it = index.begin();
187   for (; it != index.end(); ++it) {
188     Trigram trigram = *it;
189     index_[trigram].push_back(file_id);
190     is_normalized_[trigram] = false;
191   }
192   index_times_[file_path] = time;
193 }
194 
Search(const string & query)195 vector<FilePath> Index::Search(const string& query) {
196   DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
197   EnsureInitialized();
198   const char* data = query.c_str();
199   vector<TrigramChar> trigram_chars;
200   trigram_chars.reserve(query.size());
201   for (size_t i = 0; i < query.size(); ++i) {
202       TrigramChar trigram_char = TrigramCharForChar(data[i]);
203       if (trigram_char == kBinaryTrigramChar)
204         trigram_char = kUndefinedTrigramChar;
205       trigram_chars.push_back(trigram_char);
206   }
207   vector<Trigram> trigrams;
208   for (size_t i = 0; i + 2 < query.size(); ++i) {
209     Trigram trigram = TrigramAtIndex(trigram_chars, i);
210     if (trigram != kUndefinedTrigram)
211       trigrams.push_back(trigram);
212   }
213   set<FileId> file_ids;
214   bool first = true;
215   vector<Trigram>::const_iterator it = trigrams.begin();
216   for (; it != trigrams.end(); ++it) {
217     Trigram trigram = *it;
218     if (first) {
219       std::copy(index_[trigram].begin(),
220                 index_[trigram].end(),
221                 std::inserter(file_ids, file_ids.begin()));
222       first = false;
223       continue;
224     }
225     set<FileId> intersection = base::STLSetIntersection<set<FileId> >(
226         file_ids, index_[trigram]);
227     file_ids.swap(intersection);
228   }
229   vector<FilePath> result;
230   FileIdsMap::const_iterator ids_it = file_ids_.begin();
231   for (; ids_it != file_ids_.end(); ++ids_it) {
232     if (trigrams.empty() || file_ids.find(ids_it->second) != file_ids.end()) {
233       result.push_back(ids_it->first);
234     }
235   }
236   return result;
237 }
238 
GetFileId(const FilePath & file_path)239 FileId Index::GetFileId(const FilePath& file_path) {
240   DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
241   EnsureInitialized();
242   string file_path_str = file_path.AsUTF8Unsafe();
243   if (file_ids_.find(file_path) != file_ids_.end())
244     return file_ids_[file_path];
245   file_ids_[file_path] = ++last_file_id_;
246   return last_file_id_;
247 }
248 
NormalizeVectors()249 void Index::NormalizeVectors() {
250   DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
251   EnsureInitialized();
252   for (size_t i = 0; i < kTrigramCount; ++i) {
253     if (!is_normalized_[i]) {
254       std::sort(index_[i].begin(), index_[i].end());
255       if (index_[i].capacity() > index_[i].size())
256         vector<FileId>(index_[i]).swap(index_[i]);
257       is_normalized_[i] = true;
258     }
259   }
260 }
261 
262 typedef Callback<void(bool, const vector<bool>&)> IndexerCallback;
263 
264 }  // namespace
265 
FileSystemIndexingJob(const FilePath & file_system_path,const std::vector<base::FilePath> & excluded_folders,TotalWorkCallback total_work_callback,const WorkedCallback & worked_callback,DoneCallback done_callback)266 DevToolsFileSystemIndexer::FileSystemIndexingJob::FileSystemIndexingJob(
267     const FilePath& file_system_path,
268     const std::vector<base::FilePath>& excluded_folders,
269     TotalWorkCallback total_work_callback,
270     const WorkedCallback& worked_callback,
271     DoneCallback done_callback)
272     : file_system_path_(file_system_path),
273       excluded_folders_(excluded_folders),
274       total_work_callback_(std::move(total_work_callback)),
275       worked_callback_(worked_callback),
276       done_callback_(std::move(done_callback)),
277       files_indexed_(0),
278       stopped_(false) {
279   current_trigrams_set_.resize(kTrigramCount);
280   current_trigrams_.reserve(kTrigramCount);
281   pending_folders_.push_back(file_system_path);
282 }
283 
~FileSystemIndexingJob()284 DevToolsFileSystemIndexer::FileSystemIndexingJob::~FileSystemIndexingJob() {}
285 
Start()286 void DevToolsFileSystemIndexer::FileSystemIndexingJob::Start() {
287   DCHECK_CURRENTLY_ON(BrowserThread::UI);
288   impl_task_runner()->PostTask(
289       FROM_HERE, BindOnce(&FileSystemIndexingJob::CollectFilesToIndex, this));
290 }
291 
Stop()292 void DevToolsFileSystemIndexer::FileSystemIndexingJob::Stop() {
293   DCHECK_CURRENTLY_ON(BrowserThread::UI);
294   impl_task_runner()->PostTask(
295       FROM_HERE, BindOnce(&FileSystemIndexingJob::StopOnImplSequence, this));
296 }
297 
StopOnImplSequence()298 void DevToolsFileSystemIndexer::FileSystemIndexingJob::StopOnImplSequence() {
299   stopped_ = true;
300 }
301 
CollectFilesToIndex()302 void DevToolsFileSystemIndexer::FileSystemIndexingJob::CollectFilesToIndex() {
303   DCHECK(impl_task_runner()->RunsTasksInCurrentSequence());
304   if (stopped_)
305     return;
306   if (!file_enumerator_) {
307     file_enumerator_.reset(new FileEnumerator(
308         pending_folders_.back(), false,
309         FileEnumerator::FILES | FileEnumerator::DIRECTORIES));
310     pending_folders_.pop_back();
311   }
312   FilePath file_path = file_enumerator_->Next();
313   if (file_path.empty() && !pending_folders_.empty()) {
314     file_enumerator_.reset(new FileEnumerator(
315         pending_folders_.back(), false,
316         FileEnumerator::FILES | FileEnumerator::DIRECTORIES));
317     pending_folders_.pop_back();
318     impl_task_runner()->PostTask(
319         FROM_HERE, BindOnce(&FileSystemIndexingJob::CollectFilesToIndex, this));
320     return;
321   }
322 
323   if (file_path.empty()) {
324     content::GetUIThreadTaskRunner({})->PostTask(
325         FROM_HERE, BindOnce(std::move(total_work_callback_), file_path_times_.size()));
326     indexing_it_ = file_path_times_.begin();
327     IndexFiles();
328     return;
329   }
330   if (file_enumerator_->GetInfo().IsDirectory()) {
331     bool excluded = false;
332     for (const FilePath& excluded_folder : excluded_folders_) {
333       excluded = excluded_folder.IsParent(file_path);
334       if (excluded)
335         break;
336     }
337     if (!excluded)
338       pending_folders_.push_back(file_path);
339     impl_task_runner()->PostTask(
340         FROM_HERE, BindOnce(&FileSystemIndexingJob::CollectFilesToIndex, this));
341     return;
342   }
343 
344   Time saved_last_modified_time =
345       g_trigram_index.Get().LastModifiedTimeForFile(file_path);
346   FileEnumerator::FileInfo file_info = file_enumerator_->GetInfo();
347   Time current_last_modified_time = file_info.GetLastModifiedTime();
348   if (current_last_modified_time > saved_last_modified_time) {
349     file_path_times_[file_path] = current_last_modified_time;
350   }
351   impl_task_runner()->PostTask(
352       FROM_HERE, BindOnce(&FileSystemIndexingJob::CollectFilesToIndex, this));
353 }
354 
IndexFiles()355 void DevToolsFileSystemIndexer::FileSystemIndexingJob::IndexFiles() {
356   DCHECK(impl_task_runner()->RunsTasksInCurrentSequence());
357   if (stopped_)
358     return;
359   if (indexing_it_ == file_path_times_.end()) {
360     g_trigram_index.Get().NormalizeVectors();
361     content::GetUIThreadTaskRunner({})->PostTask(FROM_HERE, std::move(done_callback_));
362     return;
363   }
364   FilePath file_path = indexing_it_->first;
365   current_file_.Initialize(file_path,
366                            base::File::FLAG_OPEN | base::File::FLAG_READ);
367 
368   if (!current_file_.IsValid()) {
369     FinishFileIndexing(false);
370     return;
371   }
372   current_file_offset_ = 0;
373   current_trigrams_.clear();
374   std::fill(current_trigrams_set_.begin(), current_trigrams_set_.end(), false);
375   ReadFromFile();
376 }
377 
ReadFromFile()378 void DevToolsFileSystemIndexer::FileSystemIndexingJob::ReadFromFile() {
379   if (stopped_) {
380     CloseFile();
381     return;
382   }
383   std::unique_ptr<char[]> data_ptr(new char[kMaxReadLength]);
384   const char* const data = data_ptr.get();
385   int bytes_read =
386       current_file_.Read(current_file_offset_, data_ptr.get(), kMaxReadLength);
387   if (bytes_read < 0) {
388     FinishFileIndexing(false);
389     return;
390   }
391 
392   if (bytes_read < 3) {
393     FinishFileIndexing(true);
394     return;
395   }
396 
397   size_t size = static_cast<size_t>(bytes_read);
398   vector<TrigramChar> trigram_chars;
399   trigram_chars.reserve(size);
400   for (size_t i = 0; i < size; ++i) {
401     TrigramChar trigram_char = TrigramCharForChar(data[i]);
402     if (trigram_char == kBinaryTrigramChar) {
403       current_trigrams_.clear();
404       FinishFileIndexing(true);
405       return;
406     }
407     trigram_chars.push_back(trigram_char);
408   }
409 
410   for (size_t i = 0; i + 2 < size; ++i) {
411     Trigram trigram = TrigramAtIndex(trigram_chars, i);
412     if ((trigram != kUndefinedTrigram) && !current_trigrams_set_[trigram]) {
413       current_trigrams_set_[trigram] = true;
414       current_trigrams_.push_back(trigram);
415     }
416   }
417   current_file_offset_ += bytes_read - 2;
418   impl_task_runner()->PostTask(
419       FROM_HERE, base::BindOnce(&FileSystemIndexingJob::ReadFromFile, this));
420 }
421 
FinishFileIndexing(bool success)422 void DevToolsFileSystemIndexer::FileSystemIndexingJob::FinishFileIndexing(
423     bool success) {
424   DCHECK(impl_task_runner()->RunsTasksInCurrentSequence());
425   CloseFile();
426   if (success) {
427     FilePath file_path = indexing_it_->first;
428     g_trigram_index.Get().SetTrigramsForFile(
429         file_path, current_trigrams_, file_path_times_[file_path]);
430   }
431   ReportWorked();
432   ++indexing_it_;
433   impl_task_runner()->PostTask(
434       FROM_HERE, base::BindOnce(&FileSystemIndexingJob::IndexFiles, this));
435 }
436 
CloseFile()437 void DevToolsFileSystemIndexer::FileSystemIndexingJob::CloseFile() {
438   if (current_file_.IsValid())
439     current_file_.Close();
440 }
441 
ReportWorked()442 void DevToolsFileSystemIndexer::FileSystemIndexingJob::ReportWorked() {
443   TimeTicks current_time = TimeTicks::Now();
444   bool should_send_worked_nitification = true;
445   if (!last_worked_notification_time_.is_null()) {
446     TimeDelta delta = current_time - last_worked_notification_time_;
447     if (delta.InMilliseconds() < kMinTimeoutBetweenWorkedNitification)
448       should_send_worked_nitification = false;
449   }
450   ++files_indexed_;
451   if (should_send_worked_nitification) {
452     last_worked_notification_time_ = current_time;
453     content::GetUIThreadTaskRunner({})->PostTask(
454         FROM_HERE, BindOnce(worked_callback_, files_indexed_));
455     files_indexed_ = 0;
456   }
457 }
458 
459 static int g_instance_count = 0;
460 
DevToolsFileSystemIndexer()461 DevToolsFileSystemIndexer::DevToolsFileSystemIndexer() {
462   impl_task_runner()->PostTask(FROM_HERE,
463                                base::BindOnce([]() { ++g_instance_count; }));
464 }
465 
~DevToolsFileSystemIndexer()466 DevToolsFileSystemIndexer::~DevToolsFileSystemIndexer() {
467   impl_task_runner()->PostTask(FROM_HERE, base::BindOnce([]() {
468                                  --g_instance_count;
469                                  if (!g_instance_count)
470                                    g_trigram_index.Get().Reset();
471                                }));
472 }
473 
474 scoped_refptr<DevToolsFileSystemIndexer::FileSystemIndexingJob>
IndexPath(const string & file_system_path,const vector<string> & excluded_folders,TotalWorkCallback total_work_callback,const WorkedCallback & worked_callback,DoneCallback done_callback)475 DevToolsFileSystemIndexer::IndexPath(
476     const string& file_system_path,
477     const vector<string>& excluded_folders,
478     TotalWorkCallback total_work_callback,
479     const WorkedCallback& worked_callback,
480     DoneCallback done_callback) {
481   DCHECK_CURRENTLY_ON(BrowserThread::UI);
482   vector<base::FilePath> paths;
483   for (const string& path : excluded_folders) {
484     paths.push_back(FilePath::FromUTF8Unsafe(path));
485   }
486   scoped_refptr<FileSystemIndexingJob> indexing_job = new FileSystemIndexingJob(
487       FilePath::FromUTF8Unsafe(file_system_path), paths, std::move(total_work_callback),
488       worked_callback, std::move(done_callback));
489   indexing_job->Start();
490   return indexing_job;
491 }
492 
SearchInPath(const std::string & file_system_path,const std::string & query,SearchCallback callback)493 void DevToolsFileSystemIndexer::SearchInPath(
494     const std::string& file_system_path,
495     const std::string& query,
496     SearchCallback callback) {
497   DCHECK_CURRENTLY_ON(BrowserThread::UI);
498   impl_task_runner()->PostTask(
499       FROM_HERE,
500       BindOnce(&DevToolsFileSystemIndexer::SearchInPathOnImplSequence, this,
501                file_system_path, query, std::move(callback)));
502 }
503 
SearchInPathOnImplSequence(const std::string & file_system_path,const std::string & query,SearchCallback callback)504 void DevToolsFileSystemIndexer::SearchInPathOnImplSequence(
505     const std::string& file_system_path,
506     const std::string& query,
507     SearchCallback callback) {
508   DCHECK(impl_task_runner()->RunsTasksInCurrentSequence());
509   vector<FilePath> file_paths = g_trigram_index.Get().Search(query);
510   vector<string> result;
511   FilePath path = FilePath::FromUTF8Unsafe(file_system_path);
512   vector<FilePath>::const_iterator it = file_paths.begin();
513   for (; it != file_paths.end(); ++it) {
514     if (path.IsParent(*it))
515       result.push_back(it->AsUTF8Unsafe());
516   }
517   content::GetUIThreadTaskRunner({})->PostTask(
518       FROM_HERE, BindOnce(std::move(callback), std::move(result)));
519 }
520