1 // Copyright 2020 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "chromeos/components/local_search_service/inverted_index.h"
6 
7 #include <cmath>
8 #include <string>
9 #include <unordered_map>
10 #include <vector>
11 
12 #include "base/bind.h"
13 #include "base/strings/string16.h"
14 #include "base/strings/utf_string_conversions.h"
15 #include "base/test/task_environment.h"
16 #include "chromeos/components/local_search_service/shared_structs.h"
17 #include "chromeos/components/local_search_service/test_utils.h"
18 #include "testing/gmock/include/gmock/gmock.h"
19 #include "testing/gtest/include/gtest/gtest.h"
20 
21 namespace chromeos {
22 namespace local_search_service {
23 
24 namespace {
25 
GetScoresFromTfidfResult(const std::vector<TfidfResult> & results)26 std::vector<float> GetScoresFromTfidfResult(
27     const std::vector<TfidfResult>& results) {
28   std::vector<float> scores;
29   for (const auto& item : results) {
30     scores.push_back(std::roundf(std::get<2>(item) * 100) / 100.0);
31   }
32   return scores;
33 }
34 
35 constexpr double kDefaultWeight = 1.0;
36 
37 }  // namespace
38 
39 class InvertedIndexTest : public ::testing::Test {
40  public:
InvertedIndexTest()41   InvertedIndexTest() {
42     index_.RegisterIndexBuiltCallback(base::BindRepeating(
43         &InvertedIndexTest::OnIndexBuilt, base::Unretained(this)));
44   }
45 
SetUp()46   void SetUp() override {
47     // All content weights are initialized to |kDefaultWeight|.
48     index_.doc_length_ =
49         std::unordered_map<std::string, uint32_t>({{"doc1", 8}, {"doc2", 6}});
50 
51     index_.dictionary_[base::UTF8ToUTF16("A")] = PostingList(
52         {{"doc1",
53           Posting({WeightedPosition(kDefaultWeight, Position("header", 1, 1)),
54                    WeightedPosition(kDefaultWeight, Position("header", 3, 1)),
55                    WeightedPosition(kDefaultWeight, Position("body", 5, 1)),
56                    WeightedPosition(kDefaultWeight, Position("body", 7, 1))})},
57          {"doc2",
58           Posting(
59               {WeightedPosition(kDefaultWeight, Position("header", 2, 1)),
60                WeightedPosition(kDefaultWeight, Position("header", 4, 1))})}});
61 
62     index_.dictionary_[base::UTF8ToUTF16("B")] = PostingList(
63         {{"doc1",
64           Posting(
65               {WeightedPosition(kDefaultWeight, Position("header", 2, 1)),
66                WeightedPosition(kDefaultWeight, Position("body", 4, 1)),
67                WeightedPosition(kDefaultWeight, Position("header", 6, 1)),
68                WeightedPosition(kDefaultWeight, Position("body", 8, 1))})}});
69 
70     index_.dictionary_[base::UTF8ToUTF16("C")] = PostingList(
71         {{"doc2",
72           Posting(
73               {WeightedPosition(kDefaultWeight, Position("header", 1, 1)),
74                WeightedPosition(kDefaultWeight, Position("body", 3, 1)),
75                WeightedPosition(kDefaultWeight, Position("header", 5, 1)),
76                WeightedPosition(kDefaultWeight, Position("body", 7, 1))})}});
77     index_.terms_to_be_updated_.insert(base::UTF8ToUTF16("A"));
78     index_.terms_to_be_updated_.insert(base::UTF8ToUTF16("B"));
79     index_.terms_to_be_updated_.insert(base::UTF8ToUTF16("C"));
80 
81     // Manually set |is_index_built_| below because the docs above were not
82     // added to the index using the AddOrUpdate method.
83     index_.is_index_built_ = false;
84   }
85 
FindTerm(const base::string16 & term)86   PostingList FindTerm(const base::string16& term) {
87     return index_.FindTerm(term);
88   }
89 
FindMatchingDocumentsApproximately(const std::unordered_set<base::string16> & terms,double prefix_threshold,double block_threshold)90   std::vector<Result> FindMatchingDocumentsApproximately(
91       const std::unordered_set<base::string16>& terms,
92       double prefix_threshold,
93       double block_threshold) {
94     return index_.FindMatchingDocumentsApproximately(terms, prefix_threshold,
95                                                      block_threshold);
96   }
97 
AddDocuments(const DocumentToUpdate & documents)98   void AddDocuments(const DocumentToUpdate& documents) {
99     index_.AddDocuments(documents);
100   }
101 
RemoveDocuments(const std::vector<std::string> & doc_ids)102   void RemoveDocuments(const std::vector<std::string>& doc_ids) {
103     index_.RemoveDocuments(doc_ids);
104   }
105 
ClearInvertedIndex()106   void ClearInvertedIndex() { index_.ClearInvertedIndex(); }
107 
GetTfidf(const base::string16 & term)108   std::vector<TfidfResult> GetTfidf(const base::string16& term) {
109     return index_.GetTfidf(term);
110   }
111 
GetTfidfForDocId(const base::string16 & term,const std::string & docid,float * tfidf,size_t * number_positions)112   bool GetTfidfForDocId(const base::string16& term,
113                         const std::string& docid,
114                         float* tfidf,
115                         size_t* number_positions) {
116     const auto& posting = GetTfidf(term);
117     if (posting.empty()) {
118       return false;
119     }
120 
121     for (const auto& tfidf_result : posting) {
122       if (std::get<0>(tfidf_result) == docid) {
123         *number_positions = std::get<1>(tfidf_result).size();
124         *tfidf = std::get<2>(tfidf_result);
125         return true;
126       }
127     }
128     return false;
129   }
130 
BuildInvertedIndex()131   void BuildInvertedIndex() { index_.BuildInvertedIndex(); }
132 
IsInvertedIndexBuilt()133   bool IsInvertedIndexBuilt() { return index_.IsInvertedIndexBuilt(); }
134 
GetDictionary()135   std::unordered_map<base::string16, PostingList> GetDictionary() {
136     return index_.dictionary_;
137   }
138 
GetDocLength()139   std::unordered_map<std::string, uint32_t> GetDocLength() {
140     return index_.doc_length_;
141   }
142 
GetTfidfCache()143   std::unordered_map<base::string16, std::vector<TfidfResult>> GetTfidfCache() {
144     return index_.tfidf_cache_;
145   }
146 
GetDocumentsToUpdate()147   DocumentToUpdate GetDocumentsToUpdate() {
148     return index_.documents_to_update_;
149   }
150 
GetTermToBeUpdated()151   TermSet GetTermToBeUpdated() { return index_.terms_to_be_updated_; }
152 
Wait()153   void Wait() { task_environment_.RunUntilIdle(); }
154 
BuildIndexCompleted()155   bool BuildIndexCompleted() {
156     return !(index_.update_in_progress_ || index_.index_building_in_progress_ ||
157              index_.request_to_build_index_);
158   }
159 
UpdateDocumentsCompleted()160   bool UpdateDocumentsCompleted() { return !index_.update_in_progress_; }
161 
OnIndexBuilt()162   void OnIndexBuilt() { ++num_built_; }
163 
NumBuilt()164   int NumBuilt() { return num_built_; }
165 
166  private:
167   int num_built_ = 0;
168   InvertedIndex index_;
169   base::test::TaskEnvironment task_environment_{
170       base::test::TaskEnvironment::MainThreadType::DEFAULT,
171       base::test::TaskEnvironment::ThreadPoolExecutionMode::QUEUED};
172 };
173 
TEST_F(InvertedIndexTest,FindTermTest)174 TEST_F(InvertedIndexTest, FindTermTest) {
175   EXPECT_EQ(NumBuilt(), 0);
176   PostingList result = FindTerm(base::UTF8ToUTF16("A"));
177   ASSERT_EQ(result.size(), 2u);
178   EXPECT_EQ(result["doc1"][0].weight, kDefaultWeight);
179   EXPECT_EQ(result["doc1"][0].position.start, 1u);
180   EXPECT_EQ(result["doc1"][1].weight, kDefaultWeight);
181   EXPECT_EQ(result["doc1"][1].position.start, 3u);
182   EXPECT_EQ(result["doc1"][2].weight, kDefaultWeight);
183   EXPECT_EQ(result["doc1"][2].position.start, 5u);
184   EXPECT_EQ(result["doc1"][3].weight, kDefaultWeight);
185   EXPECT_EQ(result["doc1"][3].position.start, 7u);
186 
187   EXPECT_EQ(result["doc2"][0].weight, kDefaultWeight);
188   EXPECT_EQ(result["doc2"][0].position.start, 2u);
189   EXPECT_EQ(result["doc2"][1].weight, kDefaultWeight);
190   EXPECT_EQ(result["doc2"][1].position.start, 4u);
191 }
192 
TEST_F(InvertedIndexTest,AddNewDocumentTest)193 TEST_F(InvertedIndexTest, AddNewDocumentTest) {
194   const base::string16 a_utf16(base::UTF8ToUTF16("A"));
195   const base::string16 d_utf16(base::UTF8ToUTF16("D"));
196 
197   EXPECT_EQ(NumBuilt(), 0);
198   AddDocuments({{"doc3",
199                  {{a_utf16,
200                    {{kDefaultWeight, {"header", 1, 1}},
201                     {kDefaultWeight / 2, {"body", 2, 1}},
202                     {kDefaultWeight, {"header", 4, 1}}}},
203                   {d_utf16,
204                    {{kDefaultWeight, {"header", 3, 1}},
205                     {kDefaultWeight / 2, {"body", 5, 1}}}}}}});
206   EXPECT_EQ(GetDocumentsToUpdate().size(), 0u);
207   EXPECT_EQ(GetTermToBeUpdated().size(), 0u);
208   Wait();
209   EXPECT_EQ(NumBuilt(), 0);
210   EXPECT_EQ(GetDocumentsToUpdate().size(), 0u);
211   // 4 terms "A", "B", "C", "D" need to be updated.
212   EXPECT_EQ(GetTermToBeUpdated().size(), 4u);
213   EXPECT_TRUE(UpdateDocumentsCompleted());
214 
215   EXPECT_EQ(GetDocLength()["doc3"], 5u);
216 
217   // Find "A"
218   PostingList result = FindTerm(a_utf16);
219   ASSERT_EQ(result.size(), 3u);
220   EXPECT_EQ(result["doc3"][0].weight, kDefaultWeight);
221   EXPECT_EQ(result["doc3"][0].position.start, 1u);
222   EXPECT_EQ(result["doc3"][1].weight, kDefaultWeight / 2);
223   EXPECT_EQ(result["doc3"][1].position.start, 2u);
224   EXPECT_EQ(result["doc3"][2].weight, kDefaultWeight);
225   EXPECT_EQ(result["doc3"][2].position.start, 4u);
226 
227   // Find "D"
228   result = FindTerm(d_utf16);
229   ASSERT_EQ(result.size(), 1u);
230   EXPECT_EQ(result["doc3"][0].weight, kDefaultWeight);
231   EXPECT_EQ(result["doc3"][0].position.start, 3u);
232   EXPECT_EQ(result["doc3"][1].weight, kDefaultWeight / 2);
233   EXPECT_EQ(result["doc3"][1].position.start, 5u);
234 
235   // Add multiple documents
236   AddDocuments({{"doc4",
237                  {{base::UTF8ToUTF16("E"),
238                    {{kDefaultWeight, {"header", 1, 1}},
239                     {kDefaultWeight / 2, {"body", 2, 1}},
240                     {kDefaultWeight, {"header", 4, 1}}}},
241                   {base::UTF8ToUTF16("F"),
242                    {{kDefaultWeight, {"header", 3, 1}},
243                     {kDefaultWeight / 2, {"body", 5, 1}}}}}},
244                 {"doc5",
245                  {{base::UTF8ToUTF16("E"),
246                    {{kDefaultWeight, {"header", 1, 1}},
247                     {kDefaultWeight / 2, {"body", 2, 1}},
248                     {kDefaultWeight, {"header", 4, 1}}}},
249                   {base::UTF8ToUTF16("G"),
250                    {{kDefaultWeight, {"header", 3, 1}},
251                     {kDefaultWeight / 2, {"body", 5, 1}}}}}}});
252   EXPECT_EQ(GetDocumentsToUpdate().size(), 0u);
253   EXPECT_EQ(GetTermToBeUpdated().size(), 0u);
254   Wait();
255   EXPECT_EQ(NumBuilt(), 0);
256   EXPECT_EQ(GetDocumentsToUpdate().size(), 0u);
257   // 7 terms "A", "B", "C", "D", "E", "F", "G" need to be updated.
258   EXPECT_EQ(GetTermToBeUpdated().size(), 7u);
259   EXPECT_TRUE(UpdateDocumentsCompleted());
260 
261   // Find "E"
262   result = FindTerm(base::UTF8ToUTF16("E"));
263   ASSERT_EQ(result.size(), 2u);
264 
265   // Find "F"
266   result = FindTerm(base::UTF8ToUTF16("F"));
267   ASSERT_EQ(result.size(), 1u);
268 }
269 
TEST_F(InvertedIndexTest,ReplaceDocumentTest)270 TEST_F(InvertedIndexTest, ReplaceDocumentTest) {
271   const base::string16 a_utf16(base::UTF8ToUTF16("A"));
272   const base::string16 d_utf16(base::UTF8ToUTF16("D"));
273 
274   EXPECT_EQ(NumBuilt(), 0);
275   AddDocuments({{"doc1",
276                  {{a_utf16,
277                    {{kDefaultWeight, {"header", 1, 1}},
278                     {kDefaultWeight / 4, {"body", 2, 1}},
279                     {kDefaultWeight / 2, {"header", 4, 1}}}},
280                   {d_utf16,
281                    {{kDefaultWeight / 3, {"header", 3, 1}},
282                     {kDefaultWeight / 5, {"body", 5, 1}}}}}}});
283   EXPECT_EQ(GetDocumentsToUpdate().size(), 0u);
284   Wait();
285   EXPECT_EQ(NumBuilt(), 0);
286   EXPECT_TRUE(UpdateDocumentsCompleted());
287 
288   EXPECT_EQ(GetDocLength()["doc1"], 5u);
289   EXPECT_EQ(GetDocLength()["doc2"], 6u);
290 
291   // Find "A"
292   PostingList result = FindTerm(a_utf16);
293   ASSERT_EQ(result.size(), 2u);
294   EXPECT_EQ(result["doc1"][0].weight, kDefaultWeight);
295   EXPECT_EQ(result["doc1"][0].position.start, 1u);
296   EXPECT_EQ(result["doc1"][1].weight, kDefaultWeight / 4);
297   EXPECT_EQ(result["doc1"][1].position.start, 2u);
298   EXPECT_EQ(result["doc1"][2].weight, kDefaultWeight / 2);
299   EXPECT_EQ(result["doc1"][2].position.start, 4u);
300 
301   // Find "B"
302   result = FindTerm(base::UTF8ToUTF16("B"));
303   ASSERT_EQ(result.size(), 0u);
304 
305   // Find "D"
306   result = FindTerm(d_utf16);
307   ASSERT_EQ(result.size(), 1u);
308   EXPECT_EQ(result["doc1"][0].weight, kDefaultWeight / 3);
309   EXPECT_EQ(result["doc1"][0].position.start, 3u);
310   EXPECT_EQ(result["doc1"][1].weight, kDefaultWeight / 5);
311   EXPECT_EQ(result["doc1"][1].position.start, 5u);
312 }
313 
TEST_F(InvertedIndexTest,RemoveDocumentTest)314 TEST_F(InvertedIndexTest, RemoveDocumentTest) {
315   EXPECT_EQ(GetDictionary().size(), 3u);
316   EXPECT_EQ(GetDocLength().size(), 2u);
317 
318   EXPECT_EQ(NumBuilt(), 0);
319   RemoveDocuments({"doc1"});
320   Wait();
321   EXPECT_EQ(NumBuilt(), 0);
322   EXPECT_TRUE(UpdateDocumentsCompleted());
323 
324   EXPECT_EQ(GetDictionary().size(), 2u);
325   EXPECT_EQ(GetDocLength().size(), 1u);
326   EXPECT_EQ(GetDocLength()["doc2"], 6u);
327 
328   // Find "A"
329   PostingList result = FindTerm(base::UTF8ToUTF16("A"));
330   ASSERT_EQ(result.size(), 1u);
331   EXPECT_EQ(result["doc2"][0].weight, kDefaultWeight);
332   EXPECT_EQ(result["doc2"][0].position.start, 2u);
333   EXPECT_EQ(result["doc2"][1].weight, kDefaultWeight);
334   EXPECT_EQ(result["doc2"][1].position.start, 4u);
335 
336   // Find "B"
337   result = FindTerm(base::UTF8ToUTF16("B"));
338   ASSERT_EQ(result.size(), 0u);
339 
340   // Find "C"
341   result = FindTerm(base::UTF8ToUTF16("C"));
342   ASSERT_EQ(result.size(), 1u);
343   EXPECT_EQ(result["doc2"][0].weight, kDefaultWeight);
344   EXPECT_EQ(result["doc2"][0].position.start, 1u);
345   EXPECT_EQ(result["doc2"][1].weight, kDefaultWeight);
346   EXPECT_EQ(result["doc2"][1].position.start, 3u);
347   EXPECT_EQ(result["doc2"][2].weight, kDefaultWeight);
348   EXPECT_EQ(result["doc2"][2].position.start, 5u);
349   EXPECT_EQ(result["doc2"][3].weight, kDefaultWeight);
350   EXPECT_EQ(result["doc2"][3].position.start, 7u);
351 
352   // Removes multiple documents
353   RemoveDocuments({"doc1", "doc2", "doc3"});
354   Wait();
355   EXPECT_EQ(NumBuilt(), 0);
356   EXPECT_TRUE(UpdateDocumentsCompleted());
357   EXPECT_EQ(GetDictionary().size(), 0u);
358   EXPECT_EQ(GetDocLength().size(), 0u);
359 }
360 
TEST_F(InvertedIndexTest,TfidfFromZeroTest)361 TEST_F(InvertedIndexTest, TfidfFromZeroTest) {
362   EXPECT_EQ(GetTfidfCache().size(), 0u);
363   EXPECT_FALSE(IsInvertedIndexBuilt());
364   EXPECT_EQ(NumBuilt(), 0);
365   BuildInvertedIndex();
366   Wait();
367   EXPECT_EQ(NumBuilt(), 1);
368   EXPECT_TRUE(BuildIndexCompleted());
369 
370   std::vector<TfidfResult> results = GetTfidf(base::UTF8ToUTF16("A"));
371   EXPECT_THAT(GetScoresFromTfidfResult(results),
372               testing::UnorderedElementsAre(0.5, 0.33));
373 
374   results = GetTfidf(base::UTF8ToUTF16("B"));
375   EXPECT_EQ(results.size(), 1u);
376   EXPECT_THAT(GetScoresFromTfidfResult(results),
377               testing::UnorderedElementsAre(0.7));
378 
379   results = GetTfidf(base::UTF8ToUTF16("C"));
380   EXPECT_EQ(results.size(), 1u);
381   EXPECT_THAT(GetScoresFromTfidfResult(results),
382               testing::UnorderedElementsAre(0.94));
383 
384   results = GetTfidf(base::UTF8ToUTF16("D"));
385   EXPECT_EQ(results.size(), 0u);
386 }
387 
TEST_F(InvertedIndexTest,UpdateIndexTest)388 TEST_F(InvertedIndexTest, UpdateIndexTest) {
389   EXPECT_EQ(GetTfidfCache().size(), 0u);
390   BuildInvertedIndex();
391   Wait();
392   EXPECT_EQ(NumBuilt(), 1);
393   EXPECT_TRUE(BuildIndexCompleted());
394 
395   EXPECT_TRUE(IsInvertedIndexBuilt());
396   EXPECT_EQ(GetTfidfCache().size(), 3u);
397 
398   // Replaces "doc1"
399   AddDocuments({{"doc1",
400                  {{base::UTF8ToUTF16("A"),
401                    {{kDefaultWeight / 2, {"header", 1, 1}},
402                     {kDefaultWeight / 4, {"body", 2, 1}},
403                     {kDefaultWeight / 2, {"header", 4, 1}}}},
404                   {base::UTF8ToUTF16("D"),
405                    {{kDefaultWeight, {"header", 3, 1}},
406                     {kDefaultWeight, {"body", 5, 1}}}}}}});
407   EXPECT_EQ(GetDocumentsToUpdate().size(), 0u);
408   Wait();
409   EXPECT_EQ(NumBuilt(), 1);
410   EXPECT_TRUE(UpdateDocumentsCompleted());
411 
412   EXPECT_FALSE(IsInvertedIndexBuilt());
413   BuildInvertedIndex();
414   Wait();
415   EXPECT_EQ(NumBuilt(), 2);
416   EXPECT_TRUE(BuildIndexCompleted());
417 
418   EXPECT_EQ(GetTfidfCache().size(), 3u);
419 
420   std::vector<TfidfResult> results = GetTfidf(base::UTF8ToUTF16("A"));
421   const double expected_tfidf_A_doc1 =
422       std::roundf(
423           TfIdfScore(
424               /*num_docs=*/2,
425               /*num_docs_with_term=*/2,
426               /*weighted_num_term_occurrence_in_doc=*/kDefaultWeight / 2 +
427                   kDefaultWeight / 4 + kDefaultWeight / 2,
428               /*doc_length=*/5) *
429           100) /
430       100;
431   const double expected_tfidf_A_doc2 =
432       std::roundf(
433           TfIdfScore(/*num_docs=*/2,
434                      /*num_docs_with_term=*/2,
435                      /*weighted_num_term_occurrence_in_doc=*/kDefaultWeight * 2,
436                      /*doc_length=*/6) *
437           100) /
438       100;
439   EXPECT_THAT(GetScoresFromTfidfResult(results),
440               testing::UnorderedElementsAre(expected_tfidf_A_doc1,
441                                             expected_tfidf_A_doc2));
442 
443   results = GetTfidf(base::UTF8ToUTF16("B"));
444   EXPECT_THAT(GetScoresFromTfidfResult(results),
445               testing::UnorderedElementsAre());
446 
447   results = GetTfidf(base::UTF8ToUTF16("C"));
448   const double expected_tfidf_C_doc2 =
449       std::roundf(
450           TfIdfScore(/*num_docs=*/2,
451                      /*num_docs_with_term=*/1,
452                      /*weighted_num_term_occurrence_in_doc=*/kDefaultWeight * 4,
453                      /*doc_length=*/6) *
454           100) /
455       100;
456   EXPECT_THAT(GetScoresFromTfidfResult(results),
457               testing::UnorderedElementsAre(expected_tfidf_C_doc2));
458 
459   results = GetTfidf(base::UTF8ToUTF16("D"));
460   const double expected_tfidf_D_doc1 =
461       std::roundf(
462           TfIdfScore(/*num_docs=*/2,
463                      /*num_docs_with_term=*/1,
464                      /*weighted_num_term_occurrence_in_doc=*/kDefaultWeight * 2,
465                      /*doc_length=*/5) *
466           100) /
467       100;
468   EXPECT_THAT(GetScoresFromTfidfResult(results),
469               testing::UnorderedElementsAre(expected_tfidf_D_doc1));
470 }
471 
TEST_F(InvertedIndexTest,ClearInvertedIndexTest)472 TEST_F(InvertedIndexTest, ClearInvertedIndexTest) {
473   EXPECT_EQ(GetTfidfCache().size(), 0u);
474   BuildInvertedIndex();
475   Wait();
476   EXPECT_EQ(NumBuilt(), 1);
477   EXPECT_TRUE(BuildIndexCompleted());
478 
479   EXPECT_TRUE(IsInvertedIndexBuilt());
480   EXPECT_EQ(GetTfidfCache().size(), 3u);
481 
482   // Add a document and clear the index simultaneously.
483   const base::string16 a_utf16(base::UTF8ToUTF16("A"));
484   const base::string16 d_utf16(base::UTF8ToUTF16("D"));
485   AddDocuments({{"doc3",
486                  {{a_utf16,
487                    {{kDefaultWeight, {"header", 1, 1}},
488                     {kDefaultWeight / 2, {"body", 2, 1}},
489                     {kDefaultWeight, {"header", 4, 1}}}},
490                   {d_utf16,
491                    {{kDefaultWeight, {"header", 3, 1}},
492                     {kDefaultWeight / 2, {"body", 5, 1}}}}}}});
493   ClearInvertedIndex();
494   Wait();
495 
496   EXPECT_EQ(GetTfidfCache().size(), 0u);
497   EXPECT_EQ(GetTermToBeUpdated().size(), 0u);
498   EXPECT_EQ(GetDocLength().size(), 0u);
499   EXPECT_EQ(GetDictionary().size(), 0u);
500   EXPECT_EQ(GetDocumentsToUpdate().size(), 0u);
501 }
502 
TEST_F(InvertedIndexTest,FindMatchingDocumentsApproximatelyTest)503 TEST_F(InvertedIndexTest, FindMatchingDocumentsApproximatelyTest) {
504   const double prefix_threshold = 1.0;
505   const double block_threshold = 1.0;
506   const base::string16 a_utf16(base::UTF8ToUTF16("A"));
507   const base::string16 b_utf16(base::UTF8ToUTF16("B"));
508   const base::string16 c_utf16(base::UTF8ToUTF16("C"));
509   const base::string16 d_utf16(base::UTF8ToUTF16("D"));
510 
511   EXPECT_EQ(NumBuilt(), 0);
512 
513   // Replace doc1, same occurrences, just different weights.
514   AddDocuments({{"doc1",
515                  {{a_utf16,
516                    {{kDefaultWeight, {"header", 1, 1}},
517                     {kDefaultWeight, {"header", 3, 1}},
518                     {kDefaultWeight / 2, {"body", 5, 1}},
519                     {kDefaultWeight / 2, {"body", 7, 1}}}},
520                   {b_utf16,
521                    {{kDefaultWeight / 2, {"header", 2, 1}},
522                     {kDefaultWeight / 2, {"header", 6, 1}},
523                     {kDefaultWeight / 3, {"body", 4, 1}},
524                     {kDefaultWeight / 3, {"body", 5, 1}}}}}}});
525   EXPECT_EQ(GetDocumentsToUpdate().size(), 0u);
526   Wait();
527   EXPECT_EQ(NumBuilt(), 0);
528   EXPECT_TRUE(UpdateDocumentsCompleted());
529 
530   BuildInvertedIndex();
531   Wait();
532   EXPECT_EQ(NumBuilt(), 1);
533   EXPECT_TRUE(BuildIndexCompleted());
534 
535   {
536     // "A" exists in "doc1" and "doc2". The score of each document is simply A's
537     // TF-IDF score.
538     const std::vector<Result> matching_docs =
539         FindMatchingDocumentsApproximately({a_utf16}, prefix_threshold,
540                                            block_threshold);
541 
542     EXPECT_EQ(matching_docs.size(), 2u);
543     std::vector<std::string> expected_docids = {"doc1", "doc2"};
544     // TODO(jiameng): for testing, we should  use another independent method to
545     // verify scores.
546     std::vector<float> actual_scores;
547     for (size_t i = 0; i < 2; ++i) {
548       const auto& docid = expected_docids[i];
549       float expected_score = 0;
550       size_t expected_num_pos = 0u;
551       EXPECT_TRUE(
552           GetTfidfForDocId(a_utf16, docid, &expected_score, &expected_num_pos));
553       CheckResult(matching_docs[i], docid, expected_score, expected_num_pos);
554       actual_scores.push_back(expected_score);
555     }
556 
557     // Check scores are non-increasing.
558     EXPECT_GE(actual_scores[0], actual_scores[1]);
559   }
560 
561   {
562     // "D" does not exist in the index.
563     const std::vector<Result> matching_docs =
564         FindMatchingDocumentsApproximately({d_utf16}, prefix_threshold,
565                                            block_threshold);
566 
567     EXPECT_TRUE(matching_docs.empty());
568   }
569 
570   {
571     // Query is {"A", "B", "C"}, which matches all documents.
572     const std::vector<Result> matching_docs =
573         FindMatchingDocumentsApproximately({a_utf16, b_utf16, c_utf16},
574                                            prefix_threshold, block_threshold);
575     EXPECT_EQ(matching_docs.size(), 2u);
576 
577     // Ranked documents are {"doc2", "doc1"}.
578     // "doc2" score comes from sum of TF-IDF of "A" and "C"
579     float expected_score2_A = 0;
580     size_t expected_num_pos2_A = 0u;
581     EXPECT_TRUE(GetTfidfForDocId(a_utf16, "doc2", &expected_score2_A,
582                                  &expected_num_pos2_A));
583     float expected_score2_C = 0;
584     size_t expected_num_pos2_C = 0u;
585     EXPECT_TRUE(GetTfidfForDocId(c_utf16, "doc2", &expected_score2_C,
586                                  &expected_num_pos2_C));
587     const float expected_score2 = expected_score2_A + expected_score2_C;
588     const size_t expected_num_pos2 = expected_num_pos2_A + expected_num_pos2_C;
589     CheckResult(matching_docs[0], "doc2", expected_score2, expected_num_pos2);
590 
591     // "doc1" score comes from sum of TF-IDF of "A" and "B".
592     float expected_score1_A = 0;
593     size_t expected_num_pos1_A = 0u;
594     EXPECT_TRUE(GetTfidfForDocId(a_utf16, "doc1", &expected_score1_A,
595                                  &expected_num_pos1_A));
596     float expected_score1_B = 0;
597     size_t expected_num_pos1_B = 0u;
598     EXPECT_TRUE(GetTfidfForDocId(b_utf16, "doc1", &expected_score1_B,
599                                  &expected_num_pos1_B));
600     const float expected_score1 = expected_score1_A + expected_score1_B;
601     const size_t expected_num_pos1 = expected_num_pos1_B + expected_num_pos1_B;
602     CheckResult(matching_docs[1], "doc1", expected_score1, expected_num_pos1);
603 
604     EXPECT_GE(expected_score2, expected_score1);
605   }
606 }
607 
608 }  // namespace local_search_service
609 }  // namespace chromeos
610