1 // Copyright 2020 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "chromeos/components/local_search_service/inverted_index.h"
6
7 #include <cmath>
8 #include <string>
9 #include <unordered_map>
10 #include <vector>
11
12 #include "base/bind.h"
13 #include "base/strings/string16.h"
14 #include "base/strings/utf_string_conversions.h"
15 #include "base/test/task_environment.h"
16 #include "chromeos/components/local_search_service/shared_structs.h"
17 #include "chromeos/components/local_search_service/test_utils.h"
18 #include "testing/gmock/include/gmock/gmock.h"
19 #include "testing/gtest/include/gtest/gtest.h"
20
21 namespace chromeos {
22 namespace local_search_service {
23
24 namespace {
25
GetScoresFromTfidfResult(const std::vector<TfidfResult> & results)26 std::vector<float> GetScoresFromTfidfResult(
27 const std::vector<TfidfResult>& results) {
28 std::vector<float> scores;
29 for (const auto& item : results) {
30 scores.push_back(std::roundf(std::get<2>(item) * 100) / 100.0);
31 }
32 return scores;
33 }
34
35 constexpr double kDefaultWeight = 1.0;
36
37 } // namespace
38
39 class InvertedIndexTest : public ::testing::Test {
40 public:
InvertedIndexTest()41 InvertedIndexTest() {
42 index_.RegisterIndexBuiltCallback(base::BindRepeating(
43 &InvertedIndexTest::OnIndexBuilt, base::Unretained(this)));
44 }
45
SetUp()46 void SetUp() override {
47 // All content weights are initialized to |kDefaultWeight|.
48 index_.doc_length_ =
49 std::unordered_map<std::string, uint32_t>({{"doc1", 8}, {"doc2", 6}});
50
51 index_.dictionary_[base::UTF8ToUTF16("A")] = PostingList(
52 {{"doc1",
53 Posting({WeightedPosition(kDefaultWeight, Position("header", 1, 1)),
54 WeightedPosition(kDefaultWeight, Position("header", 3, 1)),
55 WeightedPosition(kDefaultWeight, Position("body", 5, 1)),
56 WeightedPosition(kDefaultWeight, Position("body", 7, 1))})},
57 {"doc2",
58 Posting(
59 {WeightedPosition(kDefaultWeight, Position("header", 2, 1)),
60 WeightedPosition(kDefaultWeight, Position("header", 4, 1))})}});
61
62 index_.dictionary_[base::UTF8ToUTF16("B")] = PostingList(
63 {{"doc1",
64 Posting(
65 {WeightedPosition(kDefaultWeight, Position("header", 2, 1)),
66 WeightedPosition(kDefaultWeight, Position("body", 4, 1)),
67 WeightedPosition(kDefaultWeight, Position("header", 6, 1)),
68 WeightedPosition(kDefaultWeight, Position("body", 8, 1))})}});
69
70 index_.dictionary_[base::UTF8ToUTF16("C")] = PostingList(
71 {{"doc2",
72 Posting(
73 {WeightedPosition(kDefaultWeight, Position("header", 1, 1)),
74 WeightedPosition(kDefaultWeight, Position("body", 3, 1)),
75 WeightedPosition(kDefaultWeight, Position("header", 5, 1)),
76 WeightedPosition(kDefaultWeight, Position("body", 7, 1))})}});
77 index_.terms_to_be_updated_.insert(base::UTF8ToUTF16("A"));
78 index_.terms_to_be_updated_.insert(base::UTF8ToUTF16("B"));
79 index_.terms_to_be_updated_.insert(base::UTF8ToUTF16("C"));
80
81 // Manually set |is_index_built_| below because the docs above were not
82 // added to the index using the AddOrUpdate method.
83 index_.is_index_built_ = false;
84 }
85
FindTerm(const base::string16 & term)86 PostingList FindTerm(const base::string16& term) {
87 return index_.FindTerm(term);
88 }
89
FindMatchingDocumentsApproximately(const std::unordered_set<base::string16> & terms,double prefix_threshold,double block_threshold)90 std::vector<Result> FindMatchingDocumentsApproximately(
91 const std::unordered_set<base::string16>& terms,
92 double prefix_threshold,
93 double block_threshold) {
94 return index_.FindMatchingDocumentsApproximately(terms, prefix_threshold,
95 block_threshold);
96 }
97
AddDocuments(const DocumentToUpdate & documents)98 void AddDocuments(const DocumentToUpdate& documents) {
99 index_.AddDocuments(documents);
100 }
101
RemoveDocuments(const std::vector<std::string> & doc_ids)102 void RemoveDocuments(const std::vector<std::string>& doc_ids) {
103 index_.RemoveDocuments(doc_ids);
104 }
105
ClearInvertedIndex()106 void ClearInvertedIndex() { index_.ClearInvertedIndex(); }
107
GetTfidf(const base::string16 & term)108 std::vector<TfidfResult> GetTfidf(const base::string16& term) {
109 return index_.GetTfidf(term);
110 }
111
GetTfidfForDocId(const base::string16 & term,const std::string & docid,float * tfidf,size_t * number_positions)112 bool GetTfidfForDocId(const base::string16& term,
113 const std::string& docid,
114 float* tfidf,
115 size_t* number_positions) {
116 const auto& posting = GetTfidf(term);
117 if (posting.empty()) {
118 return false;
119 }
120
121 for (const auto& tfidf_result : posting) {
122 if (std::get<0>(tfidf_result) == docid) {
123 *number_positions = std::get<1>(tfidf_result).size();
124 *tfidf = std::get<2>(tfidf_result);
125 return true;
126 }
127 }
128 return false;
129 }
130
BuildInvertedIndex()131 void BuildInvertedIndex() { index_.BuildInvertedIndex(); }
132
IsInvertedIndexBuilt()133 bool IsInvertedIndexBuilt() { return index_.IsInvertedIndexBuilt(); }
134
GetDictionary()135 std::unordered_map<base::string16, PostingList> GetDictionary() {
136 return index_.dictionary_;
137 }
138
GetDocLength()139 std::unordered_map<std::string, uint32_t> GetDocLength() {
140 return index_.doc_length_;
141 }
142
GetTfidfCache()143 std::unordered_map<base::string16, std::vector<TfidfResult>> GetTfidfCache() {
144 return index_.tfidf_cache_;
145 }
146
GetDocumentsToUpdate()147 DocumentToUpdate GetDocumentsToUpdate() {
148 return index_.documents_to_update_;
149 }
150
GetTermToBeUpdated()151 TermSet GetTermToBeUpdated() { return index_.terms_to_be_updated_; }
152
Wait()153 void Wait() { task_environment_.RunUntilIdle(); }
154
BuildIndexCompleted()155 bool BuildIndexCompleted() {
156 return !(index_.update_in_progress_ || index_.index_building_in_progress_ ||
157 index_.request_to_build_index_);
158 }
159
UpdateDocumentsCompleted()160 bool UpdateDocumentsCompleted() { return !index_.update_in_progress_; }
161
OnIndexBuilt()162 void OnIndexBuilt() { ++num_built_; }
163
NumBuilt()164 int NumBuilt() { return num_built_; }
165
166 private:
167 int num_built_ = 0;
168 InvertedIndex index_;
169 base::test::TaskEnvironment task_environment_{
170 base::test::TaskEnvironment::MainThreadType::DEFAULT,
171 base::test::TaskEnvironment::ThreadPoolExecutionMode::QUEUED};
172 };
173
TEST_F(InvertedIndexTest,FindTermTest)174 TEST_F(InvertedIndexTest, FindTermTest) {
175 EXPECT_EQ(NumBuilt(), 0);
176 PostingList result = FindTerm(base::UTF8ToUTF16("A"));
177 ASSERT_EQ(result.size(), 2u);
178 EXPECT_EQ(result["doc1"][0].weight, kDefaultWeight);
179 EXPECT_EQ(result["doc1"][0].position.start, 1u);
180 EXPECT_EQ(result["doc1"][1].weight, kDefaultWeight);
181 EXPECT_EQ(result["doc1"][1].position.start, 3u);
182 EXPECT_EQ(result["doc1"][2].weight, kDefaultWeight);
183 EXPECT_EQ(result["doc1"][2].position.start, 5u);
184 EXPECT_EQ(result["doc1"][3].weight, kDefaultWeight);
185 EXPECT_EQ(result["doc1"][3].position.start, 7u);
186
187 EXPECT_EQ(result["doc2"][0].weight, kDefaultWeight);
188 EXPECT_EQ(result["doc2"][0].position.start, 2u);
189 EXPECT_EQ(result["doc2"][1].weight, kDefaultWeight);
190 EXPECT_EQ(result["doc2"][1].position.start, 4u);
191 }
192
TEST_F(InvertedIndexTest,AddNewDocumentTest)193 TEST_F(InvertedIndexTest, AddNewDocumentTest) {
194 const base::string16 a_utf16(base::UTF8ToUTF16("A"));
195 const base::string16 d_utf16(base::UTF8ToUTF16("D"));
196
197 EXPECT_EQ(NumBuilt(), 0);
198 AddDocuments({{"doc3",
199 {{a_utf16,
200 {{kDefaultWeight, {"header", 1, 1}},
201 {kDefaultWeight / 2, {"body", 2, 1}},
202 {kDefaultWeight, {"header", 4, 1}}}},
203 {d_utf16,
204 {{kDefaultWeight, {"header", 3, 1}},
205 {kDefaultWeight / 2, {"body", 5, 1}}}}}}});
206 EXPECT_EQ(GetDocumentsToUpdate().size(), 0u);
207 EXPECT_EQ(GetTermToBeUpdated().size(), 0u);
208 Wait();
209 EXPECT_EQ(NumBuilt(), 0);
210 EXPECT_EQ(GetDocumentsToUpdate().size(), 0u);
211 // 4 terms "A", "B", "C", "D" need to be updated.
212 EXPECT_EQ(GetTermToBeUpdated().size(), 4u);
213 EXPECT_TRUE(UpdateDocumentsCompleted());
214
215 EXPECT_EQ(GetDocLength()["doc3"], 5u);
216
217 // Find "A"
218 PostingList result = FindTerm(a_utf16);
219 ASSERT_EQ(result.size(), 3u);
220 EXPECT_EQ(result["doc3"][0].weight, kDefaultWeight);
221 EXPECT_EQ(result["doc3"][0].position.start, 1u);
222 EXPECT_EQ(result["doc3"][1].weight, kDefaultWeight / 2);
223 EXPECT_EQ(result["doc3"][1].position.start, 2u);
224 EXPECT_EQ(result["doc3"][2].weight, kDefaultWeight);
225 EXPECT_EQ(result["doc3"][2].position.start, 4u);
226
227 // Find "D"
228 result = FindTerm(d_utf16);
229 ASSERT_EQ(result.size(), 1u);
230 EXPECT_EQ(result["doc3"][0].weight, kDefaultWeight);
231 EXPECT_EQ(result["doc3"][0].position.start, 3u);
232 EXPECT_EQ(result["doc3"][1].weight, kDefaultWeight / 2);
233 EXPECT_EQ(result["doc3"][1].position.start, 5u);
234
235 // Add multiple documents
236 AddDocuments({{"doc4",
237 {{base::UTF8ToUTF16("E"),
238 {{kDefaultWeight, {"header", 1, 1}},
239 {kDefaultWeight / 2, {"body", 2, 1}},
240 {kDefaultWeight, {"header", 4, 1}}}},
241 {base::UTF8ToUTF16("F"),
242 {{kDefaultWeight, {"header", 3, 1}},
243 {kDefaultWeight / 2, {"body", 5, 1}}}}}},
244 {"doc5",
245 {{base::UTF8ToUTF16("E"),
246 {{kDefaultWeight, {"header", 1, 1}},
247 {kDefaultWeight / 2, {"body", 2, 1}},
248 {kDefaultWeight, {"header", 4, 1}}}},
249 {base::UTF8ToUTF16("G"),
250 {{kDefaultWeight, {"header", 3, 1}},
251 {kDefaultWeight / 2, {"body", 5, 1}}}}}}});
252 EXPECT_EQ(GetDocumentsToUpdate().size(), 0u);
253 EXPECT_EQ(GetTermToBeUpdated().size(), 0u);
254 Wait();
255 EXPECT_EQ(NumBuilt(), 0);
256 EXPECT_EQ(GetDocumentsToUpdate().size(), 0u);
257 // 7 terms "A", "B", "C", "D", "E", "F", "G" need to be updated.
258 EXPECT_EQ(GetTermToBeUpdated().size(), 7u);
259 EXPECT_TRUE(UpdateDocumentsCompleted());
260
261 // Find "E"
262 result = FindTerm(base::UTF8ToUTF16("E"));
263 ASSERT_EQ(result.size(), 2u);
264
265 // Find "F"
266 result = FindTerm(base::UTF8ToUTF16("F"));
267 ASSERT_EQ(result.size(), 1u);
268 }
269
TEST_F(InvertedIndexTest,ReplaceDocumentTest)270 TEST_F(InvertedIndexTest, ReplaceDocumentTest) {
271 const base::string16 a_utf16(base::UTF8ToUTF16("A"));
272 const base::string16 d_utf16(base::UTF8ToUTF16("D"));
273
274 EXPECT_EQ(NumBuilt(), 0);
275 AddDocuments({{"doc1",
276 {{a_utf16,
277 {{kDefaultWeight, {"header", 1, 1}},
278 {kDefaultWeight / 4, {"body", 2, 1}},
279 {kDefaultWeight / 2, {"header", 4, 1}}}},
280 {d_utf16,
281 {{kDefaultWeight / 3, {"header", 3, 1}},
282 {kDefaultWeight / 5, {"body", 5, 1}}}}}}});
283 EXPECT_EQ(GetDocumentsToUpdate().size(), 0u);
284 Wait();
285 EXPECT_EQ(NumBuilt(), 0);
286 EXPECT_TRUE(UpdateDocumentsCompleted());
287
288 EXPECT_EQ(GetDocLength()["doc1"], 5u);
289 EXPECT_EQ(GetDocLength()["doc2"], 6u);
290
291 // Find "A"
292 PostingList result = FindTerm(a_utf16);
293 ASSERT_EQ(result.size(), 2u);
294 EXPECT_EQ(result["doc1"][0].weight, kDefaultWeight);
295 EXPECT_EQ(result["doc1"][0].position.start, 1u);
296 EXPECT_EQ(result["doc1"][1].weight, kDefaultWeight / 4);
297 EXPECT_EQ(result["doc1"][1].position.start, 2u);
298 EXPECT_EQ(result["doc1"][2].weight, kDefaultWeight / 2);
299 EXPECT_EQ(result["doc1"][2].position.start, 4u);
300
301 // Find "B"
302 result = FindTerm(base::UTF8ToUTF16("B"));
303 ASSERT_EQ(result.size(), 0u);
304
305 // Find "D"
306 result = FindTerm(d_utf16);
307 ASSERT_EQ(result.size(), 1u);
308 EXPECT_EQ(result["doc1"][0].weight, kDefaultWeight / 3);
309 EXPECT_EQ(result["doc1"][0].position.start, 3u);
310 EXPECT_EQ(result["doc1"][1].weight, kDefaultWeight / 5);
311 EXPECT_EQ(result["doc1"][1].position.start, 5u);
312 }
313
TEST_F(InvertedIndexTest,RemoveDocumentTest)314 TEST_F(InvertedIndexTest, RemoveDocumentTest) {
315 EXPECT_EQ(GetDictionary().size(), 3u);
316 EXPECT_EQ(GetDocLength().size(), 2u);
317
318 EXPECT_EQ(NumBuilt(), 0);
319 RemoveDocuments({"doc1"});
320 Wait();
321 EXPECT_EQ(NumBuilt(), 0);
322 EXPECT_TRUE(UpdateDocumentsCompleted());
323
324 EXPECT_EQ(GetDictionary().size(), 2u);
325 EXPECT_EQ(GetDocLength().size(), 1u);
326 EXPECT_EQ(GetDocLength()["doc2"], 6u);
327
328 // Find "A"
329 PostingList result = FindTerm(base::UTF8ToUTF16("A"));
330 ASSERT_EQ(result.size(), 1u);
331 EXPECT_EQ(result["doc2"][0].weight, kDefaultWeight);
332 EXPECT_EQ(result["doc2"][0].position.start, 2u);
333 EXPECT_EQ(result["doc2"][1].weight, kDefaultWeight);
334 EXPECT_EQ(result["doc2"][1].position.start, 4u);
335
336 // Find "B"
337 result = FindTerm(base::UTF8ToUTF16("B"));
338 ASSERT_EQ(result.size(), 0u);
339
340 // Find "C"
341 result = FindTerm(base::UTF8ToUTF16("C"));
342 ASSERT_EQ(result.size(), 1u);
343 EXPECT_EQ(result["doc2"][0].weight, kDefaultWeight);
344 EXPECT_EQ(result["doc2"][0].position.start, 1u);
345 EXPECT_EQ(result["doc2"][1].weight, kDefaultWeight);
346 EXPECT_EQ(result["doc2"][1].position.start, 3u);
347 EXPECT_EQ(result["doc2"][2].weight, kDefaultWeight);
348 EXPECT_EQ(result["doc2"][2].position.start, 5u);
349 EXPECT_EQ(result["doc2"][3].weight, kDefaultWeight);
350 EXPECT_EQ(result["doc2"][3].position.start, 7u);
351
352 // Removes multiple documents
353 RemoveDocuments({"doc1", "doc2", "doc3"});
354 Wait();
355 EXPECT_EQ(NumBuilt(), 0);
356 EXPECT_TRUE(UpdateDocumentsCompleted());
357 EXPECT_EQ(GetDictionary().size(), 0u);
358 EXPECT_EQ(GetDocLength().size(), 0u);
359 }
360
TEST_F(InvertedIndexTest,TfidfFromZeroTest)361 TEST_F(InvertedIndexTest, TfidfFromZeroTest) {
362 EXPECT_EQ(GetTfidfCache().size(), 0u);
363 EXPECT_FALSE(IsInvertedIndexBuilt());
364 EXPECT_EQ(NumBuilt(), 0);
365 BuildInvertedIndex();
366 Wait();
367 EXPECT_EQ(NumBuilt(), 1);
368 EXPECT_TRUE(BuildIndexCompleted());
369
370 std::vector<TfidfResult> results = GetTfidf(base::UTF8ToUTF16("A"));
371 EXPECT_THAT(GetScoresFromTfidfResult(results),
372 testing::UnorderedElementsAre(0.5, 0.33));
373
374 results = GetTfidf(base::UTF8ToUTF16("B"));
375 EXPECT_EQ(results.size(), 1u);
376 EXPECT_THAT(GetScoresFromTfidfResult(results),
377 testing::UnorderedElementsAre(0.7));
378
379 results = GetTfidf(base::UTF8ToUTF16("C"));
380 EXPECT_EQ(results.size(), 1u);
381 EXPECT_THAT(GetScoresFromTfidfResult(results),
382 testing::UnorderedElementsAre(0.94));
383
384 results = GetTfidf(base::UTF8ToUTF16("D"));
385 EXPECT_EQ(results.size(), 0u);
386 }
387
TEST_F(InvertedIndexTest,UpdateIndexTest)388 TEST_F(InvertedIndexTest, UpdateIndexTest) {
389 EXPECT_EQ(GetTfidfCache().size(), 0u);
390 BuildInvertedIndex();
391 Wait();
392 EXPECT_EQ(NumBuilt(), 1);
393 EXPECT_TRUE(BuildIndexCompleted());
394
395 EXPECT_TRUE(IsInvertedIndexBuilt());
396 EXPECT_EQ(GetTfidfCache().size(), 3u);
397
398 // Replaces "doc1"
399 AddDocuments({{"doc1",
400 {{base::UTF8ToUTF16("A"),
401 {{kDefaultWeight / 2, {"header", 1, 1}},
402 {kDefaultWeight / 4, {"body", 2, 1}},
403 {kDefaultWeight / 2, {"header", 4, 1}}}},
404 {base::UTF8ToUTF16("D"),
405 {{kDefaultWeight, {"header", 3, 1}},
406 {kDefaultWeight, {"body", 5, 1}}}}}}});
407 EXPECT_EQ(GetDocumentsToUpdate().size(), 0u);
408 Wait();
409 EXPECT_EQ(NumBuilt(), 1);
410 EXPECT_TRUE(UpdateDocumentsCompleted());
411
412 EXPECT_FALSE(IsInvertedIndexBuilt());
413 BuildInvertedIndex();
414 Wait();
415 EXPECT_EQ(NumBuilt(), 2);
416 EXPECT_TRUE(BuildIndexCompleted());
417
418 EXPECT_EQ(GetTfidfCache().size(), 3u);
419
420 std::vector<TfidfResult> results = GetTfidf(base::UTF8ToUTF16("A"));
421 const double expected_tfidf_A_doc1 =
422 std::roundf(
423 TfIdfScore(
424 /*num_docs=*/2,
425 /*num_docs_with_term=*/2,
426 /*weighted_num_term_occurrence_in_doc=*/kDefaultWeight / 2 +
427 kDefaultWeight / 4 + kDefaultWeight / 2,
428 /*doc_length=*/5) *
429 100) /
430 100;
431 const double expected_tfidf_A_doc2 =
432 std::roundf(
433 TfIdfScore(/*num_docs=*/2,
434 /*num_docs_with_term=*/2,
435 /*weighted_num_term_occurrence_in_doc=*/kDefaultWeight * 2,
436 /*doc_length=*/6) *
437 100) /
438 100;
439 EXPECT_THAT(GetScoresFromTfidfResult(results),
440 testing::UnorderedElementsAre(expected_tfidf_A_doc1,
441 expected_tfidf_A_doc2));
442
443 results = GetTfidf(base::UTF8ToUTF16("B"));
444 EXPECT_THAT(GetScoresFromTfidfResult(results),
445 testing::UnorderedElementsAre());
446
447 results = GetTfidf(base::UTF8ToUTF16("C"));
448 const double expected_tfidf_C_doc2 =
449 std::roundf(
450 TfIdfScore(/*num_docs=*/2,
451 /*num_docs_with_term=*/1,
452 /*weighted_num_term_occurrence_in_doc=*/kDefaultWeight * 4,
453 /*doc_length=*/6) *
454 100) /
455 100;
456 EXPECT_THAT(GetScoresFromTfidfResult(results),
457 testing::UnorderedElementsAre(expected_tfidf_C_doc2));
458
459 results = GetTfidf(base::UTF8ToUTF16("D"));
460 const double expected_tfidf_D_doc1 =
461 std::roundf(
462 TfIdfScore(/*num_docs=*/2,
463 /*num_docs_with_term=*/1,
464 /*weighted_num_term_occurrence_in_doc=*/kDefaultWeight * 2,
465 /*doc_length=*/5) *
466 100) /
467 100;
468 EXPECT_THAT(GetScoresFromTfidfResult(results),
469 testing::UnorderedElementsAre(expected_tfidf_D_doc1));
470 }
471
TEST_F(InvertedIndexTest,ClearInvertedIndexTest)472 TEST_F(InvertedIndexTest, ClearInvertedIndexTest) {
473 EXPECT_EQ(GetTfidfCache().size(), 0u);
474 BuildInvertedIndex();
475 Wait();
476 EXPECT_EQ(NumBuilt(), 1);
477 EXPECT_TRUE(BuildIndexCompleted());
478
479 EXPECT_TRUE(IsInvertedIndexBuilt());
480 EXPECT_EQ(GetTfidfCache().size(), 3u);
481
482 // Add a document and clear the index simultaneously.
483 const base::string16 a_utf16(base::UTF8ToUTF16("A"));
484 const base::string16 d_utf16(base::UTF8ToUTF16("D"));
485 AddDocuments({{"doc3",
486 {{a_utf16,
487 {{kDefaultWeight, {"header", 1, 1}},
488 {kDefaultWeight / 2, {"body", 2, 1}},
489 {kDefaultWeight, {"header", 4, 1}}}},
490 {d_utf16,
491 {{kDefaultWeight, {"header", 3, 1}},
492 {kDefaultWeight / 2, {"body", 5, 1}}}}}}});
493 ClearInvertedIndex();
494 Wait();
495
496 EXPECT_EQ(GetTfidfCache().size(), 0u);
497 EXPECT_EQ(GetTermToBeUpdated().size(), 0u);
498 EXPECT_EQ(GetDocLength().size(), 0u);
499 EXPECT_EQ(GetDictionary().size(), 0u);
500 EXPECT_EQ(GetDocumentsToUpdate().size(), 0u);
501 }
502
TEST_F(InvertedIndexTest,FindMatchingDocumentsApproximatelyTest)503 TEST_F(InvertedIndexTest, FindMatchingDocumentsApproximatelyTest) {
504 const double prefix_threshold = 1.0;
505 const double block_threshold = 1.0;
506 const base::string16 a_utf16(base::UTF8ToUTF16("A"));
507 const base::string16 b_utf16(base::UTF8ToUTF16("B"));
508 const base::string16 c_utf16(base::UTF8ToUTF16("C"));
509 const base::string16 d_utf16(base::UTF8ToUTF16("D"));
510
511 EXPECT_EQ(NumBuilt(), 0);
512
513 // Replace doc1, same occurrences, just different weights.
514 AddDocuments({{"doc1",
515 {{a_utf16,
516 {{kDefaultWeight, {"header", 1, 1}},
517 {kDefaultWeight, {"header", 3, 1}},
518 {kDefaultWeight / 2, {"body", 5, 1}},
519 {kDefaultWeight / 2, {"body", 7, 1}}}},
520 {b_utf16,
521 {{kDefaultWeight / 2, {"header", 2, 1}},
522 {kDefaultWeight / 2, {"header", 6, 1}},
523 {kDefaultWeight / 3, {"body", 4, 1}},
524 {kDefaultWeight / 3, {"body", 5, 1}}}}}}});
525 EXPECT_EQ(GetDocumentsToUpdate().size(), 0u);
526 Wait();
527 EXPECT_EQ(NumBuilt(), 0);
528 EXPECT_TRUE(UpdateDocumentsCompleted());
529
530 BuildInvertedIndex();
531 Wait();
532 EXPECT_EQ(NumBuilt(), 1);
533 EXPECT_TRUE(BuildIndexCompleted());
534
535 {
536 // "A" exists in "doc1" and "doc2". The score of each document is simply A's
537 // TF-IDF score.
538 const std::vector<Result> matching_docs =
539 FindMatchingDocumentsApproximately({a_utf16}, prefix_threshold,
540 block_threshold);
541
542 EXPECT_EQ(matching_docs.size(), 2u);
543 std::vector<std::string> expected_docids = {"doc1", "doc2"};
544 // TODO(jiameng): for testing, we should use another independent method to
545 // verify scores.
546 std::vector<float> actual_scores;
547 for (size_t i = 0; i < 2; ++i) {
548 const auto& docid = expected_docids[i];
549 float expected_score = 0;
550 size_t expected_num_pos = 0u;
551 EXPECT_TRUE(
552 GetTfidfForDocId(a_utf16, docid, &expected_score, &expected_num_pos));
553 CheckResult(matching_docs[i], docid, expected_score, expected_num_pos);
554 actual_scores.push_back(expected_score);
555 }
556
557 // Check scores are non-increasing.
558 EXPECT_GE(actual_scores[0], actual_scores[1]);
559 }
560
561 {
562 // "D" does not exist in the index.
563 const std::vector<Result> matching_docs =
564 FindMatchingDocumentsApproximately({d_utf16}, prefix_threshold,
565 block_threshold);
566
567 EXPECT_TRUE(matching_docs.empty());
568 }
569
570 {
571 // Query is {"A", "B", "C"}, which matches all documents.
572 const std::vector<Result> matching_docs =
573 FindMatchingDocumentsApproximately({a_utf16, b_utf16, c_utf16},
574 prefix_threshold, block_threshold);
575 EXPECT_EQ(matching_docs.size(), 2u);
576
577 // Ranked documents are {"doc2", "doc1"}.
578 // "doc2" score comes from sum of TF-IDF of "A" and "C"
579 float expected_score2_A = 0;
580 size_t expected_num_pos2_A = 0u;
581 EXPECT_TRUE(GetTfidfForDocId(a_utf16, "doc2", &expected_score2_A,
582 &expected_num_pos2_A));
583 float expected_score2_C = 0;
584 size_t expected_num_pos2_C = 0u;
585 EXPECT_TRUE(GetTfidfForDocId(c_utf16, "doc2", &expected_score2_C,
586 &expected_num_pos2_C));
587 const float expected_score2 = expected_score2_A + expected_score2_C;
588 const size_t expected_num_pos2 = expected_num_pos2_A + expected_num_pos2_C;
589 CheckResult(matching_docs[0], "doc2", expected_score2, expected_num_pos2);
590
591 // "doc1" score comes from sum of TF-IDF of "A" and "B".
592 float expected_score1_A = 0;
593 size_t expected_num_pos1_A = 0u;
594 EXPECT_TRUE(GetTfidfForDocId(a_utf16, "doc1", &expected_score1_A,
595 &expected_num_pos1_A));
596 float expected_score1_B = 0;
597 size_t expected_num_pos1_B = 0u;
598 EXPECT_TRUE(GetTfidfForDocId(b_utf16, "doc1", &expected_score1_B,
599 &expected_num_pos1_B));
600 const float expected_score1 = expected_score1_A + expected_score1_B;
601 const size_t expected_num_pos1 = expected_num_pos1_B + expected_num_pos1_B;
602 CheckResult(matching_docs[1], "doc1", expected_score1, expected_num_pos1);
603
604 EXPECT_GE(expected_score2, expected_score1);
605 }
606 }
607
608 } // namespace local_search_service
609 } // namespace chromeos
610