1 /*****************************************************************************
2
3 Copyright (c) 2007, 2020, Oracle and/or its affiliates. All Rights Reserved.
4
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License, version 2.0,
7 as published by the Free Software Foundation.
8
9 This program is also distributed with certain software (including
10 but not limited to OpenSSL) that is licensed under separate terms,
11 as designated in a particular file or component or in included license
12 documentation. The authors of MySQL hereby grant you an additional
13 permission to link the program and your derivative works with the
14 separately licensed software that they have included with MySQL.
15
16 This program is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License, version 2.0, for more details.
20
21 You should have received a copy of the GNU General Public License along with
22 this program; if not, write to the Free Software Foundation, Inc.,
23 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
24
25 *****************************************************************************/
26
27 /**************************************************//**
28 @file fts/fts0que.cc
29 Full Text Search functionality.
30
31 Created 2007/03/27 Sunny Bains
32 Completed 2011/7/10 Sunny and Jimmy Yang
33 *******************************************************/
34
35 #include "ha_prototypes.h"
36
37 #include "dict0dict.h"
38 #include "ut0rbt.h"
39 #include "row0sel.h"
40 #include "fts0fts.h"
41 #include "fts0priv.h"
42 #include "fts0ast.h"
43 #include "fts0pars.h"
44 #include "fts0types.h"
45 #include "fts0plugin.h"
46 #include "ut0new.h"
47
48 #ifdef UNIV_NONINL
49 #include "fts0types.ic"
50 #include "fts0vlc.ic"
51 #endif
52
53 #include <iomanip>
54 #include <vector>
55
56 #define FTS_ELEM(t, n, i, j) (t[(i) * n + (j)])
57
58 #define RANK_DOWNGRADE (-1.0F)
59 #define RANK_UPGRADE (1.0F)
60
61 /* Maximum number of words supported in a phrase or proximity search. */
62 #define MAX_PROXIMITY_ITEM 128
63
64 /* Memory used by rbt itself for create and node add */
65 #define SIZEOF_RBT_CREATE (sizeof(ib_rbt_t) + sizeof(ib_rbt_node_t) * 2)
66 #define SIZEOF_RBT_NODE_ADD sizeof(ib_rbt_node_t)
67
68 /*Initial byte length for 'words' in fts_ranking_t */
69 #define RANKING_WORDS_INIT_LEN 4
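/* With CHAR_BIT == 8, this initial allocation covers 4 * 8 = 32 distinct
query words per ranking; fts_ranking_words_add() below doubles the buffer
whenever a word position falls outside the current bitmap. */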
70
71 // FIXME: Need to have a generic iterator that traverses the ilist.
72
73 typedef std::vector<fts_string_t, ut_allocator<fts_string_t> > word_vector_t;
74
75 struct fts_word_freq_t;
76
77 /** State of an FTS query. */
78 struct fts_query_t {
79 mem_heap_t* heap; /*!< Heap to use for allocations */
80
81 trx_t* trx; /*!< The query transaction */
82
83 dict_index_t* index; /*!< The FTS index to search */
84 /*!< FTS auxiliary common table def */
85
86 fts_table_t fts_common_table;
87
88 fts_table_t fts_index_table;/*!< FTS auxiliary index table def */
89
90 ulint total_size; /*!< total memory size used by query */
91
92 fts_doc_ids_t* deleted; /*!< Deleted doc ids that need to be
93 filtered from the output */
94
95 fts_ast_node_t* root; /*!< Abstract syntax tree */
96
97 fts_ast_node_t* cur_node; /*!< Current tree node */
98
99 ib_rbt_t* word_map; /*!< Matched word map for
100 searching by word*/
101
102 word_vector_t* word_vector; /*!< Matched word vector for
103 searching by index */
104
105 ib_rbt_t* doc_ids; /*!< The current set of matching
106 doc ids, elements are of
107 type fts_ranking_t */
108
109 ib_rbt_t* intersection; /*!< The doc ids that were found in
110 doc_ids, this tree will become
111 the new doc_ids, elements are of type
112 fts_ranking_t */
113
114 /*!< Prepared statement to read the
115 nodes from the FTS INDEX */
116 que_t* read_nodes_graph;
117
118 fts_ast_oper_t oper; /*!< Current boolean mode operator */
119
120 /*!< TRUE if we want to collect the
121 word positions within the document */
122 ibool collect_positions;
123
124 ulint flags; /*!< Specify the full text search type,
125 such as boolean search, phrase
126 search, proximity search etc. */
127
128 ulint distance; /*!< The proximity distance of a
129 phrase search. */
130
131 /*!< These doc ids are used as a
132 boundary condition when searching the
133 FTS index rows */
134
135 doc_id_t lower_doc_id; /*!< Lowest doc id in doc_ids */
136
137 doc_id_t upper_doc_id; /*!< Highest doc id in doc_ids */
138
139 bool boolean_mode; /*!< TRUE if boolean mode query */
140
141 ib_vector_t* matched; /*!< Array of matching documents
142 (fts_match_t) to search for a phrase */
143
144 ib_vector_t** match_array; /*!< Used for proximity search, contains
145 position info for each matched word
146 in the word list */
147
148 ib_uint64_t total_docs; /*!< The total number of documents */
149
150 ulint total_words; /*!< The total number of words */
151
152 dberr_t error; /*!< Error code if any, that is
153 encountered during query processing */
154
155 ib_rbt_t* word_freqs; /*!< RB tree of word frequencies per
156 document, its elements are of type
157 fts_word_freq_t */
158
159 ib_rbt_t* wildcard_words; /*!< words with wildcard */
160
161 bool multi_exist; /*!< multiple FTS_EXIST oper */
162
163 st_mysql_ftparser* parser; /*!< fts plugin parser */
164
165 /** limit value for the fts query */
166 ulonglong limit;
167
168 /** number of docs fetched by query. This is to restrict the
169 result with limit value */
170 ulonglong n_docs;
171
172 ulint nested_exp_count; /*!< number of nested sub
173 expressions, checked against a limit */
174 };
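
/* Rough lifecycle of the result sets above: 'doc_ids' always holds the
current matching set. For an FTS_EXIST (intersection) operator the
qualifying documents are first collected into 'intersection', which then
replaces 'doc_ids' once the operand has been processed; see
fts_query_intersect() and fts_merge_doc_ids() below. */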
175
176 /** For phrase matching, first we collect the documents and the positions
177 then we match. */
178 struct fts_match_t {
179 doc_id_t doc_id; /*!< Document id */
180
181 ulint start; /*!< Start the phrase match from
182 this offset within the positions
183 vector. */
184
185 ib_vector_t* positions; /*!< Offsets of a word in a
186 document */
187 };
188
189 /** For matching tokens in a phrase search. We use this data structure in
190 the callback that determines whether a document should be accepted or
191 rejected for a phrase search. */
192 struct fts_select_t {
193 doc_id_t doc_id; /*!< The document id to match */
194
195 ulint min_pos; /*!< For found to be TRUE at least
196 one position must be greater than
197 min_pos. */
198
199 ibool found; /*!< TRUE if found */
200
201 fts_word_freq_t*
202 word_freq; /*!< Word frequency instance of the
203 current word being looked up in
204 the FTS index */
205 };
206
207 typedef std::vector<ulint, ut_allocator<ulint> > pos_vector_t;
208
209 /** Structure that defines a set of ranges for the original documents, each of
210 which has a minimum position and a maximum position. Text in such a range
211 should contain all words in the proximity search. We then need to count the
212 words in such a range to make sure the count is within the specified distance
213 of the proximity search */
214 struct fts_proximity_t {
215 ulint n_pos; /*!< number of position set, defines
216 a range (min to max) containing all
217 matching words */
218 pos_vector_t min_pos; /*!< the minimum position (in bytes)
219 of the range */
220 pos_vector_t max_pos; /*!< the maximum position (in bytes)
221 of the range */
222 };
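
/* Note: 'min_pos' and 'max_pos' are parallel vectors of length 'n_pos';
entry i of each describes one candidate byte range [min_pos[i], max_pos[i]]
that is later verified by fts_proximity_is_word_in_range(). */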
223
224 /** The match positions and tokens to match */
225 struct fts_phrase_t {
226 fts_phrase_t(const dict_table_t* table)
227 :
228 found(false),
229 match(NULL),
230 tokens(NULL),
231 distance(0),
232 charset(NULL),
233 heap(NULL),
234 page_size(dict_table_page_size(table)),
235 proximity_pos(NULL),
236 parser(NULL)
237 {
238 }
239
240 /** Match result */
241 ibool found;
242
243 /** Positions within text */
244 const fts_match_t* match;
245
246 /** Tokens to match */
247 const ib_vector_t* tokens;
248
249 /** For matching on proximity distance. Can be 0 for exact match */
250 ulint distance;
251
252 /** Phrase match charset */
253 CHARSET_INFO* charset;
254
255 /** Heap for word processing */
256 mem_heap_t* heap;
257
258 /** Row page size */
259 const page_size_t page_size;
260
261 /** Position info for proximity search verification. Records the
262 min and max position of words matched */
263 fts_proximity_t* proximity_pos;
264
265 /** FTS plugin parser */
266 st_mysql_ftparser* parser;
267 };
268
269 /** Parameter passed to FTS phrase match by the parser */
270 struct fts_phrase_param_t {
271 fts_phrase_t* phrase; /*!< Match phrase instance */
272 ulint token_index; /*!< Index of token to match next */
273 mem_heap_t* heap; /*!< Heap for word processing */
274 };
275
276 /** For storing the frequency of a word/term in a document */
277 struct fts_doc_freq_t {
278 doc_id_t doc_id; /*!< Document id */
279 ulint freq; /*!< Frequency of a word in a document */
280 };
281
282 /** To determine the word frequency per document. */
283 struct fts_word_freq_t {
284 fts_string_t word; /*!< Word for which we need the freq,
285 it's allocated on the query heap */
286
287 ib_rbt_t* doc_freqs; /*!< RB Tree for storing per document
288 word frequencies. The elements are
289 of type fts_doc_freq_t */
290 ib_uint64_t doc_count; /*!< Total number of documents that
291 contain this word */
292 double idf; /*!< Inverse document frequency */
293 };
294
295 /********************************************************************
296 Callback function to fetch the rows in an FTS INDEX record.
297 @return always TRUE */
298 static
299 ibool
300 fts_query_index_fetch_nodes(
301 /*========================*/
302 void* row, /*!< in: sel_node_t* */
303 void* user_arg); /*!< in: pointer to ib_vector_t */
304
305 /********************************************************************
306 Read and filter nodes.
307 @return DB_SUCCESS or error code */
308 static
309 dberr_t
310 fts_query_filter_doc_ids(
311 /*=====================*/
312 fts_query_t* query, /*!< in: query instance */
313 const fts_string_t* word, /*!< in: the current word */
314 fts_word_freq_t* word_freq, /*!< in/out: word frequency */
315 const fts_node_t* node, /*!< in: current FTS node */
316 void* data, /*!< in: doc id ilist */
317 ulint len, /*!< in: doc id ilist size */
318 ibool calc_doc_count);/*!< in: whether to remember doc
319 count */
320
321 #if 0
322 /*****************************************************************//***
323 Find a doc_id in a word's ilist.
324 @return TRUE if found. */
325 static
326 ibool
327 fts_query_find_doc_id(
328 /*==================*/
329 fts_select_t* select, /*!< in/out: search the doc id selected,
330 update the frequency if found. */
331 void* data, /*!< in: doc id ilist */
332 ulint len); /*!< in: doc id ilist size */
333 #endif
334
335 /*************************************************************//**
336 This function implements a simple "blind" query expansion search:
337 words in documents found in the first search pass will be used as
338 search arguments to search the document again, thus "expand"
339 the search result set.
340 @return DB_SUCCESS if success, otherwise the error code */
341 static
342 dberr_t
343 fts_expand_query(
344 /*=============*/
345 dict_index_t* index, /*!< in: FTS index to search */
346 fts_query_t* query) /*!< in: query result, to be freed
347 by the client */
348 MY_ATTRIBUTE((nonnull, warn_unused_result));
349 /*************************************************************//**
350 This function finds documents that contain all words in a
351 phrase or proximity search. For a proximity search, it also verifies
352 that the words are close enough to each other, within the specified distance.
353 This function is called for phrase and proximity search.
354 @return TRUE if documents are found, FALSE if otherwise */
355 static
356 ibool
357 fts_phrase_or_proximity_search(
358 /*===========================*/
359 fts_query_t* query, /*!< in/out: query instance
360 query->doc_ids might be instantiated
361 with qualified doc IDs */
362 ib_vector_t* tokens); /*!< in: Tokens contain words */
363 /*************************************************************//**
364 This function checks whether words in result documents are close to
365 each other (within proximity range as specified by "distance").
366 If "distance" is MAX_ULINT, then it will find all combinations of
367 positions of matching words and store min and max positions
368 in the "qualified_pos" for later verification.
369 @return true if words are close to each other, false if otherwise */
370 static
371 bool
372 fts_proximity_get_positions(
373 /*========================*/
374 fts_match_t** match, /*!< in: query instance */
375 ulint num_match, /*!< in: number of matching
376 items */
377 ulint distance, /*!< in: distance value
378 for proximity search */
379 fts_proximity_t* qualified_pos); /*!< out: the position info
380 records ranges containing
381 all matching words. */
382 #if 0
383 /********************************************************************
384 Get the total number of words in a document. */
385 static
386 ulint
387 fts_query_terms_in_document(
388 /*========================*/
389 /*!< out: DB_SUCCESS if all go well
390 else error code */
391 fts_query_t* query, /*!< in: FTS query state */
392 doc_id_t doc_id, /*!< in: the doc id to check */
393 ulint* total); /*!< out: total words in document */
394 #endif
395
396 /********************************************************************
397 Compare two fts_doc_freq_t doc_ids.
398 @return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
399 UNIV_INLINE
400 int
401 fts_freq_doc_id_cmp(
402 /*================*/
403 const void* p1, /*!< in: id1 */
404 const void* p2) /*!< in: id2 */
405 {
406 const fts_doc_freq_t* fq1 = (const fts_doc_freq_t*) p1;
407 const fts_doc_freq_t* fq2 = (const fts_doc_freq_t*) p2;
408
409 return(fq1->doc_id > fq2->doc_id ? 1 : (fq1->doc_id < fq2->doc_id ? -1 : 0));
410 }
411
412 #if 0
413 /*******************************************************************//**
414 Print the table used for calculating LCS. */
415 static
416 void
417 fts_print_lcs_table(
418 /*================*/
419 const ulint* table, /*!< in: array to print */
420 ulint n_rows, /*!< in: total no. of rows */
421 ulint n_cols) /*!< in: total no. of cols */
422 {
423 ulint i;
424
425 for (i = 0; i < n_rows; ++i) {
426 ulint j;
427
428 printf("\n");
429
430 for (j = 0; j < n_cols; ++j) {
431
432 printf("%2lu ", FTS_ELEM(table, n_cols, i, j));
433 }
434 }
435 }
436
437 /********************************************************************
438 Find the longest common subsequence between the query string and
439 the document. */
440 static
441 ulint
442 fts_query_lcs(
443 /*==========*/
444 /*!< out: LCS (length) between
445 two ilists */
446 const ulint* p1, /*!< in: word positions of query */
447 ulint len_p1, /*!< in: no. of elements in p1 */
448 const ulint* p2, /*!< in: word positions within document */
449 ulint len_p2) /*!< in: no. of elements in p2 */
450 {
451 int i;
452 ulint len = 0;
453 ulint r = len_p1;
454 ulint c = len_p2;
455 ulint size = (r + 1) * (c + 1) * sizeof(ulint);
456 ulint* table = (ulint*) ut_malloc_nokey(size);
457
458 /* Traverse the table backwards, from the last row to the first and
459 also from the last column to the first. We compute the smaller
460 common subsequences first, then use the calculated values to determine
461 the longest common subsequence. The result will be in TABLE[0][0]. */
462 for (i = r; i >= 0; --i) {
463 int j;
464
465 for (j = c; j >= 0; --j) {
466
467 if (p1[i] == (ulint) -1 || p2[j] == (ulint) -1) {
468
469 FTS_ELEM(table, c, i, j) = 0;
470
471 } else if (p1[i] == p2[j]) {
472
473 FTS_ELEM(table, c, i, j) = FTS_ELEM(
474 table, c, i + 1, j + 1) + 1;
475
476 } else {
477
478 ulint value;
479
480 value = ut_max(
481 FTS_ELEM(table, c, i + 1, j),
482 FTS_ELEM(table, c, i, j + 1));
483
484 FTS_ELEM(table, c, i, j) = value;
485 }
486 }
487 }
488
489 len = FTS_ELEM(table, c, 0, 0);
490
491 fts_print_lcs_table(table, r, c);
492 printf("\nLen=%lu\n", len);
493
494 ut_free(table);
495
496 return(len);
497 }
498 #endif
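
/* For reference, the (disabled) fts_query_lcs() above fills the table using
the standard LCS recurrence, where a sentinel position ((ulint) -1)
terminates each list:

	LCS[i][j] = 0                                  if p1[i] or p2[j] is a sentinel
	LCS[i][j] = LCS[i + 1][j + 1] + 1              if p1[i] == p2[j]
	LCS[i][j] = max(LCS[i + 1][j], LCS[i][j + 1])  otherwise

The final length is read from LCS[0][0]. */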
499
500 /*******************************************************************//**
501 Compare two fts_ranking_t instances on their rank values and doc ids, in
502 descending order of rank and ascending order of doc id.
503 @return 0 if p1 == p2, < 0 if p1 < p2, > 0 if p1 > p2 */
504 static
505 int
506 fts_query_compare_rank(
507 /*===================*/
508 const void* p1, /*!< in: pointer to elem */
509 const void* p2) /*!< in: pointer to elem */
510 {
511 const fts_ranking_t* r1 = (const fts_ranking_t*) p1;
512 const fts_ranking_t* r2 = (const fts_ranking_t*) p2;
513
514 if (r2->rank < r1->rank) {
515 return(-1);
516 } else if (r2->rank == r1->rank) {
517
518 if (r1->doc_id < r2->doc_id) {
519 return(-1);
520 } else if (r1->doc_id > r2->doc_id) {
521 return(1);
522 }
523
524 return(0);
525 }
526
527 return(1);
528 }
529
530 /*******************************************************************//**
531 Create words in ranking */
532 static
533 void
534 fts_ranking_words_create(
535 /*=====================*/
536 fts_query_t* query, /*!< in: query instance */
537 fts_ranking_t* ranking) /*!< in: ranking instance */
538 {
539 ranking->words = static_cast<byte*>(
540 mem_heap_zalloc(query->heap, RANKING_WORDS_INIT_LEN));
541 ranking->words_len = RANKING_WORDS_INIT_LEN;
542 }
543
544 /*
545 The optimization here is to use a char array (bitmap) to replace the words
546 rb tree in fts_ranking_t.
547
548 It can save a lot of memory, except in some cases of QUERY EXPANSION.
549
550 'word_map' is used as a word dictionary, in which the key is a word and the
551 value is a number. In 'fts_ranking_words_add', we first check if the word is
552 in 'word_map'. If not, we add it to 'word_map' and give it a position
553 (actually a number). Then we set the bit at that position in the char array
554 'words' to '1'.
555
556 'word_vector' is a useful backup of 'word_map': we can get a word by its position
557 more quickly than by searching by value in 'word_map'. We use 'word_vector' in
558 'fts_query_calculate_ranking' and 'fts_expand_query'. In those two functions, we
559 scan the bitmap 'words', get a word whenever a bit is '1', and look up its word_freq.
560 */
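/* Illustrative example of the scheme above: if the term "apple" is the
10th distinct word seen (position 9 in 'word_map'/'word_vector'), then
fts_ranking_words_add() sets bit (9 % CHAR_BIT) == 1 in byte
(9 / CHAR_BIT) == 1 of ranking->words, and fts_ranking_words_get_next()
later maps that bit back to "apple" through word_vector[9]. */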
561
562 /*******************************************************************//**
563 Add a word into ranking */
564 static
565 void
566 fts_ranking_words_add(
567 /*==================*/
568 fts_query_t* query, /*!< in: query instance */
569 fts_ranking_t* ranking, /*!< in: ranking instance */
570 const fts_string_t* word) /*!< in: term/word to add */
571 {
572 ulint pos;
573 ulint byte_offset;
574 ulint bit_offset;
575 ib_rbt_bound_t parent;
576
577 /* Note: we suppose the word map and vector are append-only. */
578 ut_ad(query->word_vector->size() == rbt_size(query->word_map));
579
580 /* We use ib_rbt to simulate a map, f_n_char means position. */
581 if (rbt_search(query->word_map, &parent, word) == 0) {
582 fts_string_t* result_word;
583
584 result_word = rbt_value(fts_string_t, parent.last);
585 pos = result_word->f_n_char;
586 ut_ad(pos < rbt_size(query->word_map));
587 } else {
588 /* Add the word to map. */
589 fts_string_t new_word;
590
591 pos = rbt_size(query->word_map);
592
593 fts_string_dup(&new_word, word, query->heap);
594 new_word.f_n_char = pos;
595
596 rbt_add_node(query->word_map, &parent, &new_word);
597 ut_ad(rbt_validate(query->word_map));
598 query->word_vector->push_back(new_word);
599 }
600
601 /* Check words len */
602 byte_offset = pos / CHAR_BIT;
603 if (byte_offset >= ranking->words_len) {
604 byte* words = ranking->words;
605 ulint words_len = ranking->words_len;
606
607 while (byte_offset >= words_len) {
608 words_len *= 2;
609 }
610
611 ranking->words = static_cast<byte*>(
612 mem_heap_zalloc(query->heap, words_len));
613 ut_memcpy(ranking->words, words, ranking->words_len);
614 ranking->words_len = words_len;
615 }
616
617 /* Set ranking words */
618 ut_ad(byte_offset < ranking->words_len);
619 bit_offset = pos % CHAR_BIT;
620 ranking->words[byte_offset] |= 1 << bit_offset;
621 }
622
623 /*******************************************************************//**
624 Get a word from a ranking
625 @return true if it's successful */
626 static
627 bool
628 fts_ranking_words_get_next(
629 /*=======================*/
630 const fts_query_t* query, /*!< in: query instance */
631 fts_ranking_t* ranking,/*!< in: ranking instance */
632 ulint* pos, /*!< in/out: word start pos */
633 fts_string_t* word) /*!< in/out: term/word to add */
634 {
635 bool ret = false;
636 ulint max_pos = ranking->words_len * CHAR_BIT;
637
638 /* Search for next word */
639 while (*pos < max_pos) {
640 ulint byte_offset = *pos / CHAR_BIT;
641 ulint bit_offset = *pos % CHAR_BIT;
642
643 if (ranking->words[byte_offset] & (1 << bit_offset)) {
644 ret = true;
645 break;
646 }
647
648 *pos += 1;
649 };
650
651 /* Get next word from word vector */
652 if (ret) {
653 ut_ad(*pos < query->word_vector->size());
654 *word = query->word_vector->at((size_t)*pos);
655 *pos += 1;
656 }
657
658 return ret;
659 }
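
/* Typical iteration pattern over a ranking's words, as used in
fts_merge_doc_ids() below:

	ulint		pos = 0;
	fts_string_t	word;

	while (fts_ranking_words_get_next(query, ranking, &pos, &word)) {
		... process 'word', e.g. fts_query_add_word_to_document() ...
	}
*/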
660
661 /*******************************************************************//**
662 Add a word to the term frequency RB tree if it doesn't already exist.
663 The word itself is duplicated onto the query heap.
664 @return pointer to the word frequency entry */
665 static
666 fts_word_freq_t*
667 fts_query_add_word_freq(
668 /*====================*/
669 fts_query_t* query, /*!< in: query instance */
670 const fts_string_t* word) /*!< in: term/word to add */
671 {
672 ib_rbt_bound_t parent;
673
674 /* Lookup the word in our rb tree and add if it doesn't exist. */
675 if (rbt_search(query->word_freqs, &parent, word) != 0) {
676 fts_word_freq_t word_freq;
677
678 memset(&word_freq, 0, sizeof(word_freq));
679
680 fts_string_dup(&word_freq.word, word, query->heap);
681
682 word_freq.doc_count = 0;
683
684 word_freq.doc_freqs = rbt_create(
685 sizeof(fts_doc_freq_t), fts_freq_doc_id_cmp);
686
687 parent.last = rbt_add_node(
688 query->word_freqs, &parent, &word_freq);
689
690 query->total_size += word->f_len
691 + SIZEOF_RBT_CREATE
692 + SIZEOF_RBT_NODE_ADD
693 + sizeof(fts_word_freq_t);
694 }
695
696 return(rbt_value(fts_word_freq_t, parent.last));
697 }
698
699 /*******************************************************************//**
700 Add a doc id to the doc frequency RB tree if it doesn't already exist.
701 @return pointer to the doc frequency entry */
702 static
703 fts_doc_freq_t*
704 fts_query_add_doc_freq(
705 /*===================*/
706 fts_query_t* query, /*!< in: query instance */
707 ib_rbt_t* doc_freqs, /*!< in: rb tree of fts_doc_freq_t */
708 doc_id_t doc_id) /*!< in: doc id to add */
709 {
710 ib_rbt_bound_t parent;
711
712 /* Lookup the doc id in our rb tree and add if it doesn't exist. */
713 if (rbt_search(doc_freqs, &parent, &doc_id) != 0) {
714 fts_doc_freq_t doc_freq;
715
716 memset(&doc_freq, 0, sizeof(doc_freq));
717
718 doc_freq.freq = 0;
719 doc_freq.doc_id = doc_id;
720
721 parent.last = rbt_add_node(doc_freqs, &parent, &doc_freq);
722
723 query->total_size += SIZEOF_RBT_NODE_ADD
724 + sizeof(fts_doc_freq_t);
725 }
726
727 return(rbt_value(fts_doc_freq_t, parent.last));
728 }
729
730 /*******************************************************************//**
731 Add the doc id to the query set only if it's not in the
732 deleted array. */
733 static
734 void
735 fts_query_union_doc_id(
736 /*===================*/
737 fts_query_t* query, /*!< in: query instance */
738 doc_id_t doc_id, /*!< in: the doc id to add */
739 fts_rank_t rank) /*!< in: if non-zero, it is the
740 rank associated with the doc_id */
741 {
742 ib_rbt_bound_t parent;
743 ulint size = ib_vector_size(query->deleted->doc_ids);
744 fts_update_t* array = (fts_update_t*) query->deleted->doc_ids->data;
745
746 /* Check that the doc id is not deleted and not already in our set. */
747 if (fts_bsearch(array, 0, static_cast<int>(size), doc_id) < 0
748 && rbt_search(query->doc_ids, &parent, &doc_id) != 0) {
749
750 fts_ranking_t ranking;
751
752 ranking.rank = rank;
753 ranking.doc_id = doc_id;
754 fts_ranking_words_create(query, &ranking);
755
756 rbt_add_node(query->doc_ids, &parent, &ranking);
757
758 query->total_size += SIZEOF_RBT_NODE_ADD
759 + sizeof(fts_ranking_t) + RANKING_WORDS_INIT_LEN;
760 }
761 }
762
763 /*******************************************************************//**
764 Remove the doc id from the query set only if it's not in the
765 deleted set. */
766 static
767 void
768 fts_query_remove_doc_id(
769 /*====================*/
770 fts_query_t* query, /*!< in: query instance */
771 doc_id_t doc_id) /*!< in: the doc id to remove */
772 {
773 ib_rbt_bound_t parent;
774 ulint size = ib_vector_size(query->deleted->doc_ids);
775 fts_update_t* array = (fts_update_t*) query->deleted->doc_ids->data;
776
777 /* Check that the doc id is not deleted and is in our set. */
778 if (fts_bsearch(array, 0, static_cast<int>(size), doc_id) < 0
779 && rbt_search(query->doc_ids, &parent, &doc_id) == 0) {
780 ut_free(rbt_remove_node(query->doc_ids, parent.last));
781
782 ut_ad(query->total_size >=
783 SIZEOF_RBT_NODE_ADD + sizeof(fts_ranking_t));
784 query->total_size -= SIZEOF_RBT_NODE_ADD
785 + sizeof(fts_ranking_t);
786 }
787 }
788
789 /*******************************************************************//**
790 Find the doc id in the query set (but not in the deleted set) and artificially
791 downgrade or upgrade its ranking by a fixed value, possibly moving the ranking
792 below or above its normal range of 0 to 1. This is used for Boolean Search
793 operators such as the negation operator, which makes a word's contribution to
794 the row's relevance negative */
795 static
796 void
797 fts_query_change_ranking(
798 /*====================*/
799 fts_query_t* query, /*!< in: query instance */
800 doc_id_t doc_id, /*!< in: the doc id whose ranking to change */
801 ibool downgrade) /*!< in: Whether to downgrade ranking */
802 {
803 ib_rbt_bound_t parent;
804 ulint size = ib_vector_size(query->deleted->doc_ids);
805 fts_update_t* array = (fts_update_t*) query->deleted->doc_ids->data;
806
807 /* Check that the doc id is not deleted and is in our set. */
808 if (fts_bsearch(array, 0, static_cast<int>(size), doc_id) < 0
809 && rbt_search(query->doc_ids, &parent, &doc_id) == 0) {
810
811 fts_ranking_t* ranking;
812
813 ranking = rbt_value(fts_ranking_t, parent.last);
814
815 ranking->rank += downgrade ? RANK_DOWNGRADE : RANK_UPGRADE;
816
817 /* The rank is adjusted by RANK_DOWNGRADE (-1.0) or RANK_UPGRADE (1.0)
818 and clamped to the range [-1.0, 1.0] */
819 if (ranking->rank >= 1.0F) {
820 ranking->rank = 1.0F;
821 } else if (ranking->rank <= -1.0F) {
822 ranking->rank = -1.0F;
823 }
824 }
825 }
826
827 /*******************************************************************//**
828 Check the doc id in the query set only if it's not in the
829 deleted array. The doc ids that were found are stored in
830 another rb tree (fts_query_t::intersection). */
831 static
832 void
833 fts_query_intersect_doc_id(
834 /*=======================*/
835 fts_query_t* query, /*!< in: query instance */
836 doc_id_t doc_id, /*!< in: the doc id to add */
837 fts_rank_t rank) /*!< in: if non-zero, it is the
838 rank associated with the doc_id */
839 {
840 ib_rbt_bound_t parent;
841 ulint size = ib_vector_size(query->deleted->doc_ids);
842 fts_update_t* array = (fts_update_t*) query->deleted->doc_ids->data;
843 fts_ranking_t* ranking= NULL;
844
845 /* There are three types of intersect:
846 1. '+a': doc_ids is empty, add a doc into intersect if it matches 'a'.
847 2. 'a +b': docs matching 'a' are in doc_ids, add a doc into intersect
848 if it matches 'b'. If the doc is also in doc_ids, then change the
849 doc's rank, and add 'a' to the doc's words.
850 3. '+a +b': docs matching '+a' are in doc_ids, add a doc into intersect
851 if it matches 'b' and it's in doc_ids (multi_exist = true). */
852
853 /* Check that the doc id is not deleted */
854 if (fts_bsearch(array, 0, static_cast<int>(size), doc_id) < 0) {
855 fts_ranking_t new_ranking;
856
857 if (rbt_search(query->doc_ids, &parent, &doc_id) != 0) {
858 if (query->multi_exist) {
859 return;
860 } else {
861 new_ranking.words = NULL;
862 }
863 } else {
864 ranking = rbt_value(fts_ranking_t, parent.last);
865
866 /* We've just checked the doc id before */
867 if (ranking->words == NULL) {
868 ut_ad(rbt_search(query->intersection, &parent,
869 ranking) == 0);
870 return;
871 }
872
873 /* Merge rank */
874 rank += ranking->rank;
875 if (rank >= 1.0F) {
876 rank = 1.0F;
877 } else if (rank <= -1.0F) {
878 rank = -1.0F;
879 }
880
881 /* Take words */
882 new_ranking.words = ranking->words;
883 new_ranking.words_len = ranking->words_len;
884 }
885
886 new_ranking.rank = rank;
887 new_ranking.doc_id = doc_id;
888
889 if (rbt_search(query->intersection, &parent,
890 &new_ranking) != 0) {
891 if (new_ranking.words == NULL) {
892 fts_ranking_words_create(query, &new_ranking);
893
894 query->total_size += RANKING_WORDS_INIT_LEN;
895 } else {
896 /* Note that the intersection has taken
897 ownership of the ranking data. */
898 ranking->words = NULL;
899 }
900
901 rbt_add_node(query->intersection,
902 &parent, &new_ranking);
903
904 query->total_size += SIZEOF_RBT_NODE_ADD
905 + sizeof(fts_ranking_t);
906 }
907 }
908 }
909
910 /*******************************************************************//**
911 Free the document ranking rb tree. */
912 static
913 void
914 fts_query_free_doc_ids(
915 /*===================*/
916 fts_query_t* query, /*!< in: query instance */
917 ib_rbt_t* doc_ids) /*!< in: rb tree to free */
918 {
919 const ib_rbt_node_t* node;
920
921 for (node = rbt_first(doc_ids); node; node = rbt_first(doc_ids)) {
922
923 fts_ranking_t* ranking;
924
925 ranking = rbt_value(fts_ranking_t, node);
926
927 if (ranking->words) {
928 ranking->words = NULL;
929 }
930
931 ut_free(rbt_remove_node(doc_ids, node));
932
933 ut_ad(query->total_size >=
934 SIZEOF_RBT_NODE_ADD + sizeof(fts_ranking_t));
935 query->total_size -= SIZEOF_RBT_NODE_ADD
936 + sizeof(fts_ranking_t);
937 }
938
939 rbt_free(doc_ids);
940
941 ut_ad(query->total_size >= SIZEOF_RBT_CREATE);
942 query->total_size -= SIZEOF_RBT_CREATE;
943 }
944
945 /*******************************************************************//**
946 Add the word to the document's "list" of matching words from
947 the query. We make a copy of the word on the query heap. */
948 static
949 void
950 fts_query_add_word_to_document(
951 /*===========================*/
952 fts_query_t* query, /*!< in: query to update */
953 doc_id_t doc_id, /*!< in: the document to update */
954 const fts_string_t* word) /*!< in: the token to add */
955 {
956 ib_rbt_bound_t parent;
957 fts_ranking_t* ranking = NULL;
958
959 if (query->flags == FTS_OPT_RANKING) {
960 return;
961 }
962
963 /* First we search the intersection RB tree as it could have
964 taken ownership of the words rb tree instance. */
965 if (query->intersection
966 && rbt_search(query->intersection, &parent, &doc_id) == 0) {
967
968 ranking = rbt_value(fts_ranking_t, parent.last);
969 }
970
971 if (ranking == NULL
972 && rbt_search(query->doc_ids, &parent, &doc_id) == 0) {
973
974 ranking = rbt_value(fts_ranking_t, parent.last);
975 }
976
977 if (ranking != NULL) {
978 fts_ranking_words_add(query, ranking, word);
979 }
980 }
981
982 /*******************************************************************//**
983 Check the node ilist. */
984 static
985 void
986 fts_query_check_node(
987 /*=================*/
988 fts_query_t* query, /*!< in: query to update */
989 const fts_string_t* token, /*!< in: the token to search */
990 const fts_node_t* node) /*!< in: node to check */
991 {
992 /* Skip nodes whose doc ids are out of range. */
993 if (query->oper == FTS_EXIST
994 && ((query->upper_doc_id > 0
995 && node->first_doc_id > query->upper_doc_id)
996 || (query->lower_doc_id > 0
997 && node->last_doc_id < query->lower_doc_id))) {
998
999 /* Ignore */
1000
1001 } else {
1002 int ret;
1003 ib_rbt_bound_t parent;
1004 ulint ilist_size = node->ilist_size;
1005 fts_word_freq_t*word_freqs;
1006
1007 /* The word must exist. */
1008 ret = rbt_search(query->word_freqs, &parent, token);
1009 ut_a(ret == 0);
1010
1011 word_freqs = rbt_value(fts_word_freq_t, parent.last);
1012
1013 query->error = fts_query_filter_doc_ids(
1014 query, token, word_freqs, node,
1015 node->ilist, ilist_size, TRUE);
1016 }
1017 }
1018
1019 /*****************************************************************//**
1020 Search index cache for word with wildcard match.
1021 @return number of words matched */
1022 static
1023 ulint
1024 fts_cache_find_wildcard(
1025 /*====================*/
1026 fts_query_t* query, /*!< in: query instance */
1027 const fts_index_cache_t*index_cache, /*!< in: cache to search */
1028 const fts_string_t* token) /*!< in: token to search */
1029 {
1030 ib_rbt_bound_t parent;
1031 const ib_vector_t* nodes = NULL;
1032 fts_string_t srch_text;
1033 byte term[FTS_MAX_WORD_LEN + 1];
1034 ulint num_word = 0;
1035
1036 srch_text.f_len = (token->f_str[token->f_len - 1] == '%')
1037 ? token->f_len - 1
1038 : token->f_len;
1039
1040 strncpy((char*) term, (char*) token->f_str, srch_text.f_len);
1041 term[srch_text.f_len] = '\0';
1042 srch_text.f_str = term;
1043
1044 /* Lookup the word in the rb tree */
1045 if (rbt_search_cmp(index_cache->words, &parent, &srch_text, NULL,
1046 innobase_fts_text_cmp_prefix) == 0) {
1047 const fts_tokenizer_word_t* word;
1048 ulint i;
1049 const ib_rbt_node_t* cur_node;
1050 ibool forward = FALSE;
1051
1052 word = rbt_value(fts_tokenizer_word_t, parent.last);
1053 cur_node = parent.last;
1054
1055 while (innobase_fts_text_cmp_prefix(
1056 index_cache->charset, &srch_text, &word->text) == 0) {
1057
1058 nodes = word->nodes;
1059
1060 for (i = 0; nodes && i < ib_vector_size(nodes); ++i) {
1061 int ret;
1062 const fts_node_t* node;
1063 ib_rbt_bound_t freq_parent;
1064 fts_word_freq_t* word_freqs;
1065
1066 node = static_cast<const fts_node_t*>(
1067 ib_vector_get_const(nodes, i));
1068
1069 ret = rbt_search(query->word_freqs,
1070 &freq_parent,
1071 &srch_text);
1072
1073 ut_a(ret == 0);
1074
1075 word_freqs = rbt_value(
1076 fts_word_freq_t,
1077 freq_parent.last);
1078
1079 query->error = fts_query_filter_doc_ids(
1080 query, &srch_text,
1081 word_freqs, node,
1082 node->ilist, node->ilist_size, TRUE);
1083
1084 if (query->error != DB_SUCCESS) {
1085 return(0);
1086 }
1087 }
1088
1089 num_word++;
1090
1091 if (!forward) {
1092 cur_node = rbt_prev(
1093 index_cache->words, cur_node);
1094 } else {
1095 cont_search:
1096 cur_node = rbt_next(
1097 index_cache->words, cur_node);
1098 }
1099
1100 if (!cur_node) {
1101 break;
1102 }
1103
1104 word = rbt_value(fts_tokenizer_word_t, cur_node);
1105 }
1106
1107 if (!forward) {
1108 forward = TRUE;
1109 cur_node = parent.last;
1110 goto cont_search;
1111 }
1112 }
1113
1114 return(num_word);
1115 }
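
/* Note on the wildcard scan above: the rb tree lookup positions us on one
node whose word matches the searched prefix; all matching neighbours are
then collected by first walking backwards (rbt_prev) from that node and,
once the prefix no longer matches or the tree is exhausted, restarting from
the original node and walking forwards (rbt_next). */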
1116
1117 /*****************************************************************//**
1118 Set difference.
1119 @return DB_SUCCESS if all goes well */
1120 static MY_ATTRIBUTE((nonnull, warn_unused_result))
1121 dberr_t
1122 fts_query_difference(
1123 /*=================*/
1124 fts_query_t* query, /*!< in: query instance */
1125 const fts_string_t* token) /*!< in: token to search */
1126 {
1127 ulint n_doc_ids= 0;
1128 trx_t* trx = query->trx;
1129 dict_table_t* table = query->index->table;
1130
1131 ut_a(query->oper == FTS_IGNORE);
1132
1133 #ifdef FTS_INTERNAL_DIAG_PRINT
1134 {
1135 ib::info out;
1136 out << "DIFFERENCE: Searching: '";
1137 out.write(token->f_str, token->f_len);
1138 out << "'";
1139 }
1140 #endif
1141
1142 if (query->doc_ids) {
1143 n_doc_ids = rbt_size(query->doc_ids);
1144 }
1145
1146 /* There is nothing we can subtract from an empty set. */
1147 if (query->doc_ids && !rbt_empty(query->doc_ids)) {
1148 ulint i;
1149 fts_fetch_t fetch;
1150 const ib_vector_t* nodes;
1151 const fts_index_cache_t*index_cache;
1152 que_t* graph = NULL;
1153 fts_cache_t* cache = table->fts->cache;
1154 dberr_t error;
1155
1156 rw_lock_x_lock(&cache->lock);
1157
1158 index_cache = fts_find_index_cache(cache, query->index);
1159
1160 /* Must find the index cache */
1161 ut_a(index_cache != NULL);
1162
1163 /* Search the cache for a matching word first. */
1164 if (query->cur_node->term.wildcard
1165 && query->flags != FTS_PROXIMITY
1166 && query->flags != FTS_PHRASE) {
1167 fts_cache_find_wildcard(query, index_cache, token);
1168 } else {
1169 nodes = fts_cache_find_word(index_cache, token);
1170
1171 for (i = 0; nodes && i < ib_vector_size(nodes)
1172 && query->error == DB_SUCCESS; ++i) {
1173 const fts_node_t* node;
1174
1175 node = static_cast<const fts_node_t*>(
1176 ib_vector_get_const(nodes, i));
1177
1178 fts_query_check_node(query, token, node);
1179 }
1180 }
1181
1182 rw_lock_x_unlock(&cache->lock);
1183
1184 /* error is passed by 'query->error' */
1185 if (query->error != DB_SUCCESS) {
1186 ut_ad(query->error == DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
1187 return(query->error);
1188 }
1189
1190 /* Setup the callback args for filtering and
1191 consolidating the ilist. */
1192 fetch.read_arg = query;
1193 fetch.read_record = fts_query_index_fetch_nodes;
1194
1195 error = fts_index_fetch_nodes(
1196 trx, &graph, &query->fts_index_table, token, &fetch);
1197
1198 /* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */
1199 ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS));
1200 if (error != DB_SUCCESS) {
1201 query->error = error;
1202 }
1203
1204 fts_que_graph_free(graph);
1205 }
1206
1207 /* The size can't increase. */
1208 ut_a(rbt_size(query->doc_ids) <= n_doc_ids);
1209
1210 return(query->error);
1211 }
1212
1213 /*****************************************************************//**
1214 Intersect the token doc ids with the current set.
1215 @return DB_SUCCESS if all goes well */
1216 static MY_ATTRIBUTE((nonnull, warn_unused_result))
1217 dberr_t
1218 fts_query_intersect(
1219 /*================*/
1220 fts_query_t* query, /*!< in: query instance */
1221 const fts_string_t* token) /*!< in: the token to search */
1222 {
1223 trx_t* trx = query->trx;
1224 dict_table_t* table = query->index->table;
1225
1226 ut_a(query->oper == FTS_EXIST);
1227
1228 #ifdef FTS_INTERNAL_DIAG_PRINT
1229 {
1230 ib::info out;
1231 out << "INTERSECT: Searching: '";
1232 out.write(token->f_str, token->f_len);
1233 out << "'";
1234 }
1235 #endif
1236
1237 /* If the current doc id set is empty and multi_exist is true,
1238 we know in advance that the intersection will be empty, so skip it. */
1239 if (!(rbt_empty(query->doc_ids) && query->multi_exist)) {
1240 ulint n_doc_ids = 0;
1241 ulint i;
1242 fts_fetch_t fetch;
1243 const ib_vector_t* nodes;
1244 const fts_index_cache_t*index_cache;
1245 que_t* graph = NULL;
1246 fts_cache_t* cache = table->fts->cache;
1247 dberr_t error;
1248
1249 ut_a(!query->intersection);
1250
1251 n_doc_ids = rbt_size(query->doc_ids);
1252
1253 /* Create the rb tree that will hold the doc ids of
1254 the intersection. */
1255 query->intersection = rbt_create(
1256 sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
1257
1258 query->total_size += SIZEOF_RBT_CREATE;
1259
1260 /* This is to avoid decompressing the ilist if the
1261 node's ilist doc ids are out of range. */
1262 if (!rbt_empty(query->doc_ids) && query->multi_exist) {
1263 const ib_rbt_node_t* node;
1264 doc_id_t* doc_id;
1265
1266 node = rbt_first(query->doc_ids);
1267 doc_id = rbt_value(doc_id_t, node);
1268 query->lower_doc_id = *doc_id;
1269
1270 node = rbt_last(query->doc_ids);
1271 doc_id = rbt_value(doc_id_t, node);
1272 query->upper_doc_id = *doc_id;
1273
1274 } else {
1275 query->lower_doc_id = 0;
1276 query->upper_doc_id = 0;
1277 }
1278
1279 /* Search the cache for a matching word first. */
1280
1281 rw_lock_x_lock(&cache->lock);
1282
1283 /* Search for the index specific cache. */
1284 index_cache = fts_find_index_cache(cache, query->index);
1285
1286 /* Must find the index cache. */
1287 ut_a(index_cache != NULL);
1288
1289 if (query->cur_node->term.wildcard) {
1290 /* Wildcard search the index cache */
1291 fts_cache_find_wildcard(query, index_cache, token);
1292 } else {
1293 nodes = fts_cache_find_word(index_cache, token);
1294
1295 for (i = 0; nodes && i < ib_vector_size(nodes)
1296 && query->error == DB_SUCCESS; ++i) {
1297 const fts_node_t* node;
1298
1299 node = static_cast<const fts_node_t*>(
1300 ib_vector_get_const(nodes, i));
1301
1302 fts_query_check_node(query, token, node);
1303 }
1304 }
1305
1306 rw_lock_x_unlock(&cache->lock);
1307
1308 /* error is passed by 'query->error' */
1309 if (query->error != DB_SUCCESS) {
1310 ut_ad(query->error == DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
1311 return(query->error);
1312 }
1313
1314 /* Setup the callback args for filtering and
1315 consolidating the ilist. */
1316 fetch.read_arg = query;
1317 fetch.read_record = fts_query_index_fetch_nodes;
1318
1319 error = fts_index_fetch_nodes(
1320 trx, &graph, &query->fts_index_table, token, &fetch);
1321
1322 /* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */
1323 ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS));
1324 if (error != DB_SUCCESS) {
1325 query->error = error;
1326 }
1327
1328 fts_que_graph_free(graph);
1329
1330 if (query->error == DB_SUCCESS) {
1331 /* Make the intersection (rb tree) the current doc id
1332 set and free the old set. */
1333 fts_query_free_doc_ids(query, query->doc_ids);
1334 query->doc_ids = query->intersection;
1335 query->intersection = NULL;
1336
1337 ut_a(!query->multi_exist || (query->multi_exist
1338 && rbt_size(query->doc_ids) <= n_doc_ids));
1339 }
1340 }
1341
1342 return(query->error);
1343 }
1344
1345 /*****************************************************************//**
1346 Query index cache.
1347 @return DB_SUCCESS if all goes well */
1348 static
1349 dberr_t
1350 fts_query_cache(
1351 /*============*/
1352 fts_query_t* query, /*!< in/out: query instance */
1353 const fts_string_t* token) /*!< in: token to search */
1354 {
1355 const fts_index_cache_t*index_cache;
1356 dict_table_t* table = query->index->table;
1357 fts_cache_t* cache = table->fts->cache;
1358
1359 /* Search the cache for a matching word first. */
1360 rw_lock_x_lock(&cache->lock);
1361
1362 /* Search for the index specific cache. */
1363 index_cache = fts_find_index_cache(cache, query->index);
1364
1365 /* Must find the index cache. */
1366 ut_a(index_cache != NULL);
1367
1368 if (query->cur_node->term.wildcard
1369 && query->flags != FTS_PROXIMITY
1370 && query->flags != FTS_PHRASE) {
1371 /* Wildcard search the index cache */
1372 fts_cache_find_wildcard(query, index_cache, token);
1373 } else {
1374 const ib_vector_t* nodes;
1375 ulint i;
1376
1377 nodes = fts_cache_find_word(index_cache, token);
1378
1379 for (i = 0; nodes && i < ib_vector_size(nodes)
1380 && query->error == DB_SUCCESS; ++i) {
1381 const fts_node_t* node;
1382
1383 node = static_cast<const fts_node_t*>(
1384 ib_vector_get_const(nodes, i));
1385
1386 fts_query_check_node(query, token, node);
1387 }
1388 }
1389
1390 rw_lock_x_unlock(&cache->lock);
1391
1392 return(query->error);
1393 }
1394
1395 /*****************************************************************//**
1396 Set union.
1397 @return DB_SUCCESS if all goes well */
1398 static MY_ATTRIBUTE((nonnull, warn_unused_result))
1399 dberr_t
1400 fts_query_union(
1401 /*============*/
1402 fts_query_t* query, /*!< in: query instance */
1403 fts_string_t* token) /*!< in: token to search */
1404 {
1405 fts_fetch_t fetch;
1406 ulint n_doc_ids = 0;
1407 trx_t* trx = query->trx;
1408 que_t* graph = NULL;
1409 dberr_t error;
1410
1411 ut_a(query->oper == FTS_NONE || query->oper == FTS_DECR_RATING ||
1412 query->oper == FTS_NEGATE || query->oper == FTS_INCR_RATING);
1413
1414 #ifdef FTS_INTERNAL_DIAG_PRINT
1415 {
1416 ib::info out;
1417 out << "UNION: Searching: '";
1418 out.write(token->f_str, token->f_len);
1419 out << "'";
1420 }
1421 #endif
1422
1423 if (query->doc_ids) {
1424 n_doc_ids = rbt_size(query->doc_ids);
1425 }
1426
1427 if (token->f_len == 0) {
1428 return(query->error);
1429 }
1430
1431 fts_query_cache(query, token);
1432
1433 /* Setup the callback args for filtering and
1434 consolidating the ilist. */
1435 fetch.read_arg = query;
1436 fetch.read_record = fts_query_index_fetch_nodes;
1437
1438 /* Read the nodes from disk. */
1439 error = fts_index_fetch_nodes(
1440 trx, &graph, &query->fts_index_table, token, &fetch);
1441
1442 /* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */
1443 ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS));
1444 if (error != DB_SUCCESS) {
1445 query->error = error;
1446 }
1447
1448 fts_que_graph_free(graph);
1449
1450 if (query->error == DB_SUCCESS) {
1451
1452 /* The size can't decrease. */
1453 ut_a(rbt_size(query->doc_ids) >= n_doc_ids);
1454
1455 /* Calculate the number of doc ids that were added to
1456 the current doc id set. */
1457 if (query->doc_ids) {
1458 n_doc_ids = rbt_size(query->doc_ids) - n_doc_ids;
1459 }
1460 }
1461
1462 return(query->error);
1463 }
1464
1465 /*****************************************************************//**
1466 Depending upon the current query operator, process the doc id.
1467 @return DB_SUCCESS if all goes well,
1468 or DB_FTS_EXCEED_RESULT_CACHE_LIMIT */
1469 static
1470 dberr_t
1471 fts_query_process_doc_id(
1472 /*=====================*/
1473 fts_query_t* query, /*!< in: query instance */
1474 doc_id_t doc_id, /*!< in: doc id to process */
1475 fts_rank_t rank) /*!< in: if non-zero, it is the
1476 rank associated with the doc_id */
1477 {
1478 if (query->flags == FTS_OPT_RANKING) {
1479 return(DB_SUCCESS);
1480 }
1481
1482 switch (query->oper) {
1483 case FTS_NONE:
1484 fts_query_union_doc_id(query, doc_id, rank);
1485 break;
1486
1487 case FTS_EXIST:
1488 fts_query_intersect_doc_id(query, doc_id, rank);
1489 break;
1490
1491 case FTS_IGNORE:
1492 fts_query_remove_doc_id(query, doc_id);
1493 break;
1494
1495 case FTS_NEGATE:
1496 fts_query_change_ranking(query, doc_id, TRUE);
1497 break;
1498
1499 case FTS_DECR_RATING:
1500 fts_query_union_doc_id(query, doc_id, rank);
1501 fts_query_change_ranking(query, doc_id, TRUE);
1502 break;
1503
1504 case FTS_INCR_RATING:
1505 fts_query_union_doc_id(query, doc_id, rank);
1506 fts_query_change_ranking(query, doc_id, FALSE);
1507 break;
1508
1509 default:
1510 ut_error;
1511 }
1512
1513 if (query->total_size > fts_result_cache_limit) {
1514 return(DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
1515 } else {
1516 return(DB_SUCCESS);
1517 }
1518 }
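
/* Summary of the dispatch above: FTS_NONE, FTS_DECR_RATING and
FTS_INCR_RATING add the doc id to the union set (the latter two also adjust
its rank), FTS_EXIST routes it through the intersection set, FTS_IGNORE
removes it, and FTS_NEGATE only downgrades the rank of an already matched
document. All rank adjustments are clamped to [-1.0, 1.0] by
fts_query_change_ranking(). */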
1519
1520 /*****************************************************************//**
1521 Merge two result sets. */
1522 static
1523 dberr_t
1524 fts_merge_doc_ids(
1525 /*==============*/
1526 fts_query_t* query, /*!< in,out: query instance */
1527 const ib_rbt_t* doc_ids) /*!< in: result set to merge */
1528 {
1529 const ib_rbt_node_t* node;
1530
1531 DBUG_ENTER("fts_merge_doc_ids");
1532
1533 ut_a(!query->intersection);
1534
1535 /* To process FTS_EXIST operation (intersection), we need
1536 to create a new result set for fts_query_intersect(). */
1537 if (query->oper == FTS_EXIST) {
1538
1539 query->intersection = rbt_create(
1540 sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
1541
1542 query->total_size += SIZEOF_RBT_CREATE;
1543 }
1544
1545 /* Merge the elements to the result set. */
1546 for (node = rbt_first(doc_ids); node; node = rbt_next(doc_ids, node)) {
1547 fts_ranking_t* ranking;
1548 ulint pos = 0;
1549 fts_string_t word;
1550
1551 ranking = rbt_value(fts_ranking_t, node);
1552
1553 query->error = fts_query_process_doc_id(
1554 query, ranking->doc_id, ranking->rank);
1555
1556 if (query->error != DB_SUCCESS) {
1557 DBUG_RETURN(query->error);
1558 }
1559
1560 /* Merge words. Don't need to take operator into account. */
1561 ut_a(ranking->words);
1562 while (fts_ranking_words_get_next(query, ranking, &pos, &word)) {
1563 fts_query_add_word_to_document(query, ranking->doc_id,
1564 &word);
1565 }
1566 }
1567
1568 /* If it is an intersection operation, reset query->doc_ids
1569 to query->intersection and free the old result list. */
1570 if (query->oper == FTS_EXIST && query->intersection != NULL) {
1571 fts_query_free_doc_ids(query, query->doc_ids);
1572 query->doc_ids = query->intersection;
1573 query->intersection = NULL;
1574 }
1575
1576 DBUG_RETURN(DB_SUCCESS);
1577 }
1578
1579 /*****************************************************************//**
1580 Skip non-whitespace in a string. Move ptr to the next word boundary.
1581 @return pointer to first whitespace character or end */
1582 UNIV_INLINE
1583 byte*
1584 fts_query_skip_word(
1585 /*================*/
1586 byte* ptr, /*!< in: start of scan */
1587 const byte* end) /*!< in: pointer to end of string */
1588 {
1589 /* TODO: Does this have to be UTF-8 too ? */
1590 while (ptr < end && !(ispunct(*ptr) || isspace(*ptr))) {
1591 ++ptr;
1592 }
1593
1594 return(ptr);
1595 }
1596
1597 /*****************************************************************//**
1598 Check whether the remaining terms in the phrase match the text.
1599 @return TRUE if matched else FALSE */
1600 static
1601 ibool
1602 fts_query_match_phrase_terms(
1603 /*=========================*/
1604 fts_phrase_t* phrase, /*!< in: phrase to match */
1605 byte** start, /*!< in/out: text to search, we can't
1606 make this const because we need to
1607 first convert the string to
1608 lowercase */
1609 const byte* end, /*!< in: pointer to the end of
1610 the string to search */
1611 mem_heap_t* heap) /*!< in: heap */
1612 {
1613 ulint i;
1614 byte* ptr = *start;
1615 const ib_vector_t* tokens = phrase->tokens;
1616 ulint distance = phrase->distance;
1617
1618 /* We check only from the second term onwards, since the first
1619 must have matched otherwise we wouldn't be here. */
1620 for (i = 1; ptr < end && i < ib_vector_size(tokens); /* No op */) {
1621 fts_string_t match;
1622 fts_string_t cmp_str;
1623 const fts_string_t* token;
1624 int result;
1625 ulint ret;
1626
1627 ret = innobase_mysql_fts_get_token(
1628 phrase->charset, ptr,
1629 const_cast<byte*>(end), &match);
1630
1631 if (match.f_len > 0) {
1632 /* Get next token to match. */
1633 token = static_cast<const fts_string_t*>(
1634 ib_vector_get_const(tokens, i));
1635
1636 fts_string_dup(&cmp_str, &match, heap);
1637
1638 result = innobase_fts_text_case_cmp(
1639 phrase->charset, token, &cmp_str);
1640
1641 /* Skip the rest of the tokens if this one doesn't
1642 match and the proximity distance is exceeded. */
1643 if (result
1644 && (distance == ULINT_UNDEFINED
1645 || distance == 0)) {
1646
1647 break;
1648 }
1649
1650 /* This token matched move to the next token. */
1651 if (result == 0) {
1652 /* Advance the text to search by the length
1653 of the last token. */
1654 ptr += ret;
1655
1656 /* Advance to the next token. */
1657 ++i;
1658 } else {
1659
1660 ut_a(distance != ULINT_UNDEFINED);
1661
1662 ptr = fts_query_skip_word(ptr, end);
1663 }
1664
1665 /* Distance can be 0 for exact matches. */
1666 if (distance != ULINT_UNDEFINED && distance > 0) {
1667 --distance;
1668 }
1669 } else {
1670 ptr += ret;
1671 }
1672 }
1673
1674 *start = ptr;
1675
1676 /* Can't be greater than the number of elements. */
1677 ut_a(i <= ib_vector_size(tokens));
1678
1679 /* This is the case for multiple words. */
1680 if (i == ib_vector_size(tokens)) {
1681 phrase->found = TRUE;
1682 }
1683
1684 return(phrase->found);
1685 }
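
/* In the loop above, 'distance' acts as a budget of scanned words: for an
exact phrase match (distance == 0 or ULINT_UNDEFINED) the first mismatch
aborts the match, while for a proximity search every scanned word consumes
one unit of the budget and mismatching words are skipped, so all remaining
terms must be found before the budget runs out. */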
1686
1687 /*****************************************************************//**
1688 Callback function to count the number of words in position ranges,
1689 and check whether the word count is within the specified "phrase->distance".
1690 @return true if the number of words is within the "distance" */
1691 static
1692 bool
1693 fts_proximity_is_word_in_range(
1694 /*===========================*/
1695 const fts_phrase_t*
1696 phrase, /*!< in: phrase with the search info */
1697 byte* start, /*!< in: text to search */
1698 ulint total_len) /*!< in: length of text */
1699 {
1700 fts_proximity_t* proximity_pos = phrase->proximity_pos;
1701
1702 ut_ad(proximity_pos->n_pos == proximity_pos->min_pos.size());
1703 ut_ad(proximity_pos->n_pos == proximity_pos->max_pos.size());
1704
1705 /* Search each matched position pair (with min and max positions)
1706 and count the number of words in the range */
1707 for (ulint i = 0; i < proximity_pos->n_pos; i++) {
1708 ulint cur_pos = proximity_pos->min_pos[i];
1709 ulint n_word = 0;
1710
1711 ut_ad(proximity_pos->max_pos[i] <= total_len);
1712
1713 /* Walk through words in the range and count them */
1714 while (cur_pos <= proximity_pos->max_pos[i]) {
1715 ulint len;
1716 fts_string_t str;
1717
1718 len = innobase_mysql_fts_get_token(
1719 phrase->charset,
1720 start + cur_pos,
1721 start + total_len, &str);
1722
1723 if (len == 0) {
1724 break;
1725 }
1726
1727 /* Advances position with "len" bytes */
1728 cur_pos += len;
1729
1730 /* Record the number of words */
1731 if (str.f_n_char > 0) {
1732 n_word++;
1733 }
1734
1735 if (n_word > phrase->distance) {
1736 break;
1737 }
1738 }
1739
1740 /* Check if the number of words is less than specified
1741 "distance" */
1742 if (n_word && n_word <= phrase->distance) {
1743 return(true);
1744 }
1745 }
1746
1747 return(false);
1748 }
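
/* Illustrative example: for a proximity search with distance 5 (e.g. a
boolean-mode query such as '"apple pear" @5'), a candidate range whose
min/max byte positions span the text "apple juice and pear" tokenizes to
four words, which is within the distance, so the range qualifies and true
is returned. */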
1749
1750 /*****************************************************************//**
1751 FTS plugin parser 'mysql_add_word' callback function for phrase match.
1752 Refer to 'st_mysql_ftparser_param' for more detail.
1753 @return 0 if match, or return non-zero */
1754 static
1755 int
1756 fts_query_match_phrase_add_word_for_parser(
1757 /*=======================================*/
1758 MYSQL_FTPARSER_PARAM* param, /*!< in: parser param */
1759 char* word, /*!< in: token */
1760 int word_len, /*!< in: token length */
1761 MYSQL_FTPARSER_BOOLEAN_INFO* info) /*!< in: token info */
1762 {
1763 fts_phrase_param_t* phrase_param;
1764 fts_phrase_t* phrase;
1765 const ib_vector_t* tokens;
1766 fts_string_t match;
1767 fts_string_t cmp_str;
1768 const fts_string_t* token;
1769 int result;
1770 mem_heap_t* heap;
1771
1772 phrase_param = static_cast<fts_phrase_param_t*>(param->mysql_ftparam);
1773 heap = phrase_param->heap;
1774 phrase = phrase_param->phrase;
1775 tokens = phrase->tokens;
1776
1777 /* In case plugin parser doesn't check return value */
1778 if (phrase_param->token_index == ib_vector_size(tokens)) {
1779 return(1);
1780 }
1781
1782 match.f_str = reinterpret_cast<byte*>(word);
1783 match.f_len = word_len;
1784 match.f_n_char = fts_get_token_size(phrase->charset, word, word_len);
1785
1786 if (match.f_len > 0) {
1787 /* Get next token to match. */
1788 ut_a(phrase_param->token_index < ib_vector_size(tokens));
1789 token = static_cast<const fts_string_t*>(
1790 ib_vector_get_const(tokens, phrase_param->token_index));
1791
1792 fts_string_dup(&cmp_str, &match, heap);
1793
1794 result = innobase_fts_text_case_cmp(
1795 phrase->charset, token, &cmp_str);
1796
1797 if (result == 0) {
1798 phrase_param->token_index++;
1799 } else {
1800 return(1);
1801 }
1802 }
1803
1804 /* Can't be greater than the number of elements. */
1805 ut_a(phrase_param->token_index <= ib_vector_size(tokens));
1806
1807 /* This is the case for multiple words. */
1808 if (phrase_param->token_index == ib_vector_size(tokens)) {
1809 phrase->found = TRUE;
1810 }
1811
1812 return(static_cast<int>(phrase->found));
1813 }
1814
1815 /*****************************************************************//**
1816 Check whether the terms in the phrase match the text.
1817 @return TRUE if matched else FALSE */
1818 static
1819 ibool
1820 fts_query_match_phrase_terms_by_parser(
1821 /*===================================*/
1822 fts_phrase_param_t* phrase_param, /* in/out: phrase param */
1823 st_mysql_ftparser* parser, /* in: plugin fts parser */
1824 byte* text, /* in: text to check */
1825 ulint len) /* in: text length */
1826 {
1827 MYSQL_FTPARSER_PARAM param;
1828
1829 ut_a(parser);
1830
1831 /* Set the parser parameters */
1832 param.mysql_parse = fts_tokenize_document_internal;
1833 param.mysql_add_word = fts_query_match_phrase_add_word_for_parser;
1834 param.mysql_ftparam = phrase_param;
1835 param.cs = phrase_param->phrase->charset;
1836 param.doc = reinterpret_cast<char*>(text);
1837 param.length = static_cast<int>(len);
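/* Phrase matching must see every token, so ask the plugin parser
not to skip stopwords when tokenizing the text. */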
1838 param.mode = MYSQL_FTPARSER_WITH_STOPWORDS;
1839
1840 PARSER_INIT(parser, &param);
1841 parser->parse(&param);
1842 PARSER_DEINIT(parser, &param);
1843
1844 return(phrase_param->phrase->found);
1845 }
1846
1847 /*****************************************************************//**
1848 Match the phrase tokens against the document text fetched by the callback.
1849 @return TRUE if matched else FALSE */
1850 static
1851 ibool
1852 fts_query_match_phrase(
1853 /*===================*/
1854 fts_phrase_t* phrase, /*!< in: phrase to match */
1855 byte* start, /*!< in: text to search, we can't make
1856 this const because we need to first
1857 convert the string to lowercase */
1858 ulint cur_len, /*!< in: length of text */
1859 ulint prev_len, /*!< in: total length for searched
1860 doc fields*/
1861 mem_heap_t* heap) /*!< in: heap */
1862 {
1863 ulint i;
1864 const fts_string_t* first;
1865 const byte* end = start + cur_len;
1866 const ib_vector_t* tokens = phrase->tokens;
1867 const ib_vector_t* positions = phrase->match->positions;
1868
1869 ut_a(!phrase->found);
1870 ut_a(phrase->match->doc_id > 0);
1871 ut_a(ib_vector_size(tokens) > 0);
1872 ut_a(ib_vector_size(positions) > 0);
1873
1874 first = static_cast<const fts_string_t*>(
1875 ib_vector_get_const(tokens, 0));
1876
1877 ut_a(phrase->match->start < ib_vector_size(positions));
1878
1879 for (i = phrase->match->start; i < ib_vector_size(positions); ++i) {
1880 ulint pos;
1881 byte* ptr = start;
1882
1883 pos = *(ulint*) ib_vector_get_const(positions, i);
1884
1885 if (pos == ULINT_UNDEFINED) {
1886 break;
1887 }
1888
1889 if (pos < prev_len) {
1890 continue;
1891 }
1892
1893 /* Document positions are calculated from the beginning
1894 of the first field, need to save the length for each
1895 searched field to adjust the doc position when search
1896 phrases. */
1897 pos -= prev_len;
1898 ptr = start + pos;
1899
1900 /* Within limits ? */
1901 if (ptr >= end) {
1902 break;
1903 }
1904
1905 if (phrase->parser) {
1906 fts_phrase_param_t phrase_param;
1907
1908 phrase_param.phrase = phrase;
1909 phrase_param.token_index = 0;
1910 phrase_param.heap = heap;
1911
1912 if (fts_query_match_phrase_terms_by_parser(
1913 &phrase_param,
1914 phrase->parser,
1915 ptr,
1916 (end - ptr))) {
1917 break;
1918 }
1919 } else {
1920 fts_string_t match;
1921 fts_string_t cmp_str;
1922 ulint ret;
1923
1924 match.f_str = ptr;
1925 ret = innobase_mysql_fts_get_token(
1926 phrase->charset, start + pos,
1927 const_cast<byte*>(end), &match);
1928
1929 if (match.f_len == 0) {
1930 break;
1931 }
1932
1933 fts_string_dup(&cmp_str, &match, heap);
1934
1935 if (innobase_fts_text_case_cmp(
1936 phrase->charset, first, &cmp_str) == 0) {
1937
1938 /* This is the case for the single word
1939 in the phrase. */
1940 if (ib_vector_size(phrase->tokens) == 1) {
1941 phrase->found = TRUE;
1942 break;
1943 }
1944
1945 ptr += ret;
1946
1947 /* Match the remaining terms in the phrase. */
1948 if (fts_query_match_phrase_terms(phrase, &ptr,
1949 end, heap)) {
1950 break;
1951 }
1952 }
1953 }
1954 }
1955
1956 return(phrase->found);
1957 }
1958
1959 /*****************************************************************//**
1960 Callback function to fetch and search the document.
1961 @return whether the phrase is found */
1962 static
1963 ibool
1964 fts_query_fetch_document(
1965 /*=====================*/
1966 void* row, /*!< in: sel_node_t* */
1967 void* user_arg) /*!< in: fts_doc_t* */
1968 {
1969
1970 que_node_t* exp;
1971 sel_node_t* node = static_cast<sel_node_t*>(row);
1972 fts_phrase_t* phrase = static_cast<fts_phrase_t*>(user_arg);
1973 ulint prev_len = 0;
1974 ulint total_len = 0;
1975 byte* document_text = NULL;
1976
1977 exp = node->select_list;
1978
1979 phrase->found = FALSE;
1980
1981 /* For proximity search, we will need to get the whole document
1982 from all fields, so first count the total length of the document
1983 from all the fields */
1984 if (phrase->proximity_pos) {
1985 while (exp) {
1986 ulint field_len;
1987 dfield_t* dfield = que_node_get_val(exp);
1988 byte* data = static_cast<byte*>(
1989 dfield_get_data(dfield));
1990
1991 if (dfield_is_ext(dfield)) {
1992 ulint local_len = dfield_get_len(dfield);
1993
1994 local_len -= BTR_EXTERN_FIELD_REF_SIZE;
1995
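/* For an externally stored column the in-record data ends with a
BLOB reference; the length of the off-page part is kept in the last
4 bytes of its 8-byte length field (at offset BTR_EXTERN_LEN + 4). */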
1996 field_len = mach_read_from_4(
1997 data + local_len + BTR_EXTERN_LEN + 4);
1998 } else {
1999 field_len = dfield_get_len(dfield);
2000 }
2001
2002 if (field_len != UNIV_SQL_NULL) {
2003 total_len += field_len + 1;
2004 }
2005
2006 exp = que_node_get_next(exp);
2007 }
2008
2009 document_text = static_cast<byte*>(mem_heap_zalloc(
2010 phrase->heap, total_len));
2011
2012 if (!document_text) {
2013 return(FALSE);
2014 }
2015 }
2016
2017 exp = node->select_list;
2018
2019 while (exp) {
2020 dfield_t* dfield = que_node_get_val(exp);
2021 byte* data = static_cast<byte*>(
2022 dfield_get_data(dfield));
2023 ulint cur_len;
2024
2025 if (dfield_is_ext(dfield)) {
2026 data = btr_copy_externally_stored_field(
2027 &cur_len, data, phrase->page_size,
2028 dfield_get_len(dfield), phrase->heap);
2029 } else {
2030 cur_len = dfield_get_len(dfield);
2031 }
2032
2033 if (cur_len != UNIV_SQL_NULL && cur_len != 0) {
2034 if (phrase->proximity_pos) {
2035 ut_ad(prev_len + cur_len <= total_len);
2036 memcpy(document_text + prev_len, data, cur_len);
2037 } else {
2038 /* For phrase search */
2039 phrase->found =
2040 fts_query_match_phrase(
2041 phrase,
2042 static_cast<byte*>(data),
2043 cur_len, prev_len,
2044 phrase->heap);
2045 }
2046
2047 /* Document positions are calculated from the beginning
2048 of the first field, need to save the length for each
2049 searched field to adjust the doc position when search
2050 phrases. */
2051 prev_len += cur_len + 1;
2052 }
2053
2054 if (phrase->found) {
2055 break;
2056 }
2057
2058 exp = que_node_get_next(exp);
2059 }
2060
2061 if (phrase->proximity_pos) {
2062 ut_ad(prev_len <= total_len);
2063
2064 phrase->found = fts_proximity_is_word_in_range(
2065 phrase, document_text, total_len);
2066 }
2067
2068 return(phrase->found);
2069 }
2070
2071 #if 0
2072 /********************************************************************
2073 Callback function to check whether a record was found or not. */
2074 static
2075 ibool
2076 fts_query_select(
2077 /*=============*/
2078 void* row, /*!< in: sel_node_t* */
2079 void* user_arg) /*!< in: fts_doc_t* */
2080 {
2081 int i;
2082 que_node_t* exp;
2083 sel_node_t* node = row;
2084 fts_select_t* select = user_arg;
2085
2086 ut_a(select->word_freq);
2087 ut_a(select->word_freq->doc_freqs);
2088
2089 exp = node->select_list;
2090
2091 for (i = 0; exp && !select->found; ++i) {
2092 dfield_t* dfield = que_node_get_val(exp);
2093 void* data = dfield_get_data(dfield);
2094 ulint len = dfield_get_len(dfield);
2095
2096 switch (i) {
2097 case 0: /* DOC_COUNT */
2098 if (len != UNIV_SQL_NULL && len != 0) {
2099
2100 select->word_freq->doc_count +=
2101 mach_read_from_4(data);
2102 }
2103 break;
2104
2105 case 1: /* ILIST */
2106 if (len != UNIV_SQL_NULL && len != 0) {
2107
2108 fts_query_find_doc_id(select, data, len);
2109 }
2110 break;
2111
2112 default:
2113 ut_error;
2114 }
2115
2116 exp = que_node_get_next(exp);
2117 }
2118
2119 return(FALSE);
2120 }
2121
2122 /********************************************************************
2123 Read the rows from the FTS index that match the word and where the
2124 doc id is between the first and last doc id.
2125 @return DB_SUCCESS if all go well else error code */
2126 static MY_ATTRIBUTE((nonnull, warn_unused_result))
2127 dberr_t
2128 fts_query_find_term(
2129 /*================*/
2130 fts_query_t* query, /*!< in: FTS query state */
2131 que_t** graph, /*!< in: prepared statement */
2132 const fts_string_t* word, /*!< in: the word to fetch */
2133 doc_id_t doc_id, /*!< in: doc id to match */
2134 ulint* min_pos,/*!< in/out: pos found must be
2135 greater than this minimum value. */
2136 ibool* found) /*!< out: TRUE if found else FALSE */
2137 {
2138 pars_info_t* info;
2139 dberr_t error;
2140 fts_select_t select;
2141 doc_id_t match_doc_id;
2142 trx_t* trx = query->trx;
2143 char table_name[MAX_FULL_NAME_LEN];
2144
2145 trx->op_info = "fetching FTS index matching nodes";
2146
2147 if (*graph) {
2148 info = (*graph)->info;
2149 } else {
2150 ulint selected;
2151
2152 info = pars_info_create();
2153
2154 selected = fts_select_index(*word->f_str);
2155 query->fts_index_table.suffix = fts_get_suffix(selected);
2156
2157 fts_get_table_name(&query->fts_index_table, table_name);
2158 pars_info_bind_id(info, true, "index_table_name", table_name);
2159 }
2160
2161 select.found = FALSE;
2162 select.doc_id = doc_id;
2163 select.min_pos = *min_pos;
2164 select.word_freq = fts_query_add_word_freq(query, word->f_str);
2165
2166 pars_info_bind_function(info, "my_func", fts_query_select, &select);
2167 pars_info_bind_varchar_literal(info, "word", word->f_str, word->f_len);
2168
2169 /* Convert to "storage" byte order. */
2170 fts_write_doc_id((byte*) &match_doc_id, doc_id);
2171
2172 fts_bind_doc_id(info, "min_doc_id", &match_doc_id);
2173
2174 fts_bind_doc_id(info, "max_doc_id", &match_doc_id);
2175
2176 if (!*graph) {
2177
2178 *graph = fts_parse_sql(
2179 &query->fts_index_table,
2180 info,
2181 "DECLARE FUNCTION my_func;\n"
2182 "DECLARE CURSOR c IS"
2183 " SELECT doc_count, ilist\n"
2184 " FROM $index_table_name\n"
2185 " WHERE word LIKE :word AND"
2186 " first_doc_id <= :min_doc_id AND"
2187 " last_doc_id >= :max_doc_id\n"
2188 " ORDER BY first_doc_id;\n"
2189 "BEGIN\n"
2190 "\n"
2191 "OPEN c;\n"
2192 "WHILE 1 = 1 LOOP\n"
2193 " FETCH c INTO my_func();\n"
2194 " IF c % NOTFOUND THEN\n"
2195 " EXIT;\n"
2196 " END IF;\n"
2197 "END LOOP;\n"
2198 "CLOSE c;");
2199 }
2200
2201 for (;;) {
2202 error = fts_eval_sql(trx, *graph);
2203
2204 if (error == DB_SUCCESS) {
2205
2206 break; /* Exit the loop. */
2207 } else {
2208
2209 if (error == DB_LOCK_WAIT_TIMEOUT) {
2210 ib::warn() << "lock wait timeout reading FTS"
2211 " index. Retrying!";
2212
2213 trx->error_state = DB_SUCCESS;
2214 } else {
2215 ib::error() << error
2216 << " while reading FTS index.";
2217
2218 break; /* Exit the loop. */
2219 }
2220 }
2221 }
2222
2223 /* Value to return */
2224 *found = select.found;
2225
2226 if (*found) {
2227 *min_pos = select.min_pos;
2228 }
2229
2230 return(error);
2231 }
2232
2233 /********************************************************************
2234 Callback aggregator for int columns. */
2235 static
2236 ibool
2237 fts_query_sum(
2238 /*==========*/
2239 /*!< out: always returns TRUE */
2240 void* row, /*!< in: sel_node_t* */
2241 void* user_arg) /*!< in: ulint* */
2242 {
2243
2244 que_node_t* exp;
2245 sel_node_t* node = row;
2246 ulint* total = user_arg;
2247
2248 exp = node->select_list;
2249
2250 while (exp) {
2251 dfield_t* dfield = que_node_get_val(exp);
2252 void* data = dfield_get_data(dfield);
2253 ulint len = dfield_get_len(dfield);
2254
2255 if (len != UNIV_SQL_NULL && len != 0) {
2256 *total += mach_read_from_4(data);
2257 }
2258
2259 exp = que_node_get_next(exp);
2260 }
2261
2262 return(TRUE);
2263 }
2264
2265 /********************************************************************
2266 Calculate the total documents that contain a particular word (term).
2267 @return DB_SUCCESS if all go well else error code */
2268 static MY_ATTRIBUTE((nonnull, warn_unused_result))
2269 dberr_t
2270 fts_query_total_docs_containing_term(
2271 /*=================================*/
2272 fts_query_t* query, /*!< in: FTS query state */
2273 const fts_string_t* word, /*!< in: the word to check */
2274 ulint* total) /*!< out: documents containing word */
2275 {
2276 pars_info_t* info;
2277 dberr_t error;
2278 que_t* graph;
2279 ulint selected;
2280 trx_t* trx = query->trx;
2281 char table_name[MAX_FULL_NAME_LEN];
2282
2283 trx->op_info = "fetching FTS index document count";
2284
2285 *total = 0;
2286
2287 info = pars_info_create();
2288
2289 pars_info_bind_function(info, "my_func", fts_query_sum, total);
2290 pars_info_bind_varchar_literal(info, "word", word->f_str, word->f_len);
2291
2292 selected = fts_select_index(*word->f_str);
2293
2294 query->fts_index_table.suffix = fts_get_suffix(selected);
2295
2296 fts_get_table_name(&query->fts_index_table, table_name);
2297
2298 pars_info_bind_id(info, true, "index_table_name", table_name);
2299
2300 graph = fts_parse_sql(
2301 &query->fts_index_table,
2302 info,
2303 "DECLARE FUNCTION my_func;\n"
2304 "DECLARE CURSOR c IS"
2305 " SELECT doc_count\n"
2306 " FROM $index_table_name\n"
2307 " WHERE word = :word"
2308 " ORDER BY first_doc_id;\n"
2309 "BEGIN\n"
2310 "\n"
2311 "OPEN c;\n"
2312 "WHILE 1 = 1 LOOP\n"
2313 " FETCH c INTO my_func();\n"
2314 " IF c % NOTFOUND THEN\n"
2315 " EXIT;\n"
2316 " END IF;\n"
2317 "END LOOP;\n"
2318 "CLOSE c;");
2319
2320 for (;;) {
2321 error = fts_eval_sql(trx, graph);
2322
2323 if (error == DB_SUCCESS) {
2324
2325 break; /* Exit the loop. */
2326 } else {
2327
2328 if (error == DB_LOCK_WAIT_TIMEOUT) {
2329 ib::warn() << "lock wait timeout reading FTS"
2330 " index. Retrying!";
2331
2332 trx->error_state = DB_SUCCESS;
2333 } else {
2334 ib::error() << error
2335 << " while reading FTS index.";
2336
2337 break; /* Exit the loop. */
2338 }
2339 }
2340 }
2341
2342 fts_que_graph_free(graph);
2343
2344 return(error);
2345 }
2346
2347 /********************************************************************
2348 Get the total number of words in a document.
2349 @return DB_SUCCESS if all go well else error code */
2350 static MY_ATTRIBUTE((nonnull, warn_unused_result))
2351 dberr_t
2352 fts_query_terms_in_document(
2353 /*========================*/
2354 fts_query_t* query, /*!< in: FTS query state */
2355 doc_id_t doc_id, /*!< in: the word to check */
2356 ulint* total) /*!< out: total words in document */
2357 {
2358 pars_info_t* info;
2359 dberr_t error;
2360 que_t* graph;
2361 doc_id_t read_doc_id;
2362 trx_t* trx = query->trx;
2363 char table_name[MAX_FULL_NAME_LEN];
2364
2365 trx->op_info = "fetching FTS document term count";
2366
2367 *total = 0;
2368
2369 info = pars_info_create();
2370
2371 pars_info_bind_function(info, "my_func", fts_query_sum, total);
2372
2373 /* Convert to "storage" byte order. */
2374 fts_write_doc_id((byte*) &read_doc_id, doc_id);
2375 fts_bind_doc_id(info, "doc_id", &read_doc_id);
2376
2377 query->fts_index_table.suffix = "DOC_ID";
2378
2379 fts_get_table_name(&query->fts_index_table, table_name);
2380
2381 pars_info_bind_id(info, true, "index_table_name", table_name);
2382
2383 graph = fts_parse_sql(
2384 &query->fts_index_table,
2385 info,
2386 "DECLARE FUNCTION my_func;\n"
2387 "DECLARE CURSOR c IS"
2388 " SELECT count\n"
2389 " FROM $index_table_name\n"
2390 " WHERE doc_id = :doc_id"
2391 " BEGIN\n"
2392 "\n"
2393 "OPEN c;\n"
2394 "WHILE 1 = 1 LOOP\n"
2395 " FETCH c INTO my_func();\n"
2396 " IF c % NOTFOUND THEN\n"
2397 " EXIT;\n"
2398 " END IF;\n"
2399 "END LOOP;\n"
2400 "CLOSE c;");
2401
2402 for (;;) {
2403 error = fts_eval_sql(trx, graph);
2404
2405 if (error == DB_SUCCESS) {
2406
2407 break; /* Exit the loop. */
2408 } else {
2409
2410 if (error == DB_LOCK_WAIT_TIMEOUT) {
2411 ib::warn() << "lock wait timeout reading FTS"
2412 " doc id table. Retrying!";
2413
2414 trx->error_state = DB_SUCCESS;
2415 } else {
2416 ib::error() << error << " while reading FTS"
2417 " doc id table.";
2418
2419 break; /* Exit the loop. */
2420 }
2421 }
2422 }
2423
2424 fts_que_graph_free(graph);
2425
2426 return(error);
2427 }
2428 #endif
2429
2430 /*****************************************************************//**
2431 Retrieve the document and match the phrase tokens.
2432 @return DB_SUCCESS or error code */
2433 static MY_ATTRIBUTE((nonnull, warn_unused_result))
2434 dberr_t
2435 fts_query_match_document(
2436 /*=====================*/
2437 ib_vector_t* tokens, /*!< in: phrase tokens */
2438 fts_get_doc_t* get_doc, /*!< in: table and prepared statements */
2439 fts_match_t* match, /*!< in: doc id and positions */
2440 ulint distance, /*!< in: proximity distance */
2441 st_mysql_ftparser* parser, /*!< in: fts plugin parser */
2442 ibool* found) /*!< out: TRUE if phrase found */
2443 {
2444 dberr_t error;
2445 fts_phrase_t phrase(get_doc->index_cache->index->table);
2446
2447 phrase.match = match; /* Positions to match */
2448 phrase.tokens = tokens; /* Tokens to match */
2449 phrase.distance = distance;
2450 phrase.charset = get_doc->index_cache->charset;
2451 phrase.heap = mem_heap_create(512);
2452 phrase.parser = parser;
2453
2454 *found = phrase.found = FALSE;
2455
2456 error = fts_doc_fetch_by_doc_id(
2457 get_doc, match->doc_id, NULL, FTS_FETCH_DOC_BY_ID_EQUAL,
2458 fts_query_fetch_document, &phrase);
2459
2460 if (error != DB_SUCCESS) {
2461 ib::error() << "(" << ut_strerr(error)
2462 << ") matching document.";
2463 } else {
2464 *found = phrase.found;
2465 }
2466
2467 mem_heap_free(phrase.heap);
2468
2469 return(error);
2470 }
2471
2472 /*****************************************************************//**
2473 This function fetches the original documents and counts the
2474 words between the matching words to check that they are within the specified distance.
2475 @return true if the matching words are within the specified proximity distance */
2476 static MY_ATTRIBUTE((nonnull, warn_unused_result))
2477 bool
2478 fts_query_is_in_proximity_range(
2479 /*============================*/
2480 const fts_query_t* query, /*!< in: query instance */
2481 fts_match_t** match, /*!< in: matching doc ids and positions */
2482 fts_proximity_t* qualified_pos) /*!< in: position info for
2483 qualified ranges */
2484 {
2485 fts_get_doc_t get_doc;
2486 fts_cache_t* cache = query->index->table->fts->cache;
2487 dberr_t err;
2488
2489 memset(&get_doc, 0x0, sizeof(get_doc));
2490
2491 rw_lock_x_lock(&cache->lock);
2492 get_doc.index_cache = fts_find_index_cache(cache, query->index);
2493 rw_lock_x_unlock(&cache->lock);
2494 ut_a(get_doc.index_cache != NULL);
2495
2496 fts_phrase_t phrase(get_doc.index_cache->index->table);
2497
2498 phrase.distance = query->distance;
2499 phrase.charset = get_doc.index_cache->charset;
2500 phrase.heap = mem_heap_create(512);
2501 phrase.proximity_pos = qualified_pos;
2502 phrase.found = FALSE;
2503
2504 err = fts_doc_fetch_by_doc_id(
2505 &get_doc, match[0]->doc_id, NULL, FTS_FETCH_DOC_BY_ID_EQUAL,
2506 fts_query_fetch_document, &phrase);
2507
2508 if (err != DB_SUCCESS) {
2509 ib::error() << "(" << ut_strerr(err) << ") in verification"
2510 " phase of proximity search";
2511 }
2512
2513 /* Free the prepared statement. */
2514 if (get_doc.get_document_graph) {
2515 fts_que_graph_free(get_doc.get_document_graph);
2516 get_doc.get_document_graph = NULL;
2517 }
2518
2519 mem_heap_free(phrase.heap);
2520
2521 return(err == DB_SUCCESS && phrase.found);
2522 }
2523
2524 /*****************************************************************//**
2525 Iterate over the matched document ids and search for the
2526 actual phrase in the text.
2527 @return DB_SUCCESS if all OK */
2528 static MY_ATTRIBUTE((nonnull, warn_unused_result))
2529 dberr_t
2530 fts_query_search_phrase(
2531 /*====================*/
2532 fts_query_t* query, /*!< in: query instance */
2533 ib_vector_t* orig_tokens, /*!< in: tokens to search,
2534 with any stopwords in the
2535 original phrase */
2536 ib_vector_t* tokens) /*!< in: tokens that do
2537 not include stopwords and
2538 can be used to calculate
2539 ranking */
2540 {
2541 ulint i;
2542 fts_get_doc_t get_doc;
2543 ulint n_matched;
2544 fts_cache_t* cache = query->index->table->fts->cache;
2545
2546 n_matched = ib_vector_size(query->matched);
2547
2548 /* Setup the doc retrieval infrastructure. */
2549 memset(&get_doc, 0x0, sizeof(get_doc));
2550
2551 rw_lock_x_lock(&cache->lock);
2552
2553 get_doc.index_cache = fts_find_index_cache(cache, query->index);
2554
2555 /* Must find the index cache */
2556 ut_a(get_doc.index_cache != NULL);
2557
2558 rw_lock_x_unlock(&cache->lock);
2559
2560 #ifdef FTS_INTERNAL_DIAG_PRINT
2561 ib::info() << "Start phrase search";
2562 #endif
2563
2564 /* Read the document from disk and do the actual
2565 match, matching documents will be added to the current
2566 doc id set. */
2567 for (i = 0; i < n_matched && query->error == DB_SUCCESS; ++i) {
2568 fts_match_t* match;
2569 ibool found = FALSE;
2570
2571 match = static_cast<fts_match_t*>(
2572 ib_vector_get(query->matched, i));
2573
2574 /* Skip the document ids that were filtered out by
2575 an earlier pass. */
2576 if (match->doc_id != 0) {
2577
2578 query->error = fts_query_match_document(
2579 orig_tokens, &get_doc, match,
2580 query->distance, query->parser, &found);
2581
2582 if (query->error == DB_SUCCESS && found) {
2583 ulint z;
2584
2585 query->error = fts_query_process_doc_id(query,
2586 match->doc_id, 0);
2587 if (query->error != DB_SUCCESS) {
2588 goto func_exit;
2589 }
2590
2591 for (z = 0; z < ib_vector_size(tokens); z++) {
2592 fts_string_t* token;
2593 token = static_cast<fts_string_t*>(
2594 ib_vector_get(tokens, z));
2595 fts_query_add_word_to_document(
2596 query, match->doc_id, token);
2597 }
2598 }
2599 }
2600 }
2601
2602 func_exit:
2603 /* Free the prepared statement. */
2604 if (get_doc.get_document_graph) {
2605 fts_que_graph_free(get_doc.get_document_graph);
2606 get_doc.get_document_graph = NULL;
2607 }
2608
2609 return(query->error);
2610 }
2611
2612 /** Split the phrase into tokens
2613 @param[in,out] query query instance
2614 @param[in] node query node to search
2615 @param[in,out] tokens token vector
2616 @param[in,out] orig_tokens original node tokens include stopword
2617 @param[in,out] heap mem heap */
2618 static
2619 void
2620 fts_query_phrase_split(
2621 fts_query_t* query,
2622 const fts_ast_node_t* node,
2623 ib_vector_t* tokens,
2624 ib_vector_t* orig_tokens,
2625 mem_heap_t* heap)
2626 {
2627 fts_string_t phrase;
2628 ulint len = 0;
2629 ulint cur_pos = 0;
2630 fts_ast_node_t* term_node = NULL;
2631
2632 if (node->type == FTS_AST_TEXT) {
2633 phrase.f_str = node->text.ptr->str;
2634 phrase.f_len = node->text.ptr->len;
2635 len = phrase.f_len;
2636 } else {
2637 ut_ad(node->type == FTS_AST_PARSER_PHRASE_LIST);
2638 phrase.f_str = NULL;
2639 phrase.f_len = 0;
2640 term_node = node->list.head;
2641 }
2642
2643 while (true) {
2644 fts_cache_t* cache = query->index->table->fts->cache;
2645 ulint cur_len;
2646 fts_string_t result_str;
2647
2648 if (node->type == FTS_AST_TEXT) {
2649 if (cur_pos >= len) {
2650 break;
2651 }
2652
2653 cur_len = innobase_mysql_fts_get_token(
2654 query->fts_index_table.charset,
2655 reinterpret_cast<const byte*>(phrase.f_str)
2656 + cur_pos,
2657 reinterpret_cast<const byte*>(phrase.f_str)
2658 + len,
2659 &result_str);
2660
2661 if (cur_len == 0) {
2662 break;
2663 }
2664
2665 cur_pos += cur_len;
2666 } else {
2667 ut_ad(node->type == FTS_AST_PARSER_PHRASE_LIST);
2668 /* Term node in parser phrase list */
2669 if (term_node == NULL) {
2670 break;
2671 }
2672
2673 ut_a(term_node->type == FTS_AST_TERM);
2674 result_str.f_str = term_node->term.ptr->str;
2675 result_str.f_len = term_node->term.ptr->len;
2676 result_str.f_n_char = fts_get_token_size(
2677 query->fts_index_table.charset,
2678 reinterpret_cast<char*>(result_str.f_str),
2679 result_str.f_len);
2680
2681 term_node = term_node->next;
2682 }
2683
2684 if (result_str.f_n_char == 0) {
2685 continue;
2686 }
2687
2688 fts_string_t* token = static_cast<fts_string_t*>(
2689 ib_vector_push(tokens, NULL));
2690 fts_string_dup(token, &result_str, heap);
2691
2692 if (fts_check_token(
2693 &result_str,
2694 cache->stopword_info.cached_stopword,
2695 query->index->is_ngram,
2696 query->fts_index_table.charset)) {
2697 /* Add the word to the RB tree so that we can
2698 calculate its frequency within a document. */
2699 fts_query_add_word_freq(query, token);
2700 } else {
2701 ib_vector_pop(tokens);
2702 }
2703
2704 /* we will start to store all words including stopwords
2705 in the "orig_tokens" vector, but skip any leading words
2706 that are stopwords */
2707 if (!ib_vector_is_empty(tokens)) {
2708 fts_string_t* orig_token = static_cast<fts_string_t*>(
2709 ib_vector_push(orig_tokens, NULL));
2710
2711 orig_token->f_str = token->f_str;
2712 orig_token->f_len = token->f_len;
2713 }
2714 }
2715 }
2716
2717 /*****************************************************************//**
2718 Text/Phrase search.
2719 @return DB_SUCCESS or error code */
2720 static MY_ATTRIBUTE((warn_unused_result))
2721 dberr_t
2722 fts_query_phrase_search(
2723 /*====================*/
2724 fts_query_t* query, /*!< in: query instance */
2725 const fts_ast_node_t* node) /*!< in: node to search */
2726 {
2727 ib_vector_t* tokens;
2728 ib_vector_t* orig_tokens;
2729 mem_heap_t* heap = mem_heap_create(sizeof(fts_string_t));
2730 ib_alloc_t* heap_alloc;
2731 ulint num_token;
2732
2733 heap_alloc = ib_heap_allocator_create(heap);
2734
2735 tokens = ib_vector_create(heap_alloc, sizeof(fts_string_t), 4);
2736 orig_tokens = ib_vector_create(heap_alloc, sizeof(fts_string_t), 4);
2737
2738 if (query->distance != ULINT_UNDEFINED && query->distance > 0) {
2739 query->flags = FTS_PROXIMITY;
2740 } else {
2741 query->flags = FTS_PHRASE;
2742 }
2743
2744 /* Split the phrase into tokens. */
2745 fts_query_phrase_split(query, node, tokens, orig_tokens, heap);
2746
2747 num_token = ib_vector_size(tokens);
2748 if (num_token > MAX_PROXIMITY_ITEM) {
2749 query->error = DB_FTS_TOO_MANY_WORDS_IN_PHRASE;
2750 goto func_exit;
2751 }
2752
2753 ut_ad(ib_vector_size(orig_tokens) >= num_token);
2754
2755 /* Ignore empty strings. */
2756 if (num_token > 0) {
2757 fts_string_t* token;
2758 fts_fetch_t fetch;
2759 trx_t* trx = query->trx;
2760 fts_ast_oper_t oper = query->oper;
2761 que_t* graph = NULL;
2762 ulint i;
2763 dberr_t error;
2764
2765 /* Create the vector for storing matching document ids
2766 and the positions of the first token of the phrase. */
2767 if (!query->matched) {
2768 ib_alloc_t* heap_alloc;
2769
2770 heap_alloc = ib_heap_allocator_create(heap);
2771
2772 if (!(query->flags & FTS_PROXIMITY)
2773 && !(query->flags & FTS_PHRASE)) {
2774 query->matched = ib_vector_create(
2775 heap_alloc, sizeof(fts_match_t),
2776 64);
2777 } else {
2778 ut_a(num_token <= MAX_PROXIMITY_ITEM);
2779 query->match_array =
2780 (ib_vector_t**) mem_heap_alloc(
2781 heap,
2782 num_token *
2783 sizeof(query->matched));
2784
2785 for (i = 0; i < num_token; i++) {
2786 query->match_array[i] =
2787 ib_vector_create(
2788 heap_alloc, sizeof(fts_match_t),
2789 64);
2790 }
2791
2792 query->matched = query->match_array[0];
2793 }
2794 }
2795
2796 /* Setup the callback args for filtering and consolidating
2797 the ilist. */
2798 fetch.read_arg = query;
2799 fetch.read_record = fts_query_index_fetch_nodes;
2800
2801 for (i = 0; i < num_token; i++) {
2802 /* Search for the first word from the phrase. */
2803 token = static_cast<fts_string_t*>(
2804 ib_vector_get(tokens, i));
2805
2806 if (query->flags & FTS_PROXIMITY
2807 || query->flags & FTS_PHRASE) {
2808 query->matched = query->match_array[i];
2809 }
2810
2811 error = fts_index_fetch_nodes(
2812 trx, &graph, &query->fts_index_table,
2813 token, &fetch);
2814
2815 /* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */
2816 ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS));
2817 if (error != DB_SUCCESS) {
2818 query->error = error;
2819 }
2820
2821 fts_que_graph_free(graph);
2822 graph = NULL;
2823
2824 fts_query_cache(query, token);
2825
2826 if (!(query->flags & FTS_PHRASE)
2827 && !(query->flags & FTS_PROXIMITY)) {
2828 break;
2829 }
2830
2831 /* If any of the tokens can't be found,
2832 there is no need to continue the match */
2833 if (ib_vector_is_empty(query->match_array[i])
2834 || query->error != DB_SUCCESS) {
2835 goto func_exit;
2836 }
2837 }
2838
2839 /* Just a single word, no need to fetch the original
2840 documents to do phrase matching */
2841 if (ib_vector_size(orig_tokens) == 1
2842 && !ib_vector_is_empty(query->match_array[0])) {
2843 fts_match_t* match;
2844 ulint n_matched;
2845
2846 n_matched = ib_vector_size(query->match_array[0]);
2847
2848 for (i = 0; i < n_matched; i++) {
2849 match = static_cast<fts_match_t*>(
2850 ib_vector_get(
2851 query->match_array[0], i));
2852
2853 query->error = fts_query_process_doc_id(
2854 query, match->doc_id, 0);
2855 if (query->error != DB_SUCCESS) {
2856 goto func_exit;
2857 }
2858
2859 fts_query_add_word_to_document(
2860 query, match->doc_id, token);
2861 }
2862 query->oper = oper;
2863 goto func_exit;
2864 }
2865
2866 /* If we are doing a proximity search, verify the distance
2867 between all words, and check that they are within the specified distance. */
2868 if (query->flags & FTS_PROXIMITY) {
2869 fts_phrase_or_proximity_search(query, tokens);
2870 } else {
2871 ibool matched;
2872
2873 /* Phrase Search case:
2874 We filter out the doc ids that don't contain
2875 all the tokens in the phrase. It's cheaper to
2876 search the ilist than bringing the documents in
2877 and then doing a search through the text. Isolated
2878 testing shows this also helps in mitigating disruption
2879 of the buffer cache. */
2880 matched = fts_phrase_or_proximity_search(query, tokens);
2881 query->matched = query->match_array[0];
2882
2883 /* Read the actual text in and search for the phrase. */
2884 if (matched) {
2885 ut_ad(query->error == DB_SUCCESS);
2886 query->error = fts_query_search_phrase(
2887 query, orig_tokens, tokens);
2888 }
2889 }
2890
2891 /* Restore original operation. */
2892 query->oper = oper;
2893
2894 if (query->error != DB_SUCCESS) {
2895 goto func_exit;
2896 }
2897 }
2898
2899 func_exit:
2900 mem_heap_free(heap);
2901
2902 /* Don't need it anymore. */
2903 query->matched = NULL;
2904
2905 return(query->error);
2906 }
2907
2908 /*****************************************************************//**
2909 Find the word and evaluate.
2910 @return DB_SUCCESS if all go well */
2911 static MY_ATTRIBUTE((nonnull, warn_unused_result))
2912 dberr_t
2913 fts_query_execute(
2914 /*==============*/
2915 fts_query_t* query, /*!< in: query instance */
2916 fts_string_t* token) /*!< in: token to search */
2917 {
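/* Map the boolean operator onto a set operation over the current
result set: FTS_EXIST intersects, FTS_IGNORE subtracts, and the
remaining operators union the matches in. */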
2918 switch (query->oper) {
2919 case FTS_NONE:
2920 case FTS_NEGATE:
2921 case FTS_INCR_RATING:
2922 case FTS_DECR_RATING:
2923 query->error = fts_query_union(query, token);
2924 break;
2925
2926 case FTS_EXIST:
2927 query->error = fts_query_intersect(query, token);
2928 break;
2929
2930 case FTS_IGNORE:
2931 query->error = fts_query_difference(query, token);
2932 break;
2933
2934 default:
2935 ut_error;
2936 }
2937
2938 return(query->error);
2939 }
2940
2941 /*****************************************************************//**
2942 Create a wildcard string. It's the responsibility of the caller to
2943 free the byte* pointer. It's allocated using ut_malloc_nokey().
2944 @return ptr to allocated memory */
2945 static
2946 byte*
2947 fts_query_get_token(
2948 /*================*/
2949 fts_ast_node_t* node, /*!< in: the current sub tree */
2950 fts_string_t* token) /*!< in: token to create */
2951 {
2952 ulint str_len;
2953 byte* new_ptr = NULL;
2954
2955 str_len = node->term.ptr->len;
2956
2957 ut_a(node->type == FTS_AST_TERM);
2958
2959 token->f_len = str_len;
2960 token->f_str = node->term.ptr->str;
2961
2962 if (node->term.wildcard) {
2963
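/* Append a '%' so the prefix token can be used in the LIKE
predicate when probing the auxiliary index tables. */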
2964 token->f_str = static_cast<byte*>(ut_malloc_nokey(str_len + 2));
2965 token->f_len = str_len + 1;
2966
2967 memcpy(token->f_str, node->term.ptr->str, str_len);
2968
2969 token->f_str[str_len] = '%';
2970 token->f_str[token->f_len] = 0;
2971
2972 new_ptr = token->f_str;
2973 }
2974
2975 return(new_ptr);
2976 }
2977
2978 /*****************************************************************//**
2979 Visit every node of the AST. */
2980 static
2981 dberr_t
2982 fts_query_visitor(
2983 /*==============*/
2984 fts_ast_oper_t oper, /*!< in: current operator */
2985 fts_ast_node_t* node, /*!< in: The root of the current subtree*/
2986 void* arg) /*!< in: callback arg*/
2987 {
2988 byte* ptr;
2989 fts_string_t token;
2990 fts_query_t* query = static_cast<fts_query_t*>(arg);
2991
2992 ut_a(node);
2993 DBUG_ENTER("fts_query_visitor");
2994 DBUG_PRINT("fts", ("nodetype: %s", fts_ast_node_type_get(node->type)));
2995
2996 token.f_n_char = 0;
2997 query->oper = oper;
2998 query->cur_node = node;
2999
3000 switch (node->type) {
3001 case FTS_AST_TEXT:
3002 case FTS_AST_PARSER_PHRASE_LIST:
3003
3004 if (query->oper == FTS_EXIST) {
3005 ut_ad(query->intersection == NULL);
3006 query->intersection = rbt_create(
3007 sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
3008
3009 query->total_size += SIZEOF_RBT_CREATE;
3010 }
3011
3012 /* Set the current proximity distance. */
3013 query->distance = node->text.distance;
3014
3015 /* Force collection of doc ids and the positions. */
3016 query->collect_positions = TRUE;
3017
3018 query->error = fts_query_phrase_search(query, node);
3019
3020 query->collect_positions = FALSE;
3021
3022 if (query->oper == FTS_EXIST) {
3023 fts_query_free_doc_ids(query, query->doc_ids);
3024 query->doc_ids = query->intersection;
3025 query->intersection = NULL;
3026 }
3027
3028 break;
3029
3030 case FTS_AST_TERM:
3031 token.f_str = node->term.ptr->str;
3032 token.f_len = node->term.ptr->len;
3033
3034 /* Collect wildcard words for QUERY EXPANSION. */
3035 if (node->term.wildcard && query->wildcard_words != NULL) {
3036 ib_rbt_bound_t parent;
3037
3038 if (rbt_search(query->wildcard_words, &parent, &token)
3039 != 0) {
3040 fts_string_t word;
3041
3042 fts_string_dup(&word, &token, query->heap);
3043 rbt_add_node(query->wildcard_words, &parent,
3044 &word);
3045 }
3046 }
3047
3048 /* Add the word to our RB tree that will be used to
3049 calculate this terms per document frequency. */
3050 fts_query_add_word_freq(query, &token);
3051
3052 ptr = fts_query_get_token(node, &token);
3053 query->error = fts_query_execute(query, &token);
3054
3055 if (ptr) {
3056 ut_free(ptr);
3057 }
3058
3059 break;
3060
3061 case FTS_AST_SUBEXP_LIST:
3062 query->error = fts_ast_visit_sub_exp(node, fts_query_visitor, arg);
3063 break;
3064
3065 default:
3066 ut_error;
3067 }
3068
3069 if (query->oper == FTS_EXIST) {
3070 query->multi_exist = true;
3071 }
3072
3073 DBUG_RETURN(query->error);
3074 }
3075
3076 /*****************************************************************//**
3077 Process (nested) sub-expression, create a new result set to store the
3078 sub-expression result by processing nodes under current sub-expression
3079 list. Merge the sub-expression result with that of parent expression list.
3080 @return DB_SUCCESS if all go well */
3081 dberr_t
3082 fts_ast_visit_sub_exp(
3083 /*==================*/
3084 fts_ast_node_t* node, /*!< in,out: current root node */
3085 fts_ast_callback visitor, /*!< in: callback function */
3086 void* arg) /*!< in,out: arg for callback */
3087 {
3088 fts_ast_oper_t cur_oper;
3089 fts_query_t* query = static_cast<fts_query_t*>(arg);
3090 ib_rbt_t* parent_doc_ids;
3091 ib_rbt_t* subexpr_doc_ids;
3092 dberr_t error = DB_SUCCESS;
3093 bool will_be_ignored = false;
3094 bool multi_exist;
3095
3096 DBUG_ENTER("fts_ast_visit_sub_exp");
3097
3098 /* A sub-expression list may itself contain sub-expressions,
3099 so we increase the sub-expression depth counter.
3100 If this counter reaches the threshold then
3101 we abort the search operation and report an error */
3102 if (query->nested_exp_count > FTS_MAX_NESTED_EXP) {
3103 query->error = DB_FTS_TOO_MANY_NESTED_EXP;
3104 DBUG_RETURN(query->error);
3105 }
3106 query->nested_exp_count++;
3107
3108
3109 ut_a(node->type == FTS_AST_SUBEXP_LIST);
3110
3111 cur_oper = query->oper;
3112
3113 /* Save current result set */
3114 parent_doc_ids = query->doc_ids;
3115
3116 /* Create new result set to store the sub-expression result. We
3117 will merge this result set with the parent after processing. */
3118 query->doc_ids = rbt_create(sizeof(fts_ranking_t),
3119 fts_ranking_doc_id_cmp);
3120
3121 query->total_size += SIZEOF_RBT_CREATE;
3122
3123 multi_exist = query->multi_exist;
3124 query->multi_exist = false;
3125 /* Process nodes in current sub-expression and store its
3126 result set in query->doc_ids we created above. */
3127 error = fts_ast_visit(FTS_NONE, node, visitor,
3128 arg, &will_be_ignored);
3129
3130 /* Reinstate parent node state */
3131 query->multi_exist = multi_exist;
3132 query->oper = cur_oper;
3133
3134 /* Merge the sub-expression result with the parent result set. */
3135 subexpr_doc_ids = query->doc_ids;
3136 query->doc_ids = parent_doc_ids;
3137 if (error == DB_SUCCESS) {
3138 error = fts_merge_doc_ids(query, subexpr_doc_ids);
3139 }
3140
3141 /* Free current result set. Result already merged into parent. */
3142 fts_query_free_doc_ids(query, subexpr_doc_ids);
3143
3144 DBUG_RETURN(error);
3145 }
3146
3147 #if 0
3148 /*****************************************************************//**
3149 Check if the doc id exists in the ilist.
3150 @return TRUE if doc id found */
3151 static
3152 ulint
3153 fts_query_find_doc_id(
3154 /*==================*/
3155 fts_select_t* select, /*!< in/out: contains the doc id to
3156 find, we update the word freq if
3157 document found */
3158 void* data, /*!< in: doc id ilist */
3159 ulint len) /*!< in: doc id ilist size */
3160 {
3161 byte* ptr = data;
3162 doc_id_t doc_id = 0;
3163 ulint decoded = 0;
3164
3165 /* Decode the ilist and search for selected doc_id. We also
3166 calculate the frequency of the word in the document if found. */
3167 while (decoded < len && !select->found) {
3168 ulint freq = 0;
3169 ulint min_pos = 0;
3170 ulint last_pos = 0;
3171 ulint pos = fts_decode_vlc(&ptr);
3172
3173 /* Add the delta. */
3174 doc_id += pos;
3175
3176 while (*ptr) {
3177 ++freq;
3178 last_pos += fts_decode_vlc(&ptr);
3179
3180 /* Only if min_pos is not set and the current
3181 term exists in a position greater than the
3182 min_pos of the previous term. */
3183 if (min_pos == 0 && last_pos > select->min_pos) {
3184 min_pos = last_pos;
3185 }
3186 }
3187
3188 /* Skip the end of word position marker. */
3189 ++ptr;
3190
3191 /* Bytes decoded so far. */
3192 decoded = ptr - (byte*) data;
3193
3194 /* A word may exist in the document but we only consider a
3195 match if it exists in a position that is greater than the
3196 position of the previous term. */
3197 if (doc_id == select->doc_id && min_pos > 0) {
3198 fts_doc_freq_t* doc_freq;
3199
3200 /* Add the doc id to the doc freq rb tree, if
3201 the doc id doesn't exist it will be created. */
3202 doc_freq = fts_query_add_doc_freq(
3203 select->word_freq->doc_freqs, doc_id);
3204
3205 /* Avoid duplicating the frequency tally */
3206 if (doc_freq->freq == 0) {
3207 doc_freq->freq = freq;
3208 }
3209
3210 select->found = TRUE;
3211 select->min_pos = min_pos;
3212 }
3213 }
3214
3215 return(select->found);
3216 }
3217 #endif
3218
3219 /*****************************************************************//**
3220 Read and filter nodes.
3221 @return DB_SUCCESS if all go well,
3222 or return DB_FTS_EXCEED_RESULT_CACHE_LIMIT */
3223 static
3224 dberr_t
3225 fts_query_filter_doc_ids(
3226 /*=====================*/
3227 fts_query_t* query, /*!< in: query instance */
3228 const fts_string_t* word, /*!< in: the current word */
3229 fts_word_freq_t* word_freq, /*!< in/out: word frequency */
3230 const fts_node_t* node, /*!< in: current FTS node */
3231 void* data, /*!< in: doc id ilist */
3232 ulint len, /*!< in: doc id ilist size */
3233 ibool calc_doc_count) /*!< in: whether to remember doc count */
3234 {
3235 byte* ptr = static_cast<byte*>(data);
3236 doc_id_t doc_id = 0;
3237 ulint decoded = 0;
3238 ib_rbt_t* doc_freqs = word_freq->doc_freqs;
3239
3240 if (query->limit != ULONG_UNDEFINED
3241 && query->n_docs >= query->limit) {
3242 return(DB_SUCCESS);
3243 }
3244
3245 /* Decode the ilist and add the doc ids to the query doc_id set. */
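/* Each ilist entry is a variable-length-coded doc id delta followed
by a zero-terminated list of variable-length-coded position deltas
within that document. */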
3246 while (decoded < len) {
3247 ulint freq = 0;
3248 fts_doc_freq_t* doc_freq;
3249 fts_match_t* match = NULL;
3250 ulint last_pos = 0;
3251 ulint pos = fts_decode_vlc(&ptr);
3252
3253 /* Some sanity checks. */
3254 if (doc_id == 0) {
3255 ut_a(pos == node->first_doc_id);
3256 }
3257
3258 /* Add the delta. */
3259 doc_id += pos;
3260
3261 if (calc_doc_count) {
3262 word_freq->doc_count++;
3263 }
3264
3265 /* We simply collect the matching instances here. */
3266 if (query->collect_positions) {
3267 ib_alloc_t* heap_alloc;
3268
3269 /* Create a new fts_match_t instance. */
3270 match = static_cast<fts_match_t*>(
3271 ib_vector_push(query->matched, NULL));
3272
3273 match->start = 0;
3274 match->doc_id = doc_id;
3275 heap_alloc = ib_vector_allocator(query->matched);
3276
3277 /* Allocate from the same heap as the
3278 parent container. */
3279 match->positions = ib_vector_create(
3280 heap_alloc, sizeof(ulint), 64);
3281
3282 query->total_size += sizeof(fts_match_t)
3283 + sizeof(ib_vector_t)
3284 + sizeof(ulint) * 64;
3285 }
3286
3287 /* Unpack the positions within the document. */
3288 while (*ptr) {
3289 last_pos += fts_decode_vlc(&ptr);
3290
3291 /* Collect the matching word positions, for phrase
3292 matching later. */
3293 if (query->collect_positions) {
3294 ib_vector_push(match->positions, &last_pos);
3295 }
3296
3297 ++freq;
3298 }
3299
3300 /* End of list marker. */
3301 last_pos = (ulint) -1;
3302
3303 if (query->collect_positions) {
3304 ut_a(match != NULL);
3305 ib_vector_push(match->positions, &last_pos);
3306 }
3307
3308 /* Add the doc id to the doc freq rb tree, if the doc id
3309 doesn't exist it will be created. */
3310 doc_freq = fts_query_add_doc_freq(query, doc_freqs, doc_id);
3311
3312 /* Avoid duplicating frequency tally. */
3313 if (doc_freq->freq == 0) {
3314 doc_freq->freq = freq;
3315 }
3316
3317 /* Skip the end of word position marker. */
3318 ++ptr;
3319
3320 /* Bytes decoded so far */
3321 decoded = ptr - (byte*) data;
3322
3323 /* We simply collect the matching documents and the
3324 positions here and match later. */
3325 if (!query->collect_positions) {
3326 /* We ignore error here and will check it later */
3327 fts_query_process_doc_id(query, doc_id, 0);
3328
3329 /* Add the word to the document's matched RB tree. */
3330 fts_query_add_word_to_document(query, doc_id, word);
3331 }
3332
3333 if (query->limit != ULONG_UNDEFINED
3334 && query->limit <= ++query->n_docs) {
3335 goto func_exit;
3336 }
3337 }
3338
3339 /* Some sanity checks. */
3340 ut_a(doc_id == node->last_doc_id);
3341
3342 func_exit:
3343 if (query->total_size > fts_result_cache_limit) {
3344 return(DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
3345 } else {
3346 query->n_docs = 0;
3347 return(DB_SUCCESS);
3348 }
3349 }
3350
3351 /*****************************************************************//**
3352 Read the FTS INDEX row.
3353 @return DB_SUCCESS if all go well. */
3354 static
3355 dberr_t
3356 fts_query_read_node(
3357 /*================*/
3358 fts_query_t* query, /*!< in: query instance */
3359 const fts_string_t* word, /*!< in: current word */
3360 que_node_t* exp) /*!< in: query graph node */
3361 {
3362 int i;
3363 int ret;
3364 fts_node_t node;
3365 ib_rbt_bound_t parent;
3366 fts_word_freq_t* word_freq;
3367 ibool skip = FALSE;
3368 fts_string_t term;
3369 byte buf[FTS_MAX_WORD_LEN + 1];
3370 dberr_t error = DB_SUCCESS;
3371
3372 ut_a(query->cur_node->type == FTS_AST_TERM
3373 || query->cur_node->type == FTS_AST_TEXT
3374 || query->cur_node->type == FTS_AST_PARSER_PHRASE_LIST);
3375
3376 memset(&node, 0, sizeof(node));
3377 term.f_str = buf;
3378
3379 /* Consider the wildcard search case: the word frequency
3380 is tracked for the search string, not the actual matched word, so we
3381 need to attribute the frequency to the search string. */
3382 if (query->cur_node->type == FTS_AST_TERM
3383 && query->cur_node->term.wildcard) {
3384
3385 term.f_len = query->cur_node->term.ptr->len;
3386 ut_ad(FTS_MAX_WORD_LEN >= term.f_len);
3387 memcpy(term.f_str, query->cur_node->term.ptr->str, term.f_len);
3388 } else {
3389 term.f_len = word->f_len;
3390 ut_ad(FTS_MAX_WORD_LEN >= word->f_len);
3391 memcpy(term.f_str, word->f_str, word->f_len);
3392 }
3393
3394 /* Lookup the word in our rb tree, it must exist. */
3395 ret = rbt_search(query->word_freqs, &parent, &term);
3396
3397 ut_a(ret == 0);
3398
3399 word_freq = rbt_value(fts_word_freq_t, parent.last);
3400
3401 /* Start from 1 since the first column has been read by the caller.
3402 Also, we rely on the order of the columns projected, to filter
3403 out ilists that are out of range and we always want to read
3404 the doc_count irrespective of the suitability of the row. */
3405
3406 for (i = 1; exp && !skip; exp = que_node_get_next(exp), ++i) {
3407
3408 dfield_t* dfield = que_node_get_val(exp);
3409 byte* data = static_cast<byte*>(
3410 dfield_get_data(dfield));
3411 ulint len = dfield_get_len(dfield);
3412
3413 ut_a(len != UNIV_SQL_NULL);
3414
3415 /* Note: The column numbers below must match the SELECT. */
3416
3417 switch (i) {
3418 case 1: /* DOC_COUNT */
3419 word_freq->doc_count += mach_read_from_4(data);
3420 break;
3421
3422 case 2: /* FIRST_DOC_ID */
3423 node.first_doc_id = fts_read_doc_id(data);
3424
3425 /* Skip nodes whose doc ids are out range. */
3426 if (query->oper == FTS_EXIST
3427 && query->upper_doc_id > 0
3428 && node.first_doc_id > query->upper_doc_id) {
3429 skip = TRUE;
3430 }
3431 break;
3432
3433 case 3: /* LAST_DOC_ID */
3434 node.last_doc_id = fts_read_doc_id(data);
3435
3436 /* Skip nodes whose doc ids are out range. */
3437 if (query->oper == FTS_EXIST
3438 && query->lower_doc_id > 0
3439 && node.last_doc_id < query->lower_doc_id) {
3440 skip = TRUE;
3441 }
3442 break;
3443
3444 case 4: /* ILIST */
3445
3446 error = fts_query_filter_doc_ids(
3447 query, &word_freq->word, word_freq,
3448 &node, data, len, FALSE);
3449
3450 break;
3451
3452 default:
3453 ut_error;
3454 }
3455 }
3456
3457 if (!skip) {
3458 /* Make sure all columns were read. */
3459
3460 ut_a(i == 5);
3461 }
3462
3463 return error;
3464 }
3465
3466 /*****************************************************************//**
3467 Callback function to fetch the rows in an FTS INDEX record.
3468 @return always returns TRUE */
3469 static
3470 ibool
3471 fts_query_index_fetch_nodes(
3472 /*========================*/
3473 void* row, /*!< in: sel_node_t* */
3474 void* user_arg) /*!< in: pointer to fts_fetch_t */
3475 {
3476 fts_string_t key;
3477 sel_node_t* sel_node = static_cast<sel_node_t*>(row);
3478 fts_fetch_t* fetch = static_cast<fts_fetch_t*>(user_arg);
3479 fts_query_t* query = static_cast<fts_query_t*>(fetch->read_arg);
3480 que_node_t* exp = sel_node->select_list;
3481 dfield_t* dfield = que_node_get_val(exp);
3482 void* data = dfield_get_data(dfield);
3483 ulint dfield_len = dfield_get_len(dfield);
3484
3485 key.f_str = static_cast<byte*>(data);
3486 key.f_len = dfield_len;
3487
3488 ut_a(dfield_len <= FTS_MAX_WORD_LEN);
3489
3490 /* Note: we pass error out by 'query->error' */
3491 query->error = fts_query_read_node(query, &key, que_node_get_next(exp));
3492
3493 if (query->error != DB_SUCCESS) {
3494 ut_ad(query->error == DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
3495 return(FALSE);
3496 } else {
3497 return(TRUE);
3498 }
3499 }
3500
3501 /*****************************************************************//**
3502 Calculate the inverse document frequency (IDF) for all the terms. */
3503 static
3504 void
3505 fts_query_calculate_idf(
3506 /*====================*/
3507 fts_query_t* query) /*!< in: Query state */
3508 {
3509 const ib_rbt_node_t* node;
3510 ib_uint64_t total_docs = query->total_docs;
3511
3512 /* Iterate over all the search terms and calculate the inverse
3513 document frequency (IDF) for each of them. */
3514 for (node = rbt_first(query->word_freqs);
3515 node;
3516 node = rbt_next(query->word_freqs, node)) {
3517
3518 fts_word_freq_t* word_freq;
3519
3520 word_freq = rbt_value(fts_word_freq_t, node);
3521
3522 if (word_freq->doc_count > 0) {
3523 if (total_docs == word_freq->doc_count) {
3524 /* The ranking algorithm expects a rank > 0 when we find
3525 a match. Since log10(1) = 0, we cannot
3526 let the IDF become zero when a word appears in
3527 every document. So make
3528 it an arbitrarily small positive number. */
3529 word_freq->idf = log10(1.0001);
3530 } else {
3531 word_freq->idf = log10(
3532 total_docs
3533 / (double) word_freq->doc_count);
3534 }
3535 }
3536
3537 if (fts_enable_diag_print) {
3538 ib::info() << "'" << word_freq->word.f_str << "' -> "
3539 << query->total_docs << "/"
3540 << word_freq->doc_count << " "
3541 << std::setw(6) << std::setprecision(5)
3542 << word_freq->idf;
3543 }
3544 }
3545 }
3546
3547 /*****************************************************************//**
3548 Calculate the ranking of the document. */
3549 static
3550 void
3551 fts_query_calculate_ranking(
3552 /*========================*/
3553 const fts_query_t* query, /*!< in: query state */
3554 fts_ranking_t* ranking) /*!< in: Document to rank */
3555 {
3556 ulint pos = 0;
3557 fts_string_t word;
3558
3559 /* At this stage, ranking->rank should not exceed the 1.0
3560 bound */
3561 ut_ad(ranking->rank <= 1.0 && ranking->rank >= -1.0);
3562 ut_ad(rbt_size(query->word_map) == query->word_vector->size());
3563
3564 while (fts_ranking_words_get_next(query, ranking, &pos, &word)) {
3565 int ret;
3566 ib_rbt_bound_t parent;
3567 double weight;
3568 fts_doc_freq_t* doc_freq;
3569 fts_word_freq_t* word_freq;
3570
3571 ret = rbt_search(query->word_freqs, &parent, &word);
3572
3573 /* It must exist. */
3574 ut_a(ret == 0);
3575
3576 word_freq = rbt_value(fts_word_freq_t, parent.last);
3577
3578 ret = rbt_search(
3579 word_freq->doc_freqs, &parent, &ranking->doc_id);
3580
3581 /* It must exist. */
3582 ut_a(ret == 0);
3583
3584 doc_freq = rbt_value(fts_doc_freq_t, parent.last);
3585
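/* Contribution of this word to the rank: term frequency in the
document multiplied by the squared inverse document frequency,
i.e. freq * idf * idf. */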
3586 weight = (double) doc_freq->freq * word_freq->idf;
3587
3588 ranking->rank += (fts_rank_t) (weight * word_freq->idf);
3589 }
3590 }
3591
3592 /*****************************************************************//**
3593 Add ranking to the result set. */
3594 static
3595 void
3596 fts_query_add_ranking(
3597 /*==================*/
3598 fts_query_t* query, /*!< in: query state */
3599 ib_rbt_t* ranking_tree, /*!< in: ranking tree */
3600 const fts_ranking_t* new_ranking) /*!< in: ranking of a document */
3601 {
3602 ib_rbt_bound_t parent;
3603
3604 /* Lookup the ranking in our rb tree and add if it doesn't exist. */
3605 if (rbt_search(ranking_tree, &parent, new_ranking) == 0) {
3606 fts_ranking_t* ranking;
3607
3608 ranking = rbt_value(fts_ranking_t, parent.last);
3609
3610 ranking->rank += new_ranking->rank;
3611
3612 ut_a(ranking->words == NULL);
3613 } else {
3614 rbt_add_node(ranking_tree, &parent, new_ranking);
3615
3616 query->total_size += SIZEOF_RBT_NODE_ADD
3617 + sizeof(fts_ranking_t);
3618 }
3619 }
3620
3621 /*****************************************************************//**
3622 Retrieve the FTS Relevance Ranking result for doc with doc_id
3623 @return the relevance ranking value, 0 if no ranking value
3624 present. */
3625 float
3626 fts_retrieve_ranking(
3627 /*=================*/
3628 fts_result_t* result, /*!< in: FTS result structure */
3629 doc_id_t doc_id) /*!< in: doc_id of the item to retrieve */
3630 {
3631 ib_rbt_bound_t parent;
3632 fts_ranking_t new_ranking;
3633
3634 DBUG_ENTER("fts_retrieve_ranking");
3635
3636 if (!result || !result->rankings_by_id) {
3637 DBUG_RETURN(0);
3638 }
3639
3640 new_ranking.doc_id = doc_id;
3641
3642 /* Lookup the ranking in our rb tree */
3643 if (rbt_search(result->rankings_by_id, &parent, &new_ranking) == 0) {
3644 fts_ranking_t* ranking;
3645
3646 ranking = rbt_value(fts_ranking_t, parent.last);
3647
3648 DBUG_RETURN(ranking->rank);
3649 }
3650
3651 DBUG_RETURN(0);
3652 }
3653
3654 /*****************************************************************//**
3655 Create the result and copy the data to it. */
3656 static
3657 fts_result_t*
3658 fts_query_prepare_result(
3659 /*=====================*/
3660 fts_query_t* query, /*!< in: Query state */
3661 fts_result_t* result) /*!< in: result this can contain
3662 data from a previous search on
3663 another FTS index */
3664 {
3665 const ib_rbt_node_t* node;
3666 bool result_is_null = false;
3667
3668 DBUG_ENTER("fts_query_prepare_result");
3669
3670 if (result == NULL) {
3671 result = static_cast<fts_result_t*>(
3672 ut_zalloc_nokey(sizeof(*result)));
3673
3674 result->rankings_by_id = rbt_create(
3675 sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
3676
3677 query->total_size += sizeof(fts_result_t) + SIZEOF_RBT_CREATE;
3678 result_is_null = true;
3679 }
3680
3681 if (query->flags == FTS_OPT_RANKING) {
3682 fts_word_freq_t* word_freq;
3683 ulint size = ib_vector_size(query->deleted->doc_ids);
3684 fts_update_t* array =
3685 (fts_update_t*) query->deleted->doc_ids->data;
3686
3687 node = rbt_first(query->word_freqs);
3688 ut_ad(node);
3689 word_freq = rbt_value(fts_word_freq_t, node);
3690
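/* The optimized ranking path handles a single search word, so
its doc_freqs tree already lists every matching document. */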
3691 for (node = rbt_first(word_freq->doc_freqs);
3692 node;
3693 node = rbt_next(word_freq->doc_freqs, node)) {
3694 fts_doc_freq_t* doc_freq;
3695 fts_ranking_t ranking;
3696
3697 doc_freq = rbt_value(fts_doc_freq_t, node);
3698
3699 /* Don't put deleted docs into result */
3700 if (fts_bsearch(array, 0, static_cast<int>(size),
3701 doc_freq->doc_id) >= 0) {
3702 /* one less matching doc count */
3703 --word_freq->doc_count;
3704 continue;
3705 }
3706
3707 ranking.doc_id = doc_freq->doc_id;
3708 ranking.rank = static_cast<fts_rank_t>(doc_freq->freq);
3709 ranking.words = NULL;
3710
3711 fts_query_add_ranking(query, result->rankings_by_id,
3712 &ranking);
3713
3714 if (query->total_size > fts_result_cache_limit) {
3715 query->error = DB_FTS_EXCEED_RESULT_CACHE_LIMIT;
3716 fts_query_free_result(result);
3717 DBUG_RETURN(NULL);
3718 }
3719 }
3720
3721 /* Calculate IDF only after we exclude the deleted items */
3722 fts_query_calculate_idf(query);
3723
3724 node = rbt_first(query->word_freqs);
3725 word_freq = rbt_value(fts_word_freq_t, node);
3726
3727 /* Calculate the ranking for each doc */
3728 for (node = rbt_first(result->rankings_by_id);
3729 node != NULL;
3730 node = rbt_next(result->rankings_by_id, node)) {
3731
3732 fts_ranking_t* ranking;
3733
3734 ranking = rbt_value(fts_ranking_t, node);
3735
3736 ranking->rank = static_cast<fts_rank_t>(
3737 ranking->rank * word_freq->idf * word_freq->idf);
3738 }
3739
3740 DBUG_RETURN(result);
3741 }
3742
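	/* General path: every qualifying doc id is already in query->doc_ids
	together with the words that matched it; compute the final rank for
	each document and copy it into the result. */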
3743 ut_a(rbt_size(query->doc_ids) > 0);
3744
3745 for (node = rbt_first(query->doc_ids);
3746 node;
3747 node = rbt_next(query->doc_ids, node)) {
3748
3749 fts_ranking_t* ranking;
3750
3751 ranking = rbt_value(fts_ranking_t, node);
3752 fts_query_calculate_ranking(query, ranking);
3753
3754 		// FIXME: I think we may require this information to improve the
3755 		// ranking of doc ids which have more word matches from
3756 		// different FTS indexes.
3757
3758 		/* We don't need these anymore; free the resources. */
3759 ranking->words = NULL;
3760
3761 if (!result_is_null) {
3762 fts_query_add_ranking(query, result->rankings_by_id, ranking);
3763
3764 if (query->total_size > fts_result_cache_limit) {
3765 query->error = DB_FTS_EXCEED_RESULT_CACHE_LIMIT;
3766 fts_query_free_result(result);
3767 DBUG_RETURN(NULL);
3768 }
3769 }
3770 }
3771
3772 if (result_is_null) {
3773 /* Use doc_ids directly */
3774 rbt_free(result->rankings_by_id);
3775 result->rankings_by_id = query->doc_ids;
3776 query->doc_ids = NULL;
3777 }
3778
3779 DBUG_RETURN(result);
3780 }
3781
3782 /*****************************************************************//**
3783 Get the result of the query. Calculate the similarity coefficient. */
3784 static
3785 fts_result_t*
3786 fts_query_get_result(
3787 /*=================*/
3788 fts_query_t* query, /*!< in: query instance */
3789 fts_result_t* result) /*!< in: result */
3790 {
3791 DBUG_ENTER("fts_query_get_result");
3792
3793 if (rbt_size(query->doc_ids) > 0 || query->flags == FTS_OPT_RANKING) {
3794 /* Copy the doc ids to the result. */
3795 result = fts_query_prepare_result(query, result);
3796 } else {
3797 /* Create an empty result instance. */
3798 result = static_cast<fts_result_t*>(
3799 ut_zalloc_nokey(sizeof(*result)));
3800 }
3801
3802 DBUG_RETURN(result);
3803 }
3804
3805 /*****************************************************************//**
3806 FTS Query free resources and reset. */
3807 static
3808 void
3809 fts_query_free(
3810 /*===========*/
3811 fts_query_t* query) /*!< in: query instance to free*/
3812 {
3813
3814 if (query->read_nodes_graph) {
3815 fts_que_graph_free(query->read_nodes_graph);
3816 }
3817
3818 if (query->root) {
3819 fts_ast_free_node(query->root);
3820 }
3821
3822 if (query->deleted) {
3823 fts_doc_ids_free(query->deleted);
3824 }
3825
3826 if (query->intersection) {
3827 fts_query_free_doc_ids(query, query->intersection);
3828 query->intersection = NULL;
3829 }
3830
3831 if (query->doc_ids) {
3832 fts_query_free_doc_ids(query, query->doc_ids);
3833 }
3834
3835 if (query->word_freqs) {
3836 const ib_rbt_node_t* node;
3837
3838 /* We need to free any instances of fts_doc_freq_t that we
3839 may have allocated. */
3840 for (node = rbt_first(query->word_freqs);
3841 node;
3842 node = rbt_next(query->word_freqs, node)) {
3843
3844 fts_word_freq_t* word_freq;
3845
3846 word_freq = rbt_value(fts_word_freq_t, node);
3847
3848 /* We need to cast away the const. */
3849 rbt_free(word_freq->doc_freqs);
3850 }
3851
3852 rbt_free(query->word_freqs);
3853 }
3854
3855 if (query->wildcard_words != NULL) {
3856 rbt_free(query->wildcard_words);
3857 }
3858
3859 ut_a(!query->intersection);
3860
3861 if (query->word_map) {
3862 rbt_free(query->word_map);
3863 }
3864
3865 if (query->word_vector != NULL) {
3866 UT_DELETE(query->word_vector);
3867 }
3868
3869 if (query->heap) {
3870 mem_heap_free(query->heap);
3871 }
3872
3873 memset(query, 0, sizeof(*query));
3874 }
3875
3876 /*****************************************************************//**
3877 Parse the query using flex/bison or plugin parser.
3878 @return parse tree node. */
3879 static
3880 fts_ast_node_t*
3881 fts_query_parse(
3882 /*============*/
3883 fts_query_t* query, /*!< in: query instance */
3884 byte* query_str, /*!< in: query string */
3885 ulint query_len) /*!< in: query string length */
3886 {
3887 int error;
3888 fts_ast_state_t state;
3889 bool mode = query->boolean_mode;
3890 DBUG_ENTER("fts_query_parse");
3891
3892 memset(&state, 0x0, sizeof(state));
3893
3894 state.charset = query->fts_index_table.charset;
3895
3896 DBUG_EXECUTE_IF("fts_instrument_query_disable_parser",
3897 query->parser = NULL;);
3898
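	/* A plugin parser attached to the index (if any) takes precedence;
	otherwise the built-in flex/bison parser is used, with the lexer
	chosen according to boolean vs. natural language mode. */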
3899 if (query->parser) {
3900 state.root = state.cur_node =
3901 fts_ast_create_node_list(&state, NULL);
3902 error = fts_parse_by_parser(mode, query_str, query_len,
3903 query->parser, &state);
3904 } else {
3905 		/* Set up the scanner to use; the choice depends on the mode flag. */
3906 state.lexer = fts_lexer_create(mode, query_str, query_len);
3907 state.charset = query->fts_index_table.charset;
3908 error = fts_parse(&state);
3909 fts_lexer_free(state.lexer);
3910 state.lexer = NULL;
3911 }
3912
3913 	/* Error during parsing? */
3914 if (error) {
3915 /* Free the nodes that were allocated during parsing. */
3916 fts_ast_state_free(&state);
3917 } else {
3918 query->root = state.root;
3919
3920 if (fts_enable_diag_print && query->root != NULL) {
3921 fts_ast_node_print(query->root);
3922 }
3923 }
3924
3925 DBUG_RETURN(state.root);
3926 }
3927
3928 /*******************************************************************//**
3929 FTS Query optimization
3930 Set FTS_OPT_RANKING if it is a simple term query */
3931 static
3932 void
3933 fts_query_can_optimize(
3934 /*===================*/
3935 fts_query_t* query, /*!< in/out: query instance */
3936 uint flags) /*!< In: FTS search mode */
3937 {
3938 fts_ast_node_t* node = query->root;
3939
3940 if (flags & FTS_EXPAND) {
3941 return;
3942 }
3943
3944 	/* Check if the query is just a single term with no operator */
3945 ut_ad(node->type == FTS_AST_LIST);
3946 node = node->list.head;
3947 if (node != NULL && node->type == FTS_AST_TERM && node->next == NULL) {
3948 query->flags = FTS_OPT_RANKING;
3949 }
3950 }
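/* For example (illustrative only), a query such as 'apple' (a single term
with no operator) qualifies for FTS_OPT_RANKING, whereas '+apple -pie' or
the phrase '"apple pie"' does not. */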
3951
3952
3953
3954 /** FTS Query entry point.
3955 @param[in] trx transaction
3956 @param[in] index fts index to search
3957 @param[in] flags FTS search mode
3958 @param[in] query_str FTS query
3959 @param[in] query_len FTS query string len in bytes
3960 @param[in,out] result result doc ids
3961 @param[in] limit limit value
3962 @return DB_SUCCESS if successful otherwise error code */
3963 dberr_t
3964 fts_query(
3965 trx_t* trx,
3966 dict_index_t* index,
3967 uint flags,
3968 const byte* query_str,
3969 ulint query_len,
3970 fts_result_t** result,
3971 ulonglong limit)
3972 {
3973 fts_query_t query;
3974 dberr_t error = DB_SUCCESS;
3975 byte* lc_query_str;
3976 ulint lc_query_str_len;
3977 ulint result_len;
3978 bool boolean_mode;
3979 trx_t* query_trx;
3980 CHARSET_INFO* charset;
3981 ib_time_monotonic_ms_t start_time_ms;
3982 bool will_be_ignored = false;
3983
3984 boolean_mode = flags & FTS_BOOL;
3985
3986 *result = NULL;
3987 memset(&query, 0x0, sizeof(query));
3988 query_trx = trx_allocate_for_background();
3989 query_trx->op_info = "FTS query";
3990
3991 start_time_ms = ut_time_monotonic_ms();
3992
3993 query.trx = query_trx;
3994 query.index = index;
3995 query.boolean_mode = boolean_mode;
3996 query.deleted = fts_doc_ids_create();
3997 query.cur_node = NULL;
3998
3999 query.fts_common_table.type = FTS_COMMON_TABLE;
4000 query.fts_common_table.table_id = index->table->id;
4001 query.fts_common_table.parent = index->table->name.m_name;
4002 query.fts_common_table.table = index->table;
4003
4004 charset = fts_index_get_charset(index);
4005
4006 query.fts_index_table.type = FTS_INDEX_TABLE;
4007 query.fts_index_table.index_id = index->id;
4008 query.fts_index_table.table_id = index->table->id;
4009 query.fts_index_table.parent = index->table->name.m_name;
4010 query.fts_index_table.charset = charset;
4011 query.fts_index_table.table = index->table;
4012
4013 query.word_map = rbt_create_arg_cmp(
4014 sizeof(fts_string_t), innobase_fts_text_cmp, charset);
4015 query.word_vector = UT_NEW_NOKEY(word_vector_t());
4016 query.error = DB_SUCCESS;
4017
4018 /* Setup the RB tree that will be used to collect per term
4019 statistics. */
4020 query.word_freqs = rbt_create_arg_cmp(
4021 sizeof(fts_word_freq_t), innobase_fts_text_cmp, charset);
4022
4023 if (flags & FTS_EXPAND) {
4024 query.wildcard_words = rbt_create_arg_cmp(
4025 sizeof(fts_string_t), innobase_fts_text_cmp, charset);
4026 }
4027
4028 query.total_size += SIZEOF_RBT_CREATE;
4029
4030 query.total_docs = dict_table_get_n_rows(index->table);
4031
4032 query.limit = limit;
4033
4034 query.n_docs = 0;
4035 #ifdef FTS_DOC_STATS_DEBUG
4036 	if (fts_enable_diag_print) {
4037 error = fts_get_total_word_count(
4038 trx, query.index, &query.total_words);
4039
4040 if (error != DB_SUCCESS) {
4041 goto func_exit;
4042 }
4043
4044 ib::info() << "Total docs: " << query.total_docs
4045 << " Total words: " << query.total_words;
4046 }
4047 #endif /* FTS_DOC_STATS_DEBUG */
4048
4049 query.fts_common_table.suffix = "DELETED";
4050
4051 /* Read the deleted doc_ids, we need these for filtering. */
4052 error = fts_table_fetch_doc_ids(
4053 NULL, &query.fts_common_table, query.deleted);
4054
4055 if (error != DB_SUCCESS) {
4056 goto func_exit;
4057 }
4058
4059 query.fts_common_table.suffix = "DELETED_CACHE";
4060
4061 error = fts_table_fetch_doc_ids(
4062 NULL, &query.fts_common_table, query.deleted);
4063
4064 if (error != DB_SUCCESS) {
4065 goto func_exit;
4066 }
4067
4068 /* Get the deleted doc ids that are in the cache. */
4069 fts_cache_append_deleted_doc_ids(
4070 index->table->fts->cache, query.deleted->doc_ids);
4071 DEBUG_SYNC_C("fts_deleted_doc_ids_append");
4072
4073 /* Sort the vector so that we can do a binary search over the ids. */
4074 ib_vector_sort(query.deleted->doc_ids, fts_update_doc_id_cmp);
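	/* At this point query.deleted holds the doc ids read from the
	DELETED and DELETED_CACHE auxiliary tables plus those still pending
	in the in-memory cache, sorted so that matches can later be filtered
	out with a binary search. */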
4075
4076 	/* Convert the query string to lower case before parsing. We own
4077 	the ut_malloc'ed buffer and must remember to free it before returning. */
4078
4079 lc_query_str_len = query_len * charset->casedn_multiply + 1;
4080 lc_query_str = static_cast<byte*>(ut_malloc_nokey(lc_query_str_len));
4081
4082 /* For binary collations, a case sensitive search is
4083 performed. Hence don't convert to lower case. */
4084 if (my_binary_compare(charset)) {
4085 memcpy(lc_query_str, query_str, query_len);
4086 lc_query_str[query_len]= 0;
4087 result_len= query_len;
4088 } else {
4089 result_len = innobase_fts_casedn_str(
4090 charset, (char*)( query_str), query_len,
4091 (char*)(lc_query_str), lc_query_str_len);
4092 }
4093
4094 ut_ad(result_len < lc_query_str_len);
4095
4096 lc_query_str[result_len] = 0;
4097
4098 query.heap = mem_heap_create(128);
4099
4100 /* Create the rb tree for the doc id (current) set. */
4101 query.doc_ids = rbt_create(
4102 sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
4103 query.parser = index->parser;
4104
4105 query.total_size += SIZEOF_RBT_CREATE;
4106
4107 /* Parse the input query string. */
4108 if (fts_query_parse(&query, lc_query_str, result_len)) {
4109 fts_ast_node_t* ast = query.root;
4110 ast->trx = trx;
4111
4112 /* Optimize query to check if it's a single term */
4113 fts_query_can_optimize(&query, flags);
4114
4115 DBUG_EXECUTE_IF("fts_instrument_result_cache_limit",
4116 fts_result_cache_limit = 2048;
4117 );
4118
4119 		/* Optimisation using the limit value is allowed only
4120 		when
4121 		i) no ranking is involved, and
4122 		ii) only FTS union operations are involved. */
4123 if (query.limit != ULONG_UNDEFINED
4124 && !fts_ast_node_check_union(ast)) {
4125 query.limit = ULONG_UNDEFINED;
4126 }
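		/* In other words, the caller-supplied limit survives only
		when the parse tree consists purely of union operations
		(per fts_ast_node_check_union()); for any other tree it is
		reset to ULONG_UNDEFINED above, i.e. no limit is pushed
		down into the evaluation. */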
4127
4128 DBUG_EXECUTE_IF("fts_union_limit_off",
4129 query.limit = ULONG_UNDEFINED;
4130 );
4131
4132 /* Traverse the Abstract Syntax Tree (AST) and execute
4133 the query. */
4134 query.error = fts_ast_visit(
4135 FTS_NONE, ast, fts_query_visitor,
4136 &query, &will_be_ignored);
4137 if (query.error == DB_INTERRUPTED) {
4138 error = DB_INTERRUPTED;
4139 ut_free(lc_query_str);
4140 goto func_exit;
4141 }
4142
4143 /* If query expansion is requested, extend the search
4144 with first search pass result */
4145 if (query.error == DB_SUCCESS && (flags & FTS_EXPAND)) {
4146 query.error = fts_expand_query(index, &query);
4147 }
4148
4149 /* Calculate the inverse document frequency of the terms. */
4150 if (query.error == DB_SUCCESS
4151 && query.flags != FTS_OPT_RANKING) {
4152 fts_query_calculate_idf(&query);
4153 }
4154
4155 /* Copy the result from the query state, so that we can
4156 return it to the caller. */
4157 if (query.error == DB_SUCCESS) {
4158 *result = fts_query_get_result(&query, *result);
4159 }
4160
4161 error = query.error;
4162 } else {
4163 /* still return an empty result set */
4164 *result = static_cast<fts_result_t*>(
4165 ut_zalloc_nokey(sizeof(**result)));
4166 }
4167
4168 if (trx_is_interrupted(trx)) {
4169 error = DB_INTERRUPTED;
4170 ut_free(lc_query_str);
4171 if (result != NULL) {
4172 fts_query_free_result(*result);
4173 }
4174 goto func_exit;
4175 }
4176
4177 ut_free(lc_query_str);
4178
4179 if (fts_enable_diag_print && (*result)) {
4180 uint64_t diff_time = ut_time_monotonic_ms() - start_time_ms;
4181
4182 ib::info() << "FTS Search Processing time: "
4183 << diff_time / 1000 << " secs: " << diff_time % 1000
4184 << " millisec: row(s) "
4185 << ((*result)->rankings_by_id
4186 ? rbt_size((*result)->rankings_by_id)
4187 : -1);
4188
4189 /* Log memory consumption & result size */
4190 ib::info() << "Full Search Memory: " << query.total_size
4191 << " (bytes), Row: "
4192 << ((*result)->rankings_by_id
4193 ? rbt_size((*result)->rankings_by_id)
4194 : 0)
4195 << ".";
4196 }
4197
4198 func_exit:
4199 fts_query_free(&query);
4200
4201 trx_free_for_background(query_trx);
4202
4203 return(error);
4204 }
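/* Illustrative call sequence only; error handling and the origin of trx,
index, flags and the query string are assumed, not taken from this file:

	fts_result_t*	result = NULL;

	dberr_t	err = fts_query(trx, index, flags, query_str, query_len,
				&result, ULONG_UNDEFINED);

	if (err == DB_SUCCESS && result != NULL
	    && result->rankings_by_id != NULL) {
		fts_query_sort_result_on_rank(result);
		// iterate result->rankings_by_rank here
	}

	fts_query_free_result(result);
*/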
4205
4206 /*****************************************************************//**
4207 FTS Query free result, returned by fts_query(). */
4208 void
4209 fts_query_free_result(
4210 /*==================*/
4211 fts_result_t* result) /*!< in: result instance to free.*/
4212 {
4213 if (result) {
4214 if (result->rankings_by_id != NULL) {
4215 rbt_free(result->rankings_by_id);
4216 result->rankings_by_id = NULL;
4217 }
4218 if (result->rankings_by_rank != NULL) {
4219 rbt_free(result->rankings_by_rank);
4220 result->rankings_by_rank = NULL;
4221 }
4222
4223 ut_free(result);
4224 result = NULL;
4225 }
4226 }
4227
4228 /*****************************************************************//**
4229 FTS Query: sort the result returned by fts_query() on fts_ranking_t::rank. */
4230 void
4231 fts_query_sort_result_on_rank(
4232 /*==========================*/
4233 fts_result_t* result) /*!< out: result instance to sort.*/
4234 {
4235 const ib_rbt_node_t* node;
4236 ib_rbt_t* ranked;
4237
4238 ut_a(result->rankings_by_id != NULL);
4239 if (result->rankings_by_rank) {
4240 rbt_free(result->rankings_by_rank);
4241 }
4242
4243 ranked = rbt_create(sizeof(fts_ranking_t), fts_query_compare_rank);
4244
4245 	/* Copy each ranking into a new tree that is sorted on
4246 	fts_ranking_t::rank. */
4247 for (node = rbt_first(result->rankings_by_id);
4248 node;
4249 node = rbt_next(result->rankings_by_id, node)) {
4250
4251 fts_ranking_t* ranking;
4252
4253 ranking = rbt_value(fts_ranking_t, node);
4254
4255 ut_a(ranking->words == NULL);
4256
4257 rbt_insert(ranked, ranking, ranking);
4258 }
4259
4260 /* Reset the current node too. */
4261 result->current = NULL;
4262 result->rankings_by_rank = ranked;
4263 }
4264
4265 /*******************************************************************//**
4266 A debug function to print result doc_id set. */
4267 static
4268 void
4269 fts_print_doc_id(
4270 /*=============*/
4271 fts_query_t* query) /*!< in : tree that stores doc_ids.*/
4272 {
4273 const ib_rbt_node_t* node;
4274
4275 /* Iterate each member of the doc_id set */
4276 for (node = rbt_first(query->doc_ids);
4277 node;
4278 node = rbt_next(query->doc_ids, node)) {
4279 fts_ranking_t* ranking;
4280 ranking = rbt_value(fts_ranking_t, node);
4281
4282 ib::info() << "doc_ids info, doc_id: " << ranking->doc_id;
4283
4284 ulint pos = 0;
4285 fts_string_t word;
4286
4287 while (fts_ranking_words_get_next(query, ranking, &pos, &word)) {
4288 ib::info() << "doc_ids info, value: " << word.f_str;
4289 }
4290 }
4291 }
4292
4293 /*************************************************************//**
4294 This function implements a simple "blind" query expansion search:
4295 words in documents found in the first search pass are used as search
4296 arguments to search the table again, thus "expanding" the search
4297 result set.
4298 @return DB_SUCCESS if success, otherwise the error code */
4299 static MY_ATTRIBUTE((nonnull, warn_unused_result))
4300 dberr_t
4301 fts_expand_query(
4302 /*=============*/
4303 dict_index_t* index, /*!< in: FTS index to search */
4304 fts_query_t* query) /*!< in: FTS query instance */
4305 {
4306 const ib_rbt_node_t* node;
4307 const ib_rbt_node_t* token_node;
4308 fts_doc_t result_doc;
4309 dberr_t error = DB_SUCCESS;
4310 const fts_index_cache_t*index_cache;
4311
4312 /* If no doc is found in first search pass, return */
4313 if (!rbt_size(query->doc_ids)) {
4314 return(error);
4315 }
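	/* Blind expansion example (illustrative only): if the first pass for
	'database' matched documents that also contain 'innodb' and 'mysql',
	those words are tokenized below and unioned into a second search, so
	documents mentioning only 'innodb' or 'mysql' can also be returned. */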
4316
4317 /* Init "result_doc", to hold words from the first search pass */
4318 fts_doc_init(&result_doc);
4319
4320 rw_lock_x_lock(&index->table->fts->cache->lock);
4321 index_cache = fts_find_index_cache(index->table->fts->cache, index);
4322 rw_lock_x_unlock(&index->table->fts->cache->lock);
4323
4324 ut_a(index_cache);
4325
4326 result_doc.tokens = rbt_create_arg_cmp(
4327 sizeof(fts_token_t), innobase_fts_text_cmp,
4328 index_cache->charset);
4329
4330 result_doc.charset = index_cache->charset;
4331 result_doc.parser = index_cache->index->parser;
4332 result_doc.is_ngram = index_cache->index->is_ngram;
4333
4334 query->total_size += SIZEOF_RBT_CREATE;
4335
4336 if (fts_enable_diag_print) {
4337 fts_print_doc_id(query);
4338 }
4339
4340 for (node = rbt_first(query->doc_ids);
4341 node;
4342 node = rbt_next(query->doc_ids, node)) {
4343
4344 fts_ranking_t* ranking;
4345 ulint prev_token_size;
4346 ulint estimate_size;
4347
4348 prev_token_size = rbt_size(result_doc.tokens);
4349
4350 ranking = rbt_value(fts_ranking_t, node);
4351
4352 		/* Fetch the documents with the doc_id from the
4353 		result of the first search pass. Since we do not
4354 		store a document-to-word mapping, we need to
4355 		fetch the original document and parse it.
4356 		Future optimization could be done here if we
4357 		support some form of document-to-word mapping. */
4358 fts_doc_fetch_by_doc_id(NULL, ranking->doc_id, index,
4359 FTS_FETCH_DOC_BY_ID_EQUAL,
4360 fts_query_expansion_fetch_doc,
4361 &result_doc);
4362
4363 /* Estimate memory used, see fts_process_token and fts_token_t.
4364 We ignore token size here. */
4365 estimate_size = (rbt_size(result_doc.tokens) - prev_token_size)
4366 * (SIZEOF_RBT_NODE_ADD + sizeof(fts_token_t)
4367 + sizeof(ib_vector_t) + sizeof(ulint) * 32);
4368 query->total_size += estimate_size;
4369
4370 if (query->total_size > fts_result_cache_limit) {
4371 error = DB_FTS_EXCEED_RESULT_CACHE_LIMIT;
4372 goto func_exit;
4373 }
4374 }
4375
4376 /* Remove words that have already been searched in the first pass */
4377 for (ulint i = 0; i < query->word_vector->size(); i++) {
4378 fts_string_t word = query->word_vector->at(i);
4379 ib_rbt_bound_t parent;
4380
4381 if (query->wildcard_words
4382 && rbt_search(query->wildcard_words, &parent, &word) == 0) {
4383 /* If it's a wildcard word, remove words having
4384 it as prefix. */
4385 while (rbt_search_cmp(result_doc.tokens,
4386 &parent, &word, NULL,
4387 innobase_fts_text_cmp_prefix)
4388 == 0) {
4389 ut_free(rbt_remove_node(result_doc.tokens,
4390 parent.last));
4391 }
4392 } else {
4393 /* We don't check return value, because the word may
4394 have been deleted by a previous wildcard word as its
4395 prefix, e.g. ('g * good'). */
4396 rbt_delete(result_doc.tokens, &word);
4397 }
4398 }
4399
4400 /* Search the table the second time with expanded search list */
4401 for (token_node = rbt_first(result_doc.tokens);
4402 token_node;
4403 token_node = rbt_next(result_doc.tokens, token_node)) {
4404 fts_token_t* mytoken;
4405 mytoken = rbt_value(fts_token_t, token_node);
4406
4407 		/* A '%' at the end is treated as a prefix search; it
4408 		can cause an assertion failure, so we skip it. */
4409 if (mytoken->text.f_str[mytoken->text.f_len - 1] == '%') {
4410 continue;
4411 }
4412
4413 ut_ad(mytoken->text.f_str[mytoken->text.f_len] == 0);
4414 fts_query_add_word_freq(query, &mytoken->text);
4415 error = fts_query_union(query, &mytoken->text);
4416
4417 if (error != DB_SUCCESS) {
4418 break;
4419 }
4420 }
4421
4422 func_exit:
4423 fts_doc_free(&result_doc);
4424
4425 return(error);
4426 }
4427 /*************************************************************//**
4428 This function finds documents that contain all words of a
4429 phrase or proximity search. For a proximity search it also verifies
4430 that the words are close enough to each other, within the specified
4431 distance. This function is called for phrase and proximity search.
4432 @return TRUE if documents are found, FALSE otherwise */
4433 static
4434 ibool
4435 fts_phrase_or_proximity_search(
4436 /*===========================*/
4437 fts_query_t* query, /*!< in/out: query instance.
4438 query->doc_ids might be instantiated
4439 with qualified doc IDs */
4440 ib_vector_t* tokens) /*!< in: Tokens contain words */
4441 {
4442 ulint n_matched;
4443 ulint i;
4444 ibool matched = FALSE;
4445 ulint num_token = ib_vector_size(tokens);
4446 fts_match_t* match[MAX_PROXIMITY_ITEM];
4447 ibool end_list = FALSE;
4448
4449 /* Number of matched documents for the first token */
4450 n_matched = ib_vector_size(query->match_array[0]);
4451
4452 	/* We have a match list for each word; walk through the
4453 	lists and find the common documents that contain all the
4454 	matching words. */
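	/* For example, for the phrase "apple pie", match_array[0] holds the
	doc ids (and positions) for "apple" and match_array[1] those for
	"pie"; only doc ids present in every list survive, and for a
	proximity search the recorded positions are then checked against the
	requested distance. */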
4455 for (i = 0; i < n_matched; i++) {
4456 ulint j;
4457 ulint k = 0;
4458 fts_proximity_t qualified_pos;
4459
4460 match[0] = static_cast<fts_match_t*>(
4461 ib_vector_get(query->match_array[0], i));
4462
4463 		/* For each remaining token's match list, check
4464 		whether it contains a document with the same
4465 		doc id */
4466 for (j = 1; j < num_token; j++) {
4467 match[j] = static_cast<fts_match_t*>(
4468 ib_vector_get(query->match_array[j], k));
4469
4470 while (match[j]->doc_id < match[0]->doc_id
4471 && k < ib_vector_size(query->match_array[j])) {
4472 match[j] = static_cast<fts_match_t*>(
4473 ib_vector_get(
4474 query->match_array[j], k));
4475 k++;
4476 }
4477
4478 if (match[j]->doc_id > match[0]->doc_id) {
4479 /* no match */
4480 if (query->flags & FTS_PHRASE) {
4481 match[0]->doc_id = 0;
4482 }
4483 break;
4484 }
4485
4486 if (k == ib_vector_size(query->match_array[j])) {
4487 end_list = TRUE;
4488
4489 if (query->flags & FTS_PHRASE) {
4490 ulint s;
4491 				/* match_array[j] is exhausted, so no
4492 				entry of match_array[0] after position i
4493 				can match; mark those entries as no-match. */
4494 fts_match_t* match_temp;
4495 for (s = i + 1; s < n_matched; s++) {
4496 match_temp = static_cast<
4497 fts_match_t*>(ib_vector_get(
4498 query->match_array[0], s));
4499 match_temp->doc_id = 0;
4500 }
4501
4502 if (match[j]->doc_id !=
4503 match[0]->doc_id) {
4504 /* no match */
4505 match[0]->doc_id = 0;
4506 }
4507 }
4508
4509 if (match[j]->doc_id != match[0]->doc_id) {
4510 goto func_exit;
4511 }
4512 }
4513
4514 			/* FIXME: A better solution would be a counter array
4515 			that remembers each list's last position, so we don't
4516 			reset k here every time. */
4517 k = 0;
4518 }
4519
4520 if (j != num_token) {
4521 continue;
4522 }
4523
4524 /* For this matching doc, we need to further
4525 verify whether the words in the doc are close
4526 to each other, and within the distance specified
4527 in the proximity search */
4528 if (query->flags & FTS_PHRASE) {
4529 matched = TRUE;
4530 } else if (fts_proximity_get_positions(
4531 match, num_token, ULINT_MAX, &qualified_pos)) {
4532
4533 			/* Fetch the original documents and count the
4534 			words between the matching words to check that they
4535 			are within the specified distance */
4536 if (fts_query_is_in_proximity_range(
4537 query, match, &qualified_pos)) {
4538 /* If so, mark we find a matching doc */
4539 query->error = fts_query_process_doc_id(
4540 query, match[0]->doc_id, 0);
4541 if (query->error != DB_SUCCESS) {
4542 matched = FALSE;
4543 goto func_exit;
4544 }
4545
4546 matched = TRUE;
4547 for (ulint z = 0; z < num_token; z++) {
4548 fts_string_t* token;
4549 token = static_cast<fts_string_t*>(
4550 ib_vector_get(tokens, z));
4551 fts_query_add_word_to_document(
4552 query, match[0]->doc_id, token);
4553 }
4554 }
4555 }
4556
4557 if (end_list) {
4558 break;
4559 }
4560 }
4561
4562 func_exit:
4563 return(matched);
4564 }
4565
4566 /*************************************************************//**
4567 This function checks whether words in result documents are close to
4568 each other (within proximity range as specified by "distance").
4569 If "distance" is ULINT_MAX, then it will find all combinations of
4570 positions of matching words and store the min and max positions
4571 in "qualified_pos" for later verification.
4572 @return true if words are close to each other, false otherwise */
4573 static
4574 bool
4575 fts_proximity_get_positions(
4576 /*========================*/
4577 	fts_match_t**		match,		/*!< in: array of matching items */
4578 ulint num_match, /*!< in: number of matching
4579 items */
4580 ulint distance, /*!< in: distance value
4581 for proximity search */
4582 fts_proximity_t* qualified_pos) /*!< out: the position info
4583 records ranges containing
4584 all matching words. */
4585 {
4586 ulint i;
4587 ulint idx[MAX_PROXIMITY_ITEM];
4588 ulint num_pos[MAX_PROXIMITY_ITEM];
4589 ulint min_idx;
4590
4591 qualified_pos->n_pos = 0;
4592
4593 ut_a(num_match <= MAX_PROXIMITY_ITEM);
4594
4595 	/* Each word could appear multiple times in a doc. So
4596 	we need to walk through each word's position list, and find
4597 	the closest distance between the different words to see if
4598 	they are within the proximity distance. */
4599
4600 	/* Since each word's position list is sorted, we simply do a
4601 	walk through all the words' lists, similar to the merge
4602 	phase of a merge sort */
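	/* Worked example: with positions {3, 20} for word A, {5, 17} for
	word B and distance 4, the walk first sees (3, 5), range 2 <= 4, so
	(3, 5) is recorded; advancing A gives (20, 5), range 15, rejected;
	advancing B gives (20, 17), range 3 <= 4, so (17, 20) is recorded. */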
4603 for (i = 0; i < num_match; i++) {
4604 /* idx is the current position we are checking
4605 for a particular word */
4606 idx[i] = 0;
4607
4608 /* Number of positions for this word */
4609 num_pos[i] = ib_vector_size(match[i]->positions);
4610 }
4611
4612 /* Start with the first word */
4613 min_idx = 0;
4614
4615 while (idx[min_idx] < num_pos[min_idx]) {
4616 ulint position[MAX_PROXIMITY_ITEM];
4617 ulint min_pos = ULINT_MAX;
4618 ulint max_pos = 0;
4619
4620 /* Check positions in each word position list, and
4621 record the max/min position */
4622 for (i = 0; i < num_match; i++) {
4623 position[i] = *(ulint*) ib_vector_get_const(
4624 match[i]->positions, idx[i]);
4625
4626 if (position[i] == ULINT_UNDEFINED) {
4627 break;
4628 }
4629
4630 if (position[i] < min_pos) {
4631 min_pos = position[i];
4632 min_idx = i;
4633 }
4634
4635 if (position[i] > max_pos) {
4636 max_pos = position[i];
4637 }
4638 }
4639
4640 /* If max and min position are within range, we
4641 find a good match */
4642 if (max_pos - min_pos <= distance
4643 && (i >= num_match || position[i] != ULINT_UNDEFINED)) {
4644 /* The charset has variable character
4645 length encoding, record the min_pos and
4646 max_pos, we will need to verify the actual
4647 number of characters */
4648 qualified_pos->min_pos.push_back(min_pos);
4649 qualified_pos->max_pos.push_back(max_pos);
4650 qualified_pos->n_pos++;
4651 }
4652
4653 		/* Otherwise, move to the next position in the
4654 		list for the word with the smallest position */
4655 idx[min_idx]++;
4656 }
4657
4658 return(qualified_pos->n_pos != 0);
4659 }
4660