1 /*****************************************************************************
2
3 Copyright (c) 2007, 2020, Oracle and/or its affiliates. All Rights Reserved.
4 Copyright (c) 2017, 2021, MariaDB Corporation.
5
6 This program is free software; you can redistribute it and/or modify it under
7 the terms of the GNU General Public License as published by the Free Software
8 Foundation; version 2 of the License.
9
10 This program is distributed in the hope that it will be useful, but WITHOUT
11 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License along with
15 this program; if not, write to the Free Software Foundation, Inc.,
16 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
17
18 *****************************************************************************/
19
20 /**************************************************//**
21 @file fts/fts0que.cc
22 Full Text Search functionality.
23
24 Created 2007/03/27 Sunny Bains
25 Completed 2011/7/10 Sunny and Jimmy Yang
26 *******************************************************/
27
28 #include "dict0dict.h"
29 #include "ut0rbt.h"
30 #include "row0sel.h"
31 #include "fts0fts.h"
32 #include "fts0priv.h"
33 #include "fts0ast.h"
34 #include "fts0pars.h"
35 #include "fts0types.h"
36 #include "fts0plugin.h"
37 #include "fts0vlc.h"
38
39 #include <iomanip>
40 #include <vector>
41
42 #define FTS_ELEM(t, n, i, j) (t[(i) * n + (j)])
43
44 #define RANK_DOWNGRADE (-1.0F)
45 #define RANK_UPGRADE (1.0F)
46
47 /* Maximum number of words supported in a phrase or proximity search. */
48 #define MAX_PROXIMITY_ITEM 128
49
50 /* Memory used by rbt itself for create and node add */
51 #define SIZEOF_RBT_CREATE sizeof(ib_rbt_t) + sizeof(ib_rbt_node_t) * 2
52 #define SIZEOF_RBT_NODE_ADD sizeof(ib_rbt_node_t)
53
54 /*Initial byte length for 'words' in fts_ranking_t */
55 #define RANKING_WORDS_INIT_LEN 4
56
57 // FIXME: Need to have a generic iterator that traverses the ilist.
58
59 typedef std::vector<fts_string_t, ut_allocator<fts_string_t> > word_vector_t;
60
61 struct fts_word_freq_t;
62
63 /** State of an FTS query. */
64 struct fts_query_t {
65 mem_heap_t* heap; /*!< Heap to use for allocations */
66
67 trx_t* trx; /*!< The query transaction */
68
69 dict_index_t* index; /*!< The FTS index to search */
70 /*!< FTS auxiliary common table def */
71
72 fts_table_t fts_common_table;
73
74 fts_table_t fts_index_table;/*!< FTS auxiliary index table def */
75
76 size_t total_size; /*!< total memory size used by query */
77
78 fts_doc_ids_t* deleted; /*!< Deleted doc ids that need to be
79 filtered from the output */
80
81 fts_ast_node_t* root; /*!< Abstract syntax tree */
82
83 fts_ast_node_t* cur_node; /*!< Current tree node */
84
85 ib_rbt_t* word_map; /*!< Matched word map for
86 searching by word*/
87
88 word_vector_t* word_vector; /*!< Matched word vector for
89 searching by index */
90
91 ib_rbt_t* doc_ids; /*!< The current set of matching
92 doc ids, elements are of
93 type fts_ranking_t */
94
95 ib_rbt_t* intersection; /*!< The doc ids that were found in
96 doc_ids, this tree will become
97 the new doc_ids, elements are of type
98 fts_ranking_t */
99
100 /*!< Prepared statement to read the
101 nodes from the FTS INDEX */
102 que_t* read_nodes_graph;
103
104 fts_ast_oper_t oper; /*!< Current boolean mode operator */
105
106 /*!< TRUE if we want to collect the
107 word positions within the document */
108 ibool collect_positions;
109
110 ulint flags; /*!< Specify the full text search type,
111 such as boolean search, phrase
112 search, proximity search etc. */
113
114 ulint distance; /*!< The proximity distance of a
115 phrase search. */
116
117 /*!< These doc ids are used as a
118 boundary condition when searching the
119 FTS index rows */
120
121 doc_id_t lower_doc_id; /*!< Lowest doc id in doc_ids */
122
123 doc_id_t upper_doc_id; /*!< Highest doc id in doc_ids */
124
125 bool boolean_mode; /*!< TRUE if boolean mode query */
126
127 ib_vector_t* matched; /*!< Array of matching documents
128 (fts_match_t) to search for a phrase */
129
130 ib_vector_t** match_array; /*!< Used for proximity search, contains
131 position info for each matched word
132 in the word list */
133
134 ib_uint64_t total_docs; /*!< The total number of documents */
135
136 ulint total_words; /*!< The total number of words */
137
138 dberr_t error; /*!< Error code if any, that is
139 encountered during query processing */
140
141 ib_rbt_t* word_freqs; /*!< RB tree of word frequencies per
142 document, its elements are of type
143 fts_word_freq_t */
144
145 ib_rbt_t* wildcard_words; /*!< words with wildcard */
146
147 bool multi_exist; /*!< multiple FTS_EXIST oper */
148 byte visiting_sub_exp; /*!< count of nested
149 fts_ast_visit_sub_exp() */
150
151 st_mysql_ftparser* parser; /*!< fts plugin parser */
152 };
153
154 /** For phrase matching, first we collect the documents and the positions
155 then we match. */
156 struct fts_match_t {
157 doc_id_t doc_id; /*!< Document id */
158
159 ulint start; /*!< Start the phrase match from
160 this offset within the positions
161 vector. */
162
163 ib_vector_t* positions; /*!< Offsets of a word in a
164 document */
165 };
166
167 /** For matching tokens in a phrase search. We use this data structure in
168 the callback that determines whether a document should be accepted or
169 rejected for a phrase search. */
170 struct fts_select_t {
171 doc_id_t doc_id; /*!< The document id to match */
172
173 ulint min_pos; /*!< For found to be TRUE at least
174 one position must be greater than
175 min_pos. */
176
177 ibool found; /*!< TRUE if found */
178
179 fts_word_freq_t*
180 word_freq; /*!< Word frequency instance of the
181 current word being looked up in
182 the FTS index */
183 };
184
185 typedef std::vector<ulint, ut_allocator<ulint> > pos_vector_t;
186
187 /** structure defines a set of ranges for original documents, each of which
188 has a minimum position and maximum position. Text in such range should
189 contain all words in the proximity search. We will need to count the
190 words in such range to make sure it is less than the specified distance
191 of the proximity search */
192 struct fts_proximity_t {
193 ulint n_pos; /*!< number of position set, defines
194 a range (min to max) containing all
195 matching words */
196 pos_vector_t min_pos; /*!< the minimum position (in bytes)
197 of the range */
198 pos_vector_t max_pos; /*!< the maximum position (in bytes)
199 of the range */
200 };
201
202 /** The match positions and tokesn to match */
203 struct fts_phrase_t {
fts_phrase_tfts_phrase_t204 fts_phrase_t(const dict_table_t* table)
205 :
206 found(false),
207 match(NULL),
208 tokens(NULL),
209 distance(0),
210 charset(NULL),
211 heap(NULL),
212 zip_size(table->space->zip_size()),
213 proximity_pos(NULL),
214 parser(NULL)
215 {
216 }
217
218 /** Match result */
219 ibool found;
220
221 /** Positions within text */
222 const fts_match_t* match;
223
224 /** Tokens to match */
225 const ib_vector_t* tokens;
226
227 /** For matching on proximity distance. Can be 0 for exact match */
228 ulint distance;
229
230 /** Phrase match charset */
231 CHARSET_INFO* charset;
232
233 /** Heap for word processing */
234 mem_heap_t* heap;
235
236 /** ROW_FORMAT=COMPRESSED page size, or 0 */
237 const ulint zip_size;
238
239 /** Position info for proximity search verification. Records the
240 min and max position of words matched */
241 fts_proximity_t* proximity_pos;
242
243 /** FTS plugin parser */
244 st_mysql_ftparser* parser;
245 };
246
247 /** Paramter passed to fts phrase match by parser */
248 struct fts_phrase_param_t {
249 fts_phrase_t* phrase; /*!< Match phrase instance */
250 ulint token_index; /*!< Index of token to match next */
251 mem_heap_t* heap; /*!< Heap for word processing */
252 };
253
254 /** For storing the frequncy of a word/term in a document */
255 struct fts_doc_freq_t {
256 doc_id_t doc_id; /*!< Document id */
257 ulint freq; /*!< Frequency of a word in a document */
258 };
259
260 /** To determine the word frequency per document. */
261 struct fts_word_freq_t {
262 fts_string_t word; /*!< Word for which we need the freq,
263 it's allocated on the query heap */
264
265 ib_rbt_t* doc_freqs; /*!< RB Tree for storing per document
266 word frequencies. The elements are
267 of type fts_doc_freq_t */
268 ib_uint64_t doc_count; /*!< Total number of documents that
269 contain this word */
270 double idf; /*!< Inverse document frequency */
271 };
272
273 /********************************************************************
274 Callback function to fetch the rows in an FTS INDEX record.
275 @return always TRUE */
276 static
277 ibool
278 fts_query_index_fetch_nodes(
279 /*========================*/
280 void* row, /*!< in: sel_node_t* */
281 void* user_arg); /*!< in: pointer to ib_vector_t */
282
283 /********************************************************************
284 Read and filter nodes.
285 @return fts_node_t instance */
286 static
287 dberr_t
288 fts_query_filter_doc_ids(
289 /*=====================*/
290 fts_query_t* query, /*!< in: query instance */
291 const fts_string_t* word, /*!< in: the current word */
292 fts_word_freq_t* word_freq, /*!< in/out: word frequency */
293 const fts_node_t* node, /*!< in: current FTS node */
294 void* data, /*!< in: doc id ilist */
295 ulint len, /*!< in: doc id ilist size */
296 ibool calc_doc_count);/*!< in: whether to remember doc
297 count */
298
299 /** Process (nested) sub-expression, create a new result set to store the
300 sub-expression result by processing nodes under current sub-expression
301 list. Merge the sub-expression result with that of parent expression list.
302 @param[in,out] node current root node
303 @param[in,out] visitor callback function
304 @param[in,out] arg argument for callback
305 @return DB_SUCCESS if all go well */
306 static
307 dberr_t
308 fts_ast_visit_sub_exp(
309 fts_ast_node_t* node,
310 fts_ast_callback visitor,
311 void* arg);
312
#if 0
/*****************************************************************//***
Find a doc_id in a word's ilist.
@return TRUE if found. */
static
ibool
fts_query_find_doc_id(
/*==================*/
	fts_select_t*	select,		/*!< in/out: search the doc id selected,
					update the frequency if found. */
	void*		data,		/*!< in: doc id ilist */
	ulint		len);		/*!< in: doc id ilist size */
#endif
326
327 /*************************************************************//**
328 This function implements a simple "blind" query expansion search:
329 words in documents found in the first search pass will be used as
330 search arguments to search the document again, thus "expand"
331 the search result set.
332 @return DB_SUCCESS if success, otherwise the error code */
333 static
334 dberr_t
335 fts_expand_query(
336 /*=============*/
337 dict_index_t* index, /*!< in: FTS index to search */
338 fts_query_t* query) /*!< in: query result, to be freed
339 by the client */
340 MY_ATTRIBUTE((nonnull, warn_unused_result));
341 /*************************************************************//**
342 This function finds documents that contain all words in a
343 phrase or proximity search. And if proximity search, verify
344 the words are close enough to each other, as in specified distance.
345 This function is called for phrase and proximity search.
346 @return TRUE if documents are found, FALSE if otherwise */
347 static
348 ibool
349 fts_phrase_or_proximity_search(
350 /*===========================*/
351 fts_query_t* query, /*!< in/out: query instance
352 query->doc_ids might be instantiated
353 with qualified doc IDs */
354 ib_vector_t* tokens); /*!< in: Tokens contain words */
355 /*************************************************************//**
356 This function checks whether words in result documents are close to
357 each other (within proximity range as specified by "distance").
358 If "distance" is MAX_ULINT, then it will find all combinations of
359 positions of matching words and store min and max positions
360 in the "qualified_pos" for later verification.
361 @return true if words are close to each other, false if otherwise */
362 static
363 bool
364 fts_proximity_get_positions(
365 /*========================*/
366 fts_match_t** match, /*!< in: query instance */
367 ulint num_match, /*!< in: number of matching
368 items */
369 ulint distance, /*!< in: distance value
370 for proximity search */
371 fts_proximity_t* qualified_pos); /*!< out: the position info
372 records ranges containing
373 all matching words. */
374 #if 0
375 /********************************************************************
376 Get the total number of words in a documents. */
377 static
378 ulint
379 fts_query_terms_in_document(
380 /*========================*/
381 /*!< out: DB_SUCCESS if all go well
382 else error code */
383 fts_query_t* query, /*!< in: FTS query state */
384 doc_id_t doc_id, /*!< in: the word to check */
385 ulint* total); /*!< out: total words in document */
386 #endif
387
388 /********************************************************************
389 Compare two fts_doc_freq_t doc_ids.
390 @return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
391 UNIV_INLINE
392 int
fts_freq_doc_id_cmp(const void * p1,const void * p2)393 fts_freq_doc_id_cmp(
394 /*================*/
395 const void* p1, /*!< in: id1 */
396 const void* p2) /*!< in: id2 */
397 {
398 const fts_doc_freq_t* fq1 = (const fts_doc_freq_t*) p1;
399 const fts_doc_freq_t* fq2 = (const fts_doc_freq_t*) p2;
400
401 return((int) (fq1->doc_id - fq2->doc_id));
402 }
403
#if 0
/*******************************************************************//**
Print the table used for calculating LCS. */
static
void
fts_print_lcs_table(
/*================*/
	const ulint*	table,		/*!< in: array to print */
	ulint		n_rows,		/*!< in: total no. of rows */
	ulint		n_cols)		/*!< in: total no. of cols */
{
	ulint		i;

	for (i = 0; i < n_rows; ++i) {
		ulint	j;

		printf("\n");

		for (j = 0; j < n_cols; ++j) {

			printf("%2lu ", FTS_ELEM(table, n_cols, i, j));
		}
	}
}

/********************************************************************
Find the longest common subsequence between the query string and
the document. */
static
ulint
fts_query_lcs(
/*==========*/
					/*!< out: LCS (length) between
					two ilists */
	const ulint*	p1,		/*!< in: word positions of query */
	ulint		len_p1,		/*!< in: no. of elements in p1 */
	const ulint*	p2,		/*!< in: word positions within document */
	ulint		len_p2)		/*!< in: no. of elements in p2 */
{
	int	i;
	ulint	len = 0;
	ulint	r = len_p1;
	ulint	c = len_p2;
	ulint	size = (r + 1) * (c + 1) * sizeof(ulint);
	ulint*	table = (ulint*) ut_malloc_nokey(size);

	/* FIXME(review): this dead (#if 0) code allocates the table with
	(c + 1) columns but indexes it via FTS_ELEM(table, c, ...), i.e.
	with a row stride of c, and reads p1[r]/p2[c] which are one past
	the end unless the caller appends a -1 sentinel. Revisit before
	ever re-enabling it. */

	/* Traverse the table backwards, from the last row to the first and
	also from the last column to the first. We compute the smaller
	common subsequences first, then use the calculated values to determine
	the longest common subsequence. The result will be in TABLE[0][0]. */
	for (i = r; i >= 0; --i) {
		int	j;

		for (j = c; j >= 0; --j) {

			if (p1[i] == (ulint) -1 || p2[j] == (ulint) -1) {

				FTS_ELEM(table, c, i, j) = 0;

			} else if (p1[i] == p2[j]) {

				FTS_ELEM(table, c, i, j) = FTS_ELEM(
					table, c, i + 1, j + 1) + 1;

			} else {

				ulint	value;

				value = ut_max(
					FTS_ELEM(table, c, i + 1, j),
					FTS_ELEM(table, c, i, j + 1));

				FTS_ELEM(table, c, i, j) = value;
			}
		}
	}

	len = FTS_ELEM(table, c, 0, 0);

	fts_print_lcs_table(table, r, c);
	printf("\nLen=" ULINTPF "\n", len);

	ut_free(table);

	return(len);
}
#endif
491
492 /*******************************************************************//**
493 Compare two fts_ranking_t instance on their rank value and doc ids in
494 descending order on the rank and ascending order on doc id.
495 @return 0 if p1 == p2, < 0 if p1 < p2, > 0 if p1 > p2 */
496 static
497 int
fts_query_compare_rank(const void * p1,const void * p2)498 fts_query_compare_rank(
499 /*===================*/
500 const void* p1, /*!< in: pointer to elem */
501 const void* p2) /*!< in: pointer to elem */
502 {
503 const fts_ranking_t* r1 = (const fts_ranking_t*) p1;
504 const fts_ranking_t* r2 = (const fts_ranking_t*) p2;
505
506 if (r2->rank < r1->rank) {
507 return(-1);
508 } else if (r2->rank == r1->rank) {
509
510 if (r1->doc_id < r2->doc_id) {
511 return(1);
512 } else if (r1->doc_id > r2->doc_id) {
513 return(1);
514 }
515
516 return(0);
517 }
518
519 return(1);
520 }
521
522 /*******************************************************************//**
523 Create words in ranking */
524 static
525 void
fts_ranking_words_create(fts_query_t * query,fts_ranking_t * ranking)526 fts_ranking_words_create(
527 /*=====================*/
528 fts_query_t* query, /*!< in: query instance */
529 fts_ranking_t* ranking) /*!< in: ranking instance */
530 {
531 ranking->words = static_cast<byte*>(
532 mem_heap_zalloc(query->heap, RANKING_WORDS_INIT_LEN));
533 ranking->words_len = RANKING_WORDS_INIT_LEN;
534 }
535
/*
The optimization here is using a char array (bitmap) to replace the words
rb tree in fts_ranking_t.

It can save lots of memory except in some cases of QUERY EXPANSION.

'word_map' is used as a word dictionary, in which the key is a word and the
value is a number. In 'fts_ranking_words_add', we first check if the word is
in 'word_map'. If not, we add it into 'word_map' and assign it a position
(actually a number). Then we set the corresponding bit to '1' at that
position in the char array 'words'.

'word_vector' is a useful backup of 'word_map': it lets us look up a word by
its position more quickly than searching by value in 'word_map'. We use
'word_vector' in 'fts_query_calculate_ranking' and 'fts_expand_query'. In
those two functions we scan the bitmap 'words', fetch the word whenever a bit
is '1', and then get the word_freq by the word.
*/
553
554 /*******************************************************************//**
555 Add a word into ranking */
556 static
557 void
fts_ranking_words_add(fts_query_t * query,fts_ranking_t * ranking,const fts_string_t * word)558 fts_ranking_words_add(
559 /*==================*/
560 fts_query_t* query, /*!< in: query instance */
561 fts_ranking_t* ranking, /*!< in: ranking instance */
562 const fts_string_t* word) /*!< in: term/word to add */
563 {
564 ulint pos;
565 ulint byte_offset;
566 ulint bit_offset;
567 ib_rbt_bound_t parent;
568
569 /* Note: we suppose the word map and vector are append-only. */
570 ut_ad(query->word_vector->size() == rbt_size(query->word_map));
571
572 /* We use ib_rbt to simulate a map, f_n_char means position. */
573 if (rbt_search(query->word_map, &parent, word) == 0) {
574 fts_string_t* result_word;
575
576 result_word = rbt_value(fts_string_t, parent.last);
577 pos = result_word->f_n_char;
578 ut_ad(pos < rbt_size(query->word_map));
579 } else {
580 /* Add the word to map. */
581 fts_string_t new_word;
582
583 pos = rbt_size(query->word_map);
584
585 fts_string_dup(&new_word, word, query->heap);
586 new_word.f_n_char = pos;
587
588 rbt_add_node(query->word_map, &parent, &new_word);
589 ut_ad(rbt_validate(query->word_map));
590 query->word_vector->push_back(new_word);
591 }
592
593 /* Check words len */
594 byte_offset = pos / CHAR_BIT;
595 if (byte_offset >= ranking->words_len) {
596 byte* words = ranking->words;
597 ulint words_len = ranking->words_len;
598
599 while (byte_offset >= words_len) {
600 words_len *= 2;
601 }
602
603 ranking->words = static_cast<byte*>(
604 mem_heap_zalloc(query->heap, words_len));
605 memcpy(ranking->words, words, ranking->words_len);
606 ranking->words_len = words_len;
607 }
608
609 /* Set ranking words */
610 ut_ad(byte_offset < ranking->words_len);
611 bit_offset = pos % CHAR_BIT;
612 ranking->words[byte_offset] = static_cast<byte>(
613 ranking->words[byte_offset] | 1 << bit_offset);
614 }
615
616 /*******************************************************************//**
617 Get a word from a ranking
618 @return true if it's successful */
619 static
620 bool
fts_ranking_words_get_next(const fts_query_t * query,fts_ranking_t * ranking,ulint * pos,fts_string_t * word)621 fts_ranking_words_get_next(
622 /*=======================*/
623 const fts_query_t* query, /*!< in: query instance */
624 fts_ranking_t* ranking,/*!< in: ranking instance */
625 ulint* pos, /*!< in/out: word start pos */
626 fts_string_t* word) /*!< in/out: term/word to add */
627 {
628 bool ret = false;
629 ulint max_pos = ranking->words_len * CHAR_BIT;
630
631 /* Search for next word */
632 while (*pos < max_pos) {
633 ulint byte_offset = *pos / CHAR_BIT;
634 ulint bit_offset = *pos % CHAR_BIT;
635
636 if (ranking->words[byte_offset] & (1 << bit_offset)) {
637 ret = true;
638 break;
639 }
640
641 *pos += 1;
642 };
643
644 /* Get next word from word vector */
645 if (ret) {
646 ut_ad(*pos < query->word_vector->size());
647 *word = query->word_vector->at((size_t)*pos);
648 *pos += 1;
649 }
650
651 return ret;
652 }
653
654 /*******************************************************************//**
655 Add a word if it doesn't exist, to the term freq RB tree. We store
656 a pointer to the word that is passed in as the argument.
657 @return pointer to word */
658 static
659 fts_word_freq_t*
fts_query_add_word_freq(fts_query_t * query,const fts_string_t * word)660 fts_query_add_word_freq(
661 /*====================*/
662 fts_query_t* query, /*!< in: query instance */
663 const fts_string_t* word) /*!< in: term/word to add */
664 {
665 ib_rbt_bound_t parent;
666
667 /* Lookup the word in our rb tree and add if it doesn't exist. */
668 if (rbt_search(query->word_freqs, &parent, word) != 0) {
669 fts_word_freq_t word_freq;
670
671 memset(&word_freq, 0, sizeof(word_freq));
672
673 fts_string_dup(&word_freq.word, word, query->heap);
674
675 word_freq.doc_count = 0;
676
677 word_freq.doc_freqs = rbt_create(
678 sizeof(fts_doc_freq_t), fts_freq_doc_id_cmp);
679
680 parent.last = rbt_add_node(
681 query->word_freqs, &parent, &word_freq);
682
683 query->total_size += word->f_len
684 + SIZEOF_RBT_CREATE
685 + SIZEOF_RBT_NODE_ADD
686 + sizeof(fts_word_freq_t);
687 }
688
689 return(rbt_value(fts_word_freq_t, parent.last));
690 }
691
692 /*******************************************************************//**
693 Add a doc id if it doesn't exist, to the doc freq RB tree.
694 @return pointer to word */
695 static
696 fts_doc_freq_t*
fts_query_add_doc_freq(fts_query_t * query,ib_rbt_t * doc_freqs,doc_id_t doc_id)697 fts_query_add_doc_freq(
698 /*===================*/
699 fts_query_t* query, /*!< in: query instance */
700 ib_rbt_t* doc_freqs, /*!< in: rb tree of fts_doc_freq_t */
701 doc_id_t doc_id) /*!< in: doc id to add */
702 {
703 ib_rbt_bound_t parent;
704
705 /* Lookup the doc id in our rb tree and add if it doesn't exist. */
706 if (rbt_search(doc_freqs, &parent, &doc_id) != 0) {
707 fts_doc_freq_t doc_freq;
708
709 memset(&doc_freq, 0, sizeof(doc_freq));
710
711 doc_freq.freq = 0;
712 doc_freq.doc_id = doc_id;
713
714 parent.last = rbt_add_node(doc_freqs, &parent, &doc_freq);
715
716 query->total_size += SIZEOF_RBT_NODE_ADD
717 + sizeof(fts_doc_freq_t);
718 }
719
720 return(rbt_value(fts_doc_freq_t, parent.last));
721 }
722
723 /*******************************************************************//**
724 Add the doc id to the query set only if it's not in the
725 deleted array. */
726 static
727 void
fts_query_union_doc_id(fts_query_t * query,doc_id_t doc_id,fts_rank_t rank)728 fts_query_union_doc_id(
729 /*===================*/
730 fts_query_t* query, /*!< in: query instance */
731 doc_id_t doc_id, /*!< in: the doc id to add */
732 fts_rank_t rank) /*!< in: if non-zero, it is the
733 rank associated with the doc_id */
734 {
735 ib_rbt_bound_t parent;
736 ulint size = ib_vector_size(query->deleted->doc_ids);
737 doc_id_t* updates = (doc_id_t*) query->deleted->doc_ids->data;
738
739 /* Check if the doc id is deleted and it's not already in our set. */
740 if (fts_bsearch(updates, 0, static_cast<int>(size), doc_id) < 0
741 && rbt_search(query->doc_ids, &parent, &doc_id) != 0) {
742
743 fts_ranking_t ranking;
744
745 ranking.rank = rank;
746 ranking.doc_id = doc_id;
747 fts_ranking_words_create(query, &ranking);
748
749 rbt_add_node(query->doc_ids, &parent, &ranking);
750
751 query->total_size += SIZEOF_RBT_NODE_ADD
752 + sizeof(fts_ranking_t) + RANKING_WORDS_INIT_LEN;
753 }
754 }
755
756 /*******************************************************************//**
757 Remove the doc id from the query set only if it's not in the
758 deleted set. */
759 static
760 void
fts_query_remove_doc_id(fts_query_t * query,doc_id_t doc_id)761 fts_query_remove_doc_id(
762 /*====================*/
763 fts_query_t* query, /*!< in: query instance */
764 doc_id_t doc_id) /*!< in: the doc id to add */
765 {
766 ib_rbt_bound_t parent;
767 ulint size = ib_vector_size(query->deleted->doc_ids);
768 doc_id_t* updates = (doc_id_t*) query->deleted->doc_ids->data;
769
770 /* Check if the doc id is deleted and it's in our set. */
771 if (fts_bsearch(updates, 0, static_cast<int>(size), doc_id) < 0
772 && rbt_search(query->doc_ids, &parent, &doc_id) == 0) {
773 ut_free(rbt_remove_node(query->doc_ids, parent.last));
774
775 ut_ad(query->total_size >=
776 SIZEOF_RBT_NODE_ADD + sizeof(fts_ranking_t));
777 query->total_size -= SIZEOF_RBT_NODE_ADD
778 + sizeof(fts_ranking_t);
779 }
780 }
781
782 /*******************************************************************//**
783 Find the doc id in the query set but not in the deleted set, artificialy
784 downgrade or upgrade its ranking by a value and make/initialize its ranking
785 under or above its normal range 0 to 1. This is used for Boolean Search
786 operator such as Negation operator, which makes word's contribution to the
787 row's relevance to be negative */
788 static
789 void
fts_query_change_ranking(fts_query_t * query,doc_id_t doc_id,ibool downgrade)790 fts_query_change_ranking(
791 /*====================*/
792 fts_query_t* query, /*!< in: query instance */
793 doc_id_t doc_id, /*!< in: the doc id to add */
794 ibool downgrade) /*!< in: Whether to downgrade ranking */
795 {
796 ib_rbt_bound_t parent;
797 ulint size = ib_vector_size(query->deleted->doc_ids);
798 doc_id_t* updates = (doc_id_t*) query->deleted->doc_ids->data;
799
800 /* Check if the doc id is deleted and it's in our set. */
801 if (fts_bsearch(updates, 0, static_cast<int>(size), doc_id) < 0
802 && rbt_search(query->doc_ids, &parent, &doc_id) == 0) {
803
804 fts_ranking_t* ranking;
805
806 ranking = rbt_value(fts_ranking_t, parent.last);
807
808 ranking->rank += downgrade ? RANK_DOWNGRADE : RANK_UPGRADE;
809
810 /* Allow at most 2 adjustment by RANK_DOWNGRADE (-0.5)
811 and RANK_UPGRADE (0.5) */
812 if (ranking->rank >= 1.0F) {
813 ranking->rank = 1.0F;
814 } else if (ranking->rank <= -1.0F) {
815 ranking->rank = -1.0F;
816 }
817 }
818 }
819
820 /*******************************************************************//**
821 Check the doc id in the query set only if it's not in the
822 deleted array. The doc ids that were found are stored in
823 another rb tree (fts_query_t::intersect). */
824 static
825 void
fts_query_intersect_doc_id(fts_query_t * query,doc_id_t doc_id,fts_rank_t rank)826 fts_query_intersect_doc_id(
827 /*=======================*/
828 fts_query_t* query, /*!< in: query instance */
829 doc_id_t doc_id, /*!< in: the doc id to add */
830 fts_rank_t rank) /*!< in: if non-zero, it is the
831 rank associated with the doc_id */
832 {
833 ib_rbt_bound_t parent;
834 ulint size = ib_vector_size(query->deleted->doc_ids);
835 doc_id_t* updates = (doc_id_t*) query->deleted->doc_ids->data;
836 fts_ranking_t* ranking= NULL;
837
838 /* There are three types of intersect:
839 1. '+a': doc_ids is empty, add doc into intersect if it matches 'a'.
840 2. 'a +b': docs match 'a' is in doc_ids, add doc into intersect
841 if it matches 'b'. if the doc is also in doc_ids, then change the
842 doc's rank, and add 'a' in doc's words.
843 3. '+a +b': docs matching '+a' is in doc_ids, add doc into intsersect
844 if it matches 'b' and it's in doc_ids.(multi_exist = true). */
845
846 /* Check if the doc id is deleted and it's in our set */
847 if (fts_bsearch(updates, 0, static_cast<int>(size), doc_id) < 0) {
848 fts_ranking_t new_ranking;
849
850 if (rbt_search(query->doc_ids, &parent, &doc_id) != 0) {
851 if (query->multi_exist) {
852 return;
853 } else {
854 new_ranking.words = NULL;
855 }
856 } else {
857 ranking = rbt_value(fts_ranking_t, parent.last);
858
859 /* We've just checked the doc id before */
860 if (ranking->words == NULL) {
861 ut_ad(rbt_search(query->intersection, &parent,
862 ranking) == 0);
863 return;
864 }
865
866 /* Merge rank */
867 rank += ranking->rank;
868 if (rank >= 1.0F) {
869 rank = 1.0F;
870 } else if (rank <= -1.0F) {
871 rank = -1.0F;
872 }
873
874 /* Take words */
875 new_ranking.words = ranking->words;
876 new_ranking.words_len = ranking->words_len;
877 }
878
879 new_ranking.rank = rank;
880 new_ranking.doc_id = doc_id;
881
882 if (rbt_search(query->intersection, &parent,
883 &new_ranking) != 0) {
884 if (new_ranking.words == NULL) {
885 fts_ranking_words_create(query, &new_ranking);
886
887 query->total_size += RANKING_WORDS_INIT_LEN;
888 } else {
889 /* Note that the intersection has taken
890 ownership of the ranking data. */
891 ranking->words = NULL;
892 }
893
894 rbt_add_node(query->intersection,
895 &parent, &new_ranking);
896
897 query->total_size += SIZEOF_RBT_NODE_ADD
898 + sizeof(fts_ranking_t);
899 }
900 }
901 }
902
903 /*******************************************************************//**
904 Free the document ranking rb tree. */
905 static
906 void
fts_query_free_doc_ids(fts_query_t * query,ib_rbt_t * doc_ids)907 fts_query_free_doc_ids(
908 /*===================*/
909 fts_query_t* query, /*!< in: query instance */
910 ib_rbt_t* doc_ids) /*!< in: rb tree to free */
911 {
912 const ib_rbt_node_t* node;
913
914 for (node = rbt_first(doc_ids); node; node = rbt_first(doc_ids)) {
915
916 fts_ranking_t* ranking;
917
918 ranking = rbt_value(fts_ranking_t, node);
919
920 if (ranking->words) {
921 ranking->words = NULL;
922 }
923
924 ut_free(rbt_remove_node(doc_ids, node));
925
926 ut_ad(query->total_size >=
927 SIZEOF_RBT_NODE_ADD + sizeof(fts_ranking_t));
928 query->total_size -= SIZEOF_RBT_NODE_ADD
929 + sizeof(fts_ranking_t);
930 }
931
932 rbt_free(doc_ids);
933
934 ut_ad(query->total_size >= SIZEOF_RBT_CREATE);
935 query->total_size -= SIZEOF_RBT_CREATE;
936 }
937
938 /*******************************************************************//**
939 Add the word to the documents "list" of matching words from
940 the query. We make a copy of the word from the query heap. */
941 static
942 void
fts_query_add_word_to_document(fts_query_t * query,doc_id_t doc_id,const fts_string_t * word)943 fts_query_add_word_to_document(
944 /*===========================*/
945 fts_query_t* query, /*!< in: query to update */
946 doc_id_t doc_id, /*!< in: the document to update */
947 const fts_string_t* word) /*!< in: the token to add */
948 {
949 ib_rbt_bound_t parent;
950 fts_ranking_t* ranking = NULL;
951
952 if (query->flags == FTS_OPT_RANKING) {
953 return;
954 }
955
956 /* First we search the intersection RB tree as it could have
957 taken ownership of the words rb tree instance. */
958 if (query->intersection
959 && rbt_search(query->intersection, &parent, &doc_id) == 0) {
960
961 ranking = rbt_value(fts_ranking_t, parent.last);
962 }
963
964 if (ranking == NULL
965 && rbt_search(query->doc_ids, &parent, &doc_id) == 0) {
966
967 ranking = rbt_value(fts_ranking_t, parent.last);
968 }
969
970 if (ranking != NULL) {
971 fts_ranking_words_add(query, ranking, word);
972 }
973 }
974
975 /*******************************************************************//**
976 Check the node ilist. */
977 static
978 void
fts_query_check_node(fts_query_t * query,const fts_string_t * token,const fts_node_t * node)979 fts_query_check_node(
980 /*=================*/
981 fts_query_t* query, /*!< in: query to update */
982 const fts_string_t* token, /*!< in: the token to search */
983 const fts_node_t* node) /*!< in: node to check */
984 {
985 /* Skip nodes whose doc ids are out range. */
986 if (query->oper == FTS_EXIST
987 && ((query->upper_doc_id > 0
988 && node->first_doc_id > query->upper_doc_id)
989 || (query->lower_doc_id > 0
990 && node->last_doc_id < query->lower_doc_id))) {
991
992 /* Ignore */
993
994 } else {
995 int ret;
996 ib_rbt_bound_t parent;
997 ulint ilist_size = node->ilist_size;
998 fts_word_freq_t*word_freqs;
999
1000 /* The word must exist. */
1001 ret = rbt_search(query->word_freqs, &parent, token);
1002 ut_a(ret == 0);
1003
1004 word_freqs = rbt_value(fts_word_freq_t, parent.last);
1005
1006 query->error = fts_query_filter_doc_ids(
1007 query, token, word_freqs, node,
1008 node->ilist, ilist_size, TRUE);
1009 }
1010 }
1011
/*****************************************************************//**
Search index cache for word with wildcard match.
@return number of words matched */
static
ulint
fts_cache_find_wildcard(
/*====================*/
	fts_query_t*		query,		/*!< in: query instance */
	const fts_index_cache_t*index_cache,	/*!< in: cache to search */
	const fts_string_t*	token)		/*!< in: token to search */
{
	ib_rbt_bound_t		parent;
	const ib_vector_t*	nodes = NULL;
	fts_string_t		srch_text;
	byte			term[FTS_MAX_WORD_LEN + 1];
	ulint			num_word = 0;

	/* Strip a trailing '%' wildcard, if present, to obtain the
	prefix that is actually compared. */
	srch_text.f_len = (token->f_str[token->f_len - 1] == '%')
			? token->f_len - 1
			: token->f_len;

	strncpy((char*) term, (char*) token->f_str, srch_text.f_len);
	term[srch_text.f_len] = '\0';
	srch_text.f_str = term;

	/* Lookup the word in the rb tree */
	if (rbt_search_cmp(index_cache->words, &parent, &srch_text, NULL,
			   innobase_fts_text_cmp_prefix) == 0) {
		const fts_tokenizer_word_t*	word;
		ulint				i;
		const ib_rbt_node_t*		cur_node;
		ibool				forward = FALSE;

		word = rbt_value(fts_tokenizer_word_t, parent.last);
		cur_node = parent.last;

		/* The prefix search may land anywhere inside the run
		of words sharing the prefix, so scan backwards from the
		initial hit first, then (via the goto below) forwards,
		until the prefix no longer matches. */
		while (innobase_fts_text_cmp_prefix(
			index_cache->charset, &srch_text, &word->text) == 0) {

			nodes = word->nodes;

			for (i = 0; nodes && i < ib_vector_size(nodes); ++i) {
				int			ret;
				const fts_node_t*	node;
				ib_rbt_bound_t		freq_parent;
				fts_word_freq_t*	word_freqs;

				node = static_cast<const fts_node_t*>(
					ib_vector_get_const(nodes, i));

				/* The search prefix must already have a
				frequency entry; matched doc ids are
				accumulated against it. */
				ret = rbt_search(query->word_freqs,
						 &freq_parent,
						 &srch_text);

				ut_a(ret == 0);

				word_freqs = rbt_value(
					fts_word_freq_t,
					freq_parent.last);

				query->error = fts_query_filter_doc_ids(
					query, &srch_text,
					word_freqs, node,
					node->ilist, node->ilist_size, TRUE);

				if (query->error != DB_SUCCESS) {
					return(0);
				}
			}

			num_word++;

			if (!forward) {
				cur_node = rbt_prev(
					index_cache->words, cur_node);
			} else {
cont_search:
				cur_node = rbt_next(
					index_cache->words, cur_node);
			}

			if (!cur_node) {
				break;
			}

			word = rbt_value(fts_tokenizer_word_t, cur_node);
		}

		if (!forward) {
			/* Backward scan exhausted: resume forwards from
			the node after the initial hit. */
			forward = TRUE;
			cur_node = parent.last;
			goto cont_search;
		}
	}

	return(num_word);
}
1109
/*****************************************************************//**
Set difference.
@return DB_SUCCESS if all go well */
static MY_ATTRIBUTE((nonnull, warn_unused_result))
dberr_t
fts_query_difference(
/*=================*/
	fts_query_t*		query,	/*!< in: query instance */
	const fts_string_t*	token)	/*!< in: token to search */
{
	ulint		n_doc_ids= 0;
	trx_t*		trx = query->trx;
	dict_table_t*	table = query->index->table;

	ut_a(query->oper == FTS_IGNORE);

#ifdef FTS_INTERNAL_DIAG_PRINT
	{
		ib::info	out;
		out << "DIFFERENCE: Searching: '";
		out.write(token->f_str, token->f_len);
		out << "'";
	}
#endif

	if (query->doc_ids) {
		n_doc_ids = rbt_size(query->doc_ids);
	}

	/* There is nothing we can subtract from an empty set. */
	if (query->doc_ids && !rbt_empty(query->doc_ids)) {
		ulint	i;
		fts_fetch_t		fetch;
		const ib_vector_t*	nodes;
		const fts_index_cache_t*index_cache;
		que_t*			graph = NULL;
		fts_cache_t*		cache = table->fts->cache;
		dberr_t			error;

		/* The in-memory index cache is consulted under an
		exclusive lock. */
		rw_lock_x_lock(&cache->lock);

		index_cache = fts_find_index_cache(cache, query->index);

		/* Must find the index cache */
		ut_a(index_cache != NULL);

		/* Search the cache for a matching word first. */
		if (query->cur_node->term.wildcard
		    && query->flags != FTS_PROXIMITY
		    && query->flags != FTS_PHRASE) {
			fts_cache_find_wildcard(query, index_cache, token);
		} else {
			nodes = fts_cache_find_word(index_cache, token);

			for (i = 0; nodes && i < ib_vector_size(nodes)
			     && query->error == DB_SUCCESS; ++i) {
				const fts_node_t*	node;

				node = static_cast<const fts_node_t*>(
					ib_vector_get_const(nodes, i));

				fts_query_check_node(query, token, node);
			}
		}

		rw_lock_x_unlock(&cache->lock);

		/* error is passed by 'query->error' */
		if (query->error != DB_SUCCESS) {
			ut_ad(query->error == DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
			return(query->error);
		}

		/* Setup the callback args for filtering and
		consolidating the ilist. */
		fetch.read_arg = query;
		fetch.read_record = fts_query_index_fetch_nodes;

		/* Read the matching nodes from the auxiliary index
		tables as well. */
		error = fts_index_fetch_nodes(
			trx, &graph, &query->fts_index_table, token, &fetch);

		/* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */
		ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS));
		if (error != DB_SUCCESS) {
			query->error = error;
		}

		fts_que_graph_free(graph);
	}

	/* The size can't increase. */
	ut_a(rbt_size(query->doc_ids) <= n_doc_ids);

	return(query->error);
}
1205
/*****************************************************************//**
Intersect the token doc ids with the current set.
@return DB_SUCCESS if all go well */
static MY_ATTRIBUTE((nonnull, warn_unused_result))
dberr_t
fts_query_intersect(
/*================*/
	fts_query_t*		query,	/*!< in: query instance */
	const fts_string_t*	token)	/*!< in: the token to search */
{
	trx_t*		trx = query->trx;
	dict_table_t*	table = query->index->table;

	ut_a(query->oper == FTS_EXIST);

#ifdef FTS_INTERNAL_DIAG_PRINT
	{
		ib::info	out;
		out << "INTERSECT: Searching: '";
		out.write(token->f_str, token->f_len);
		out << "'";
	}
#endif

	/* If the current doc id set is already empty and this is a
	multi-term EXIST query, the intersection is known to be empty
	in advance, so all the work below can be skipped. */
	if (!(rbt_empty(query->doc_ids) && query->multi_exist)) {
		ulint	n_doc_ids = 0;
		ulint	i;
		fts_fetch_t		fetch;
		const ib_vector_t*	nodes;
		const fts_index_cache_t*index_cache;
		que_t*			graph = NULL;
		fts_cache_t*		cache = table->fts->cache;
		dberr_t			error;

		ut_a(!query->intersection);

		n_doc_ids = rbt_size(query->doc_ids);

		/* Create the rb tree that will hold the doc ids of
		the intersection. */
		query->intersection = rbt_create(
			sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);

		query->total_size += SIZEOF_RBT_CREATE;

		/* This is to avoid decompressing the ilist if the
		node's ilist doc ids are out of range. */
		if (!rbt_empty(query->doc_ids) && query->multi_exist) {
			const ib_rbt_node_t*	node;
			doc_id_t*	doc_id;

			node = rbt_first(query->doc_ids);
			doc_id = rbt_value(doc_id_t, node);
			query->lower_doc_id = *doc_id;

			node = rbt_last(query->doc_ids);
			doc_id = rbt_value(doc_id_t, node);
			query->upper_doc_id = *doc_id;

		} else {
			/* A bound of 0 means "no bound", see
			fts_query_check_node(). */
			query->lower_doc_id = 0;
			query->upper_doc_id = 0;
		}

		/* Search the cache for a matching word first. */

		rw_lock_x_lock(&cache->lock);

		/* Search for the index specific cache. */
		index_cache = fts_find_index_cache(cache, query->index);

		/* Must find the index cache. */
		ut_a(index_cache != NULL);

		if (query->cur_node->term.wildcard) {
			/* Wildcard search the index cache */
			fts_cache_find_wildcard(query, index_cache, token);
		} else {
			nodes = fts_cache_find_word(index_cache, token);

			for (i = 0; nodes && i < ib_vector_size(nodes)
			     && query->error == DB_SUCCESS; ++i) {
				const fts_node_t*	node;

				node = static_cast<const fts_node_t*>(
					ib_vector_get_const(nodes, i));

				fts_query_check_node(query, token, node);
			}
		}

		rw_lock_x_unlock(&cache->lock);

		/* error is passed by 'query->error' */
		if (query->error != DB_SUCCESS) {
			ut_ad(query->error == DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
			return(query->error);
		}

		/* Setup the callback args for filtering and
		consolidating the ilist. */
		fetch.read_arg = query;
		fetch.read_record = fts_query_index_fetch_nodes;

		/* Read the matching nodes from the auxiliary index
		tables as well. */
		error = fts_index_fetch_nodes(
			trx, &graph, &query->fts_index_table, token, &fetch);

		/* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */
		ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS));
		if (error != DB_SUCCESS) {
			query->error = error;
		}

		fts_que_graph_free(graph);

		if (query->error == DB_SUCCESS) {
			/* Make the intersection (rb tree) the current doc id
			set and free the old set. */
			fts_query_free_doc_ids(query, query->doc_ids);
			query->doc_ids = query->intersection;
			query->intersection = NULL;

			ut_a(!query->multi_exist || (query->multi_exist
			     && rbt_size(query->doc_ids) <= n_doc_ids));
		}
	}

	return(query->error);
}
1337
1338 /*****************************************************************//**
1339 Query index cache.
1340 @return DB_SUCCESS if all go well */
1341 static
1342 dberr_t
fts_query_cache(fts_query_t * query,const fts_string_t * token)1343 fts_query_cache(
1344 /*============*/
1345 fts_query_t* query, /*!< in/out: query instance */
1346 const fts_string_t* token) /*!< in: token to search */
1347 {
1348 const fts_index_cache_t*index_cache;
1349 dict_table_t* table = query->index->table;
1350 fts_cache_t* cache = table->fts->cache;
1351
1352 /* Search the cache for a matching word first. */
1353 rw_lock_x_lock(&cache->lock);
1354
1355 /* Search for the index specific cache. */
1356 index_cache = fts_find_index_cache(cache, query->index);
1357
1358 /* Must find the index cache. */
1359 ut_a(index_cache != NULL);
1360
1361 if (query->cur_node->term.wildcard
1362 && query->flags != FTS_PROXIMITY
1363 && query->flags != FTS_PHRASE) {
1364 /* Wildcard search the index cache */
1365 fts_cache_find_wildcard(query, index_cache, token);
1366 } else {
1367 const ib_vector_t* nodes;
1368 ulint i;
1369
1370 nodes = fts_cache_find_word(index_cache, token);
1371
1372 for (i = 0; nodes && i < ib_vector_size(nodes)
1373 && query->error == DB_SUCCESS; ++i) {
1374 const fts_node_t* node;
1375
1376 node = static_cast<const fts_node_t*>(
1377 ib_vector_get_const(nodes, i));
1378
1379 fts_query_check_node(query, token, node);
1380 }
1381 }
1382
1383 rw_lock_x_unlock(&cache->lock);
1384
1385 return(query->error);
1386 }
1387
/*****************************************************************//**
Set union.
@return DB_SUCCESS if all go well */
static MY_ATTRIBUTE((nonnull, warn_unused_result))
dberr_t
fts_query_union(
/*============*/
	fts_query_t*	query,	/*!< in: query instance */
	fts_string_t*	token)	/*!< in: token to search */
{
	fts_fetch_t		fetch;
	ulint			n_doc_ids = 0;
	trx_t*			trx = query->trx;
	que_t*			graph = NULL;
	dberr_t			error;

	ut_a(query->oper == FTS_NONE || query->oper == FTS_DECR_RATING ||
	     query->oper == FTS_NEGATE || query->oper == FTS_INCR_RATING);

#ifdef FTS_INTERNAL_DIAG_PRINT
	{
		ib::info	out;
		out << "UNION: Searching: '";
		out.write(token->f_str, token->f_len);
		out << "'";
	}
#endif

	if (query->doc_ids) {
		n_doc_ids = rbt_size(query->doc_ids);
	}

	/* An empty token matches nothing. */
	if (token->f_len == 0) {
		return(query->error);
	}

	/* Search the in-memory index cache; any failure is carried
	in query->error and checked after the disk fetch below. */
	fts_query_cache(query, token);

	/* Setup the callback args for filtering and
	consolidating the ilist. */
	fetch.read_arg = query;
	fetch.read_record = fts_query_index_fetch_nodes;

	/* Read the nodes from disk. */
	error = fts_index_fetch_nodes(
		trx, &graph, &query->fts_index_table, token, &fetch);

	/* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */
	ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS));
	if (error != DB_SUCCESS) {
		query->error = error;
	}

	fts_que_graph_free(graph);

	if (query->error == DB_SUCCESS) {

		/* The size can't decrease. */
		ut_a(rbt_size(query->doc_ids) >= n_doc_ids);

		/* Calculate the number of doc ids that were added to
		the current doc id set. */
		if (query->doc_ids) {
			n_doc_ids = rbt_size(query->doc_ids) - n_doc_ids;
		}
	}

	return(query->error);
}
1457
1458 /*****************************************************************//**
1459 Depending upon the current query operator process the doc id.
1460 return DB_SUCCESS if all go well
1461 or return DB_FTS_EXCEED_RESULT_CACHE_LIMIT */
1462 static
1463 dberr_t
fts_query_process_doc_id(fts_query_t * query,doc_id_t doc_id,fts_rank_t rank)1464 fts_query_process_doc_id(
1465 /*=====================*/
1466 fts_query_t* query, /*!< in: query instance */
1467 doc_id_t doc_id, /*!< in: doc id to process */
1468 fts_rank_t rank) /*!< in: if non-zero, it is the
1469 rank associated with the doc_id */
1470 {
1471 if (query->flags == FTS_OPT_RANKING) {
1472 return(DB_SUCCESS);
1473 }
1474
1475 switch (query->oper) {
1476 case FTS_NONE:
1477 fts_query_union_doc_id(query, doc_id, rank);
1478 break;
1479
1480 case FTS_EXIST:
1481 fts_query_intersect_doc_id(query, doc_id, rank);
1482 break;
1483
1484 case FTS_IGNORE:
1485 fts_query_remove_doc_id(query, doc_id);
1486 break;
1487
1488 case FTS_NEGATE:
1489 fts_query_change_ranking(query, doc_id, TRUE);
1490 break;
1491
1492 case FTS_DECR_RATING:
1493 fts_query_union_doc_id(query, doc_id, rank);
1494 fts_query_change_ranking(query, doc_id, TRUE);
1495 break;
1496
1497 case FTS_INCR_RATING:
1498 fts_query_union_doc_id(query, doc_id, rank);
1499 fts_query_change_ranking(query, doc_id, FALSE);
1500 break;
1501
1502 default:
1503 ut_error;
1504 }
1505
1506 if (query->total_size > fts_result_cache_limit) {
1507 return(DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
1508 } else {
1509 return(DB_SUCCESS);
1510 }
1511 }
1512
/*****************************************************************//**
Merge two result sets. */
static
dberr_t
fts_merge_doc_ids(
/*==============*/
	fts_query_t*	query,		/*!< in,out: query instance */
	const ib_rbt_t*	doc_ids)	/*!< in: result set to merge */
{
	const ib_rbt_node_t*	node;

	DBUG_ENTER("fts_merge_doc_ids");

	ut_a(!query->intersection);

	/* To process FTS_EXIST operation (intersection), we need
	to create a new result set for fts_query_intersect(). */
	if (query->oper == FTS_EXIST) {

		query->intersection = rbt_create(
			sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);

		query->total_size += SIZEOF_RBT_CREATE;
	}

	/* Merge the elements to the result set. */
	for (node = rbt_first(doc_ids); node; node = rbt_next(doc_ids, node)) {
		fts_ranking_t*		ranking;
		ulint			pos = 0;
		fts_string_t		word;

		ranking = rbt_value(fts_ranking_t, node);

		query->error = fts_query_process_doc_id(
			query, ranking->doc_id, ranking->rank);

		if (query->error != DB_SUCCESS) {
			/* NOTE(review): on this early return any
			intersection tree created above stays attached
			to the query; presumably released by the
			caller's error path -- confirm. */
			DBUG_RETURN(query->error);
		}

		/* Merge words. Don't need to take operator into account. */
		ut_a(ranking->words);
		while (fts_ranking_words_get_next(query, ranking, &pos, &word)) {
			fts_query_add_word_to_document(query, ranking->doc_id,
						       &word);
		}
	}

	/* If it is an intersection operation, reset query->doc_ids
	to query->intersection and free the old result list. */
	if (query->oper == FTS_EXIST && query->intersection != NULL) {
		fts_query_free_doc_ids(query, query->doc_ids);
		query->doc_ids = query->intersection;
		query->intersection = NULL;
	}

	DBUG_RETURN(DB_SUCCESS);
}
1571
1572 /*****************************************************************//**
1573 Skip non-whitespace in a string. Move ptr to the next word boundary.
1574 @return pointer to first whitespace character or end */
1575 UNIV_INLINE
1576 byte*
fts_query_skip_word(byte * ptr,const byte * end)1577 fts_query_skip_word(
1578 /*================*/
1579 byte* ptr, /*!< in: start of scan */
1580 const byte* end) /*!< in: pointer to end of string */
1581 {
1582 /* TODO: Does this have to be UTF-8 too ? */
1583 while (ptr < end && !(ispunct(*ptr) || isspace(*ptr))) {
1584 ++ptr;
1585 }
1586
1587 return(ptr);
1588 }
1589
/*****************************************************************//**
Check whether the remaining terms in the phrase match the text.
@return TRUE if matched else FALSE */
static
ibool
fts_query_match_phrase_terms(
/*=========================*/
	fts_phrase_t*	phrase,	/*!< in: phrase to match */
	byte**		start,	/*!< in/out: text to search, we can't
				make this const because we need to
				first convert the string to
				lowercase */
	const byte*	end,	/*!< in: pointer to the end of
				the string to search */
	mem_heap_t*	heap)	/*!< in: heap */
{
	ulint			i;
	byte*			ptr = *start;
	const ib_vector_t*	tokens = phrase->tokens;
	ulint			distance = phrase->distance;

	/* We check only from the second term onwards, since the first
	must have matched otherwise we wouldn't be here. */
	for (i = 1; ptr < end && i < ib_vector_size(tokens); /* No op */) {
		fts_string_t		match;
		fts_string_t		cmp_str;
		const fts_string_t*	token;
		int			result;
		ulint			ret;

		/* Extract the next token from the document text. */
		ret = innobase_mysql_fts_get_token(
			phrase->charset, ptr,
			const_cast<byte*>(end), &match);

		if (match.f_len > 0) {
			/* Get next token to match. */
			token = static_cast<const fts_string_t*>(
				ib_vector_get_const(tokens, i));

			fts_string_dup(&cmp_str, &match, heap);

			result = innobase_fts_text_case_cmp(
				phrase->charset, token, &cmp_str);

			/* Skip the rest of the tokens if this one doesn't
			match and the proximity distance is exceeded. */
			if (result
			    && (distance == ULINT_UNDEFINED
				|| distance == 0)) {

				break;
			}

			/* This token matched move to the next token. */
			if (result == 0) {
				/* Advance the text to search by the length
				of the last token. */
				ptr += ret;

				/* Advance to the next token. */
				++i;
			} else {

				ut_a(distance != ULINT_UNDEFINED);

				/* Mismatch within the allowed distance:
				skip this word and retry the same phrase
				term on the next one. */
				ptr = fts_query_skip_word(ptr, end);
			}

			/* Distance can be 0 for exact matches. */
			if (distance != ULINT_UNDEFINED && distance > 0) {
				--distance;
			}
		} else {
			/* No token extracted; advance past the bytes
			that were scanned. */
			ptr += ret;
		}
	}

	*start = ptr;

	/* Can't be greater than the number of elements. */
	ut_a(i <= ib_vector_size(tokens));

	/* This is the case for multiple words. */
	if (i == ib_vector_size(tokens)) {
		phrase->found = TRUE;
	}

	return(phrase->found);
}
1679
/*****************************************************************//**
Callback function to count the number of words in position ranges,
and see whether the word count is in specified "phrase->distance"
@return true if the number of words is less than the "distance" */
static
bool
fts_proximity_is_word_in_range(
/*===========================*/
	const fts_phrase_t*
			phrase,		/*!< in: phrase with the search info */
	byte*		start,		/*!< in: text to search */
	ulint		total_len)	/*!< in: length of text */
{
	fts_proximity_t*	proximity_pos = phrase->proximity_pos;

	ut_ad(proximity_pos->n_pos == proximity_pos->min_pos.size());
	ut_ad(proximity_pos->n_pos == proximity_pos->max_pos.size());

	/* Search each matched position pair (with min and max positions)
	and count the number of words in the range */
	for (ulint i = 0; i < proximity_pos->n_pos; i++) {
		ulint		cur_pos = proximity_pos->min_pos[i];
		ulint		n_word = 0;

		ut_ad(proximity_pos->max_pos[i] <= total_len);

		/* Walk through words in the range and count them */
		while (cur_pos <= proximity_pos->max_pos[i]) {
			ulint		len;
			fts_string_t	str;

			len = innobase_mysql_fts_get_token(
				phrase->charset,
				start + cur_pos,
				start + total_len, &str);

			if (len == 0) {
				break;
			}

			/* Advances position with "len" bytes */
			cur_pos += len;

			/* Record the number of words */
			if (str.f_n_char > 0) {
				n_word++;
			}

			/* Already over the limit; no point counting
			the rest of this range. */
			if (n_word > phrase->distance) {
				break;
			}
		}

		/* Check if the number of words is less than specified
		"distance" */
		if (n_word && n_word <= phrase->distance) {
			return(true);
		}
	}

	return(false);
}
1742
/*****************************************************************//**
FTS plugin parser 'mysql_add_word' callback function for phrase match
Refer to 'st_mysql_ftparser_param' for more detail.
@return 0 if match, or return non-zero */
static
int
fts_query_match_phrase_add_word_for_parser(
/*=======================================*/
	MYSQL_FTPARSER_PARAM*	param,	/*!< in: parser param */
	const char*		word,	/*!< in: token */
	int			word_len,	/*!< in: token length */
	MYSQL_FTPARSER_BOOLEAN_INFO*)
{
	fts_phrase_param_t*	phrase_param;
	fts_phrase_t*		phrase;
	const ib_vector_t*	tokens;
	fts_string_t		match;
	fts_string_t		cmp_str;
	const fts_string_t*	token;
	int			result;
	mem_heap_t*		heap;

	phrase_param = static_cast<fts_phrase_param_t*>(param->mysql_ftparam);
	heap = phrase_param->heap;
	phrase = phrase_param->phrase;
	tokens = phrase->tokens;

	/* In case plugin parser doesn't check return value */
	if (phrase_param->token_index == ib_vector_size(tokens)) {
		return(1);
	}

	match.f_str = (uchar *)(word);
	match.f_len = ulint(word_len);
	match.f_n_char= fts_get_token_size(phrase->charset, word, match.f_len);

	if (match.f_len > 0) {
		/* Get next token to match. */
		ut_a(phrase_param->token_index < ib_vector_size(tokens));
		token = static_cast<const fts_string_t*>(
			ib_vector_get_const(tokens, phrase_param->token_index));

		fts_string_dup(&cmp_str, &match, heap);

		result = innobase_fts_text_case_cmp(
			phrase->charset, token, &cmp_str);

		if (result == 0) {
			/* Matched: advance to the next phrase term. */
			phrase_param->token_index++;
		} else {
			/* Mismatch ends this candidate position. */
			return(1);
		}
	}

	/* Can't be greater than the number of elements. */
	ut_a(phrase_param->token_index <= ib_vector_size(tokens));

	/* This is the case for multiple words. */
	if (phrase_param->token_index == ib_vector_size(tokens)) {
		phrase->found = TRUE;
	}

	return(static_cast<int>(phrase->found));
}
1807
/*****************************************************************//**
Check whether the terms in the phrase match the text.
@return TRUE if matched else FALSE */
static
ibool
fts_query_match_phrase_terms_by_parser(
/*===================================*/
	fts_phrase_param_t*	phrase_param,	/* in/out: phrase param */
	st_mysql_ftparser*	parser,		/* in: plugin fts parser */
	byte*			text,		/* in: text to check */
	ulint			len)		/* in: text length */
{
	MYSQL_FTPARSER_PARAM	param;

	ut_a(parser);

	/* Set parameters for param.  Each token the plugin parser
	emits is matched against the phrase terms by the
	fts_query_match_phrase_add_word_for_parser() callback. */
	param.mysql_parse = fts_tokenize_document_internal;
	param.mysql_add_word = fts_query_match_phrase_add_word_for_parser;
	param.mysql_ftparam = phrase_param;
	param.cs = phrase_param->phrase->charset;
	param.doc = reinterpret_cast<char*>(text);
	param.length = static_cast<int>(len);
	param.mode= MYSQL_FTPARSER_WITH_STOPWORDS;

	PARSER_INIT(parser, &param);
	parser->parse(&param);
	PARSER_DEINIT(parser, &param);

	return(phrase_param->phrase->found);
}
1839
/*****************************************************************//**
Callback function to fetch and search the document.
@return TRUE if matched else FALSE */
static
ibool
fts_query_match_phrase(
/*===================*/
	fts_phrase_t*	phrase,		/*!< in: phrase to match */
	byte*		start,		/*!< in: text to search, we can't make
					this const because we need to first
					convert the string to lowercase */
	ulint		cur_len,	/*!< in: length of text */
	ulint		prev_len,	/*!< in: total length for searched
					doc fields*/
	mem_heap_t*	heap)		/*!< in: heap */
{
	ulint			i;
	const fts_string_t*	first;
	const byte*		end = start + cur_len;
	const ib_vector_t*	tokens = phrase->tokens;
	const ib_vector_t*	positions = phrase->match->positions;

	ut_a(!phrase->found);
	ut_a(phrase->match->doc_id > 0);
	ut_a(ib_vector_size(tokens) > 0);
	ut_a(ib_vector_size(positions) > 0);

	first = static_cast<const fts_string_t*>(
		ib_vector_get_const(tokens, 0));

	ut_a(phrase->match->start < ib_vector_size(positions));

	/* Try each recorded position of the first token in turn. */
	for (i = phrase->match->start; i < ib_vector_size(positions); ++i) {
		ulint	pos;
		byte*	ptr = start;

		pos = *(ulint*) ib_vector_get_const(positions, i);

		if (pos == ULINT_UNDEFINED) {
			break;
		}

		/* Positions before prev_len belong to fields that were
		already searched. */
		if (pos < prev_len) {
			continue;
		}

		/* Document positions are calculated from the beginning
		of the first field, need to save the length for each
		searched field to adjust the doc position when search
		phrases. */
		pos -= prev_len;
		ptr = start + pos;

		/* Within limits ? */
		if (ptr >= end) {
			break;
		}

		if (phrase->parser) {
			fts_phrase_param_t	phrase_param;

			phrase_param.phrase = phrase;
			phrase_param.token_index = 0;
			phrase_param.heap = heap;

			/* Let the plugin parser tokenize from this
			candidate position onwards. */
			if (fts_query_match_phrase_terms_by_parser(
				    &phrase_param,
				    phrase->parser,
				    ptr,
				    ulint(end - ptr))) {
				break;
			}
		} else {
			fts_string_t	match;
			fts_string_t	cmp_str;
			ulint		ret;

			match.f_str = ptr;
			ret = innobase_mysql_fts_get_token(
				phrase->charset, start + pos,
				const_cast<byte*>(end), &match);

			if (match.f_len == 0) {
				break;
			}

			fts_string_dup(&cmp_str, &match, heap);

			if (innobase_fts_text_case_cmp(
				    phrase->charset, first, &cmp_str) == 0) {

				/* This is the case for the single word
				in the phrase. */
				if (ib_vector_size(phrase->tokens) == 1) {
					phrase->found = TRUE;
					break;
				}

				ptr += ret;

				/* Match the remaining terms in the phrase. */
				if (fts_query_match_phrase_terms(phrase, &ptr,
								 end, heap)) {
					break;
				}
			}
		}
	}

	return(phrase->found);
}
1951
/*****************************************************************//**
Callback function to fetch and search the document.
@return whether the phrase is found */
static
ibool
fts_query_fetch_document(
/*=====================*/
	void*	row,		/*!< in: sel_node_t* */
	void*	user_arg)	/*!< in: fts_phrase_t* */
{

	que_node_t*	exp;
	sel_node_t*	node = static_cast<sel_node_t*>(row);
	fts_phrase_t*	phrase = static_cast<fts_phrase_t*>(user_arg);
	ulint		prev_len = 0;
	ulint		total_len = 0;
	byte*		document_text = NULL;

	exp = node->select_list;

	phrase->found = FALSE;

	/* For proximity search, we will need to get the whole document
	from all fields, so first count the total length of the document
	from all the fields */
	if (phrase->proximity_pos) {
		while (exp) {
			ulint		field_len;
			dfield_t*	dfield = que_node_get_val(exp);
			byte*		data = static_cast<byte*>(
						dfield_get_data(dfield));

			if (dfield_is_ext(dfield)) {
				/* Externally stored column: read the
				real length out of the BLOB reference
				at the end of the in-record prefix. */
				ulint	local_len = dfield_get_len(dfield);

				local_len -= BTR_EXTERN_FIELD_REF_SIZE;

				field_len = mach_read_from_4(
					data + local_len + BTR_EXTERN_LEN + 4);
			} else {
				field_len = dfield_get_len(dfield);
			}

			if (field_len != UNIV_SQL_NULL) {
				/* The +1 reserves a separator byte
				between concatenated fields; the buffer
				below is zero-filled, so the separator
				is 0x00. */
				total_len += field_len + 1;
			}

			exp = que_node_get_next(exp);
		}

		document_text = static_cast<byte*>(mem_heap_zalloc(
			phrase->heap, total_len));

		if (!document_text) {
			return(FALSE);
		}
	}

	exp = node->select_list;

	while (exp) {
		dfield_t*	dfield = que_node_get_val(exp);
		byte*		data = static_cast<byte*>(
					dfield_get_data(dfield));
		ulint		cur_len;

		if (dfield_is_ext(dfield)) {
			data = btr_copy_externally_stored_field(
				&cur_len, data, phrase->zip_size,
				dfield_get_len(dfield), phrase->heap);
		} else {
			cur_len = dfield_get_len(dfield);
		}

		if (cur_len != UNIV_SQL_NULL && cur_len != 0) {
			if (phrase->proximity_pos) {
				/* Proximity: concatenate all fields into
				one buffer and check ranges afterwards. */
				ut_ad(prev_len + cur_len <= total_len);
				memcpy(document_text + prev_len, data, cur_len);
			} else {
				/* For phrase search */
				phrase->found =
					fts_query_match_phrase(
						phrase,
						static_cast<byte*>(data),
						cur_len, prev_len,
						phrase->heap);
			}

			/* Document positions are calculated from the beginning
			of the first field, need to save the length for each
			searched field to adjust the doc position when search
			phrases. */
			prev_len += cur_len + 1;
		}

		if (phrase->found) {
			break;
		}

		exp = que_node_get_next(exp);
	}

	if (phrase->proximity_pos) {
		ut_ad(prev_len <= total_len);

		phrase->found = fts_proximity_is_word_in_range(
			phrase, document_text, total_len);
	}

	return(phrase->found);
}
2063
2064 #if 0
2065 /********************************************************************
2066 Callback function to check whether a record was found or not. */
2067 static
2068 ibool
2069 fts_query_select(
2070 /*=============*/
2071 void* row, /*!< in: sel_node_t* */
2072 void* user_arg) /*!< in: fts_doc_t* */
2073 {
2074 int i;
2075 que_node_t* exp;
2076 sel_node_t* node = row;
2077 fts_select_t* select = user_arg;
2078
2079 ut_a(select->word_freq);
2080 ut_a(select->word_freq->doc_freqs);
2081
2082 exp = node->select_list;
2083
2084 for (i = 0; exp && !select->found; ++i) {
2085 dfield_t* dfield = que_node_get_val(exp);
2086 void* data = dfield_get_data(dfield);
2087 ulint len = dfield_get_len(dfield);
2088
2089 switch (i) {
2090 case 0: /* DOC_COUNT */
2091 if (len != UNIV_SQL_NULL && len != 0) {
2092
2093 select->word_freq->doc_count +=
2094 mach_read_from_4(data);
2095 }
2096 break;
2097
2098 case 1: /* ILIST */
2099 if (len != UNIV_SQL_NULL && len != 0) {
2100
2101 fts_query_find_doc_id(select, data, len);
2102 }
2103 break;
2104
2105 default:
2106 ut_error;
2107 }
2108
2109 exp = que_node_get_next(exp);
2110 }
2111
2112 return(FALSE);
2113 }
2114
2115 /********************************************************************
2116 Read the rows from the FTS index, that match word and where the
2117 doc id is between first and last doc id.
2118 @return DB_SUCCESS if all go well else error code */
2119 static MY_ATTRIBUTE((nonnull, warn_unused_result))
2120 dberr_t
2121 fts_query_find_term(
2122 /*================*/
2123 fts_query_t* query, /*!< in: FTS query state */
2124 que_t** graph, /*!< in: prepared statement */
2125 const fts_string_t* word, /*!< in: the word to fetch */
2126 doc_id_t doc_id, /*!< in: doc id to match */
2127 ulint* min_pos,/*!< in/out: pos found must be
2128 greater than this minimum value. */
2129 ibool* found) /*!< out: TRUE if found else FALSE */
2130 {
2131 pars_info_t* info;
2132 dberr_t error;
2133 fts_select_t select;
2134 doc_id_t match_doc_id;
2135 trx_t* trx = query->trx;
2136 char table_name[MAX_FULL_NAME_LEN];
2137
2138 trx->op_info = "fetching FTS index matching nodes";
2139
2140 if (*graph) {
2141 info = (*graph)->info;
2142 } else {
2143 ulint selected;
2144
2145 info = pars_info_create();
2146
2147 selected = fts_select_index(*word->f_str);
2148 query->fts_index_table.suffix = fts_get_suffix(selected);
2149
2150 fts_get_table_name(&query->fts_index_table, table_name);
2151 pars_info_bind_id(info, "index_table_name", table_name);
2152 }
2153
2154 select.found = FALSE;
2155 select.doc_id = doc_id;
2156 select.min_pos = *min_pos;
2157 select.word_freq = fts_query_add_word_freq(query, word->f_str);
2158
2159 pars_info_bind_function(info, "my_func", fts_query_select, &select);
2160 pars_info_bind_varchar_literal(info, "word", word->f_str, word->f_len);
2161
2162 /* Convert to "storage" byte order. */
2163 fts_write_doc_id((byte*) &match_doc_id, doc_id);
2164
2165 fts_bind_doc_id(info, "min_doc_id", &match_doc_id);
2166
2167 fts_bind_doc_id(info, "max_doc_id", &match_doc_id);
2168
2169 if (!*graph) {
2170
2171 *graph = fts_parse_sql(
2172 &query->fts_index_table,
2173 info,
2174 "DECLARE FUNCTION my_func;\n"
2175 "DECLARE CURSOR c IS"
2176 " SELECT doc_count, ilist\n"
2177 " FROM $index_table_name\n"
2178 " WHERE word LIKE :word AND"
2179 " first_doc_id <= :min_doc_id AND"
2180 " last_doc_id >= :max_doc_id\n"
2181 " ORDER BY first_doc_id;\n"
2182 "BEGIN\n"
2183 "\n"
2184 "OPEN c;\n"
2185 "WHILE 1 = 1 LOOP\n"
2186 " FETCH c INTO my_func();\n"
2187 " IF c % NOTFOUND THEN\n"
2188 " EXIT;\n"
2189 " END IF;\n"
2190 "END LOOP;\n"
2191 "CLOSE c;");
2192 }
2193
2194 for (;;) {
2195 error = fts_eval_sql(trx, *graph);
2196
2197 if (error == DB_SUCCESS) {
2198
2199 break; /* Exit the loop. */
2200 } else {
2201
2202 if (error == DB_LOCK_WAIT_TIMEOUT) {
2203 ib::warn() << "lock wait timeout reading FTS"
2204 " index. Retrying!";
2205
2206 trx->error_state = DB_SUCCESS;
2207 } else {
2208 ib::error() << error
2209 << " while reading FTS index.";
2210
2211 break; /* Exit the loop. */
2212 }
2213 }
2214 }
2215
2216 /* Value to return */
2217 *found = select.found;
2218
2219 if (*found) {
2220 *min_pos = select.min_pos;
2221 }
2222
2223 return(error);
2224 }
2225
2226 /********************************************************************
2227 Callback aggregator for int columns. */
2228 static
2229 ibool
2230 fts_query_sum(
2231 /*==========*/
2232 /*!< out: always returns TRUE */
2233 void* row, /*!< in: sel_node_t* */
2234 void* user_arg) /*!< in: ulint* */
2235 {
2236
2237 que_node_t* exp;
2238 sel_node_t* node = row;
2239 ulint* total = user_arg;
2240
2241 exp = node->select_list;
2242
2243 while (exp) {
2244 dfield_t* dfield = que_node_get_val(exp);
2245 void* data = dfield_get_data(dfield);
2246 ulint len = dfield_get_len(dfield);
2247
2248 if (len != UNIV_SQL_NULL && len != 0) {
2249 *total += mach_read_from_4(data);
2250 }
2251
2252 exp = que_node_get_next(exp);
2253 }
2254
2255 return(TRUE);
2256 }
2257
2258 /********************************************************************
2259 Calculate the total documents that contain a particular word (term).
2260 @return DB_SUCCESS if all go well else error code */
2261 static MY_ATTRIBUTE((nonnull, warn_unused_result))
2262 dberr_t
2263 fts_query_total_docs_containing_term(
2264 /*=================================*/
2265 fts_query_t* query, /*!< in: FTS query state */
2266 const fts_string_t* word, /*!< in: the word to check */
2267 ulint* total) /*!< out: documents containing word */
2268 {
2269 pars_info_t* info;
2270 dberr_t error;
2271 que_t* graph;
2272 ulint selected;
2273 trx_t* trx = query->trx;
2274 char table_name[MAX_FULL_NAME_LEN]
2275
2276 trx->op_info = "fetching FTS index document count";
2277
2278 *total = 0;
2279
2280 info = pars_info_create();
2281
2282 pars_info_bind_function(info, "my_func", fts_query_sum, total);
2283 pars_info_bind_varchar_literal(info, "word", word->f_str, word->f_len);
2284
2285 selected = fts_select_index(*word->f_str);
2286
2287 query->fts_index_table.suffix = fts_get_suffix(selected);
2288
2289 fts_get_table_name(&query->fts_index_table, table_name);
2290
2291 pars_info_bind_id(info, "index_table_name", table_name);
2292
2293 graph = fts_parse_sql(
2294 &query->fts_index_table,
2295 info,
2296 "DECLARE FUNCTION my_func;\n"
2297 "DECLARE CURSOR c IS"
2298 " SELECT doc_count\n"
2299 " FROM $index_table_name\n"
2300 " WHERE word = :word"
2301 " ORDER BY first_doc_id;\n"
2302 "BEGIN\n"
2303 "\n"
2304 "OPEN c;\n"
2305 "WHILE 1 = 1 LOOP\n"
2306 " FETCH c INTO my_func();\n"
2307 " IF c % NOTFOUND THEN\n"
2308 " EXIT;\n"
2309 " END IF;\n"
2310 "END LOOP;\n"
2311 "CLOSE c;");
2312
2313 for (;;) {
2314 error = fts_eval_sql(trx, graph);
2315
2316 if (error == DB_SUCCESS) {
2317
2318 break; /* Exit the loop. */
2319 } else {
2320
2321 if (error == DB_LOCK_WAIT_TIMEOUT) {
2322 ib::warn() << "lock wait timeout reading FTS"
2323 " index. Retrying!";
2324
2325 trx->error_state = DB_SUCCESS;
2326 } else {
2327 ib::error() << error
2328 << " while reading FTS index.";
2329
2330 break; /* Exit the loop. */
2331 }
2332 }
2333 }
2334
2335 fts_que_graph_free(graph);
2336
2337 return(error);
2338 }
2339
2340 /********************************************************************
2341 Get the total number of words in a documents.
2342 @return DB_SUCCESS if all go well else error code */
2343 static MY_ATTRIBUTE((nonnull, warn_unused_result))
2344 dberr_t
2345 fts_query_terms_in_document(
2346 /*========================*/
2347 fts_query_t* query, /*!< in: FTS query state */
2348 doc_id_t doc_id, /*!< in: the word to check */
2349 ulint* total) /*!< out: total words in document */
2350 {
2351 pars_info_t* info;
2352 dberr_t error;
2353 que_t* graph;
2354 doc_id_t read_doc_id;
2355 trx_t* trx = query->trx;
2356 char table_name[MAX_FULL_NAME_LEN];
2357
2358 trx->op_info = "fetching FTS document term count";
2359
2360 *total = 0;
2361
2362 info = pars_info_create();
2363
2364 pars_info_bind_function(info, "my_func", fts_query_sum, total);
2365
2366 /* Convert to "storage" byte order. */
2367 fts_write_doc_id((byte*) &read_doc_id, doc_id);
2368 fts_bind_doc_id(info, "doc_id", &read_doc_id);
2369
2370 query->fts_index_table.suffix = "DOC_ID";
2371
2372 fts_get_table_name(&query->fts_index_table, table_name);
2373
2374 pars_info_bind_id(info, "index_table_name", table_name);
2375
2376 graph = fts_parse_sql(
2377 &query->fts_index_table,
2378 info,
2379 "DECLARE FUNCTION my_func;\n"
2380 "DECLARE CURSOR c IS"
2381 " SELECT count\n"
2382 " FROM $index_table_name\n"
2383 " WHERE doc_id = :doc_id"
2384 " BEGIN\n"
2385 "\n"
2386 "OPEN c;\n"
2387 "WHILE 1 = 1 LOOP\n"
2388 " FETCH c INTO my_func();\n"
2389 " IF c % NOTFOUND THEN\n"
2390 " EXIT;\n"
2391 " END IF;\n"
2392 "END LOOP;\n"
2393 "CLOSE c;");
2394
2395 for (;;) {
2396 error = fts_eval_sql(trx, graph);
2397
2398 if (error == DB_SUCCESS) {
2399
2400 break; /* Exit the loop. */
2401 } else {
2402
2403 if (error == DB_LOCK_WAIT_TIMEOUT) {
2404 ib::warn() << "lock wait timeout reading FTS"
2405 " doc id table. Retrying!";
2406
2407 trx->error_state = DB_SUCCESS;
2408 } else {
2409 ib::error() << error << " while reading FTS"
2410 " doc id table.";
2411
2412 break; /* Exit the loop. */
2413 }
2414 }
2415 }
2416
2417 fts_que_graph_free(graph);
2418
2419 return(error);
2420 }
2421 #endif
2422
2423 /*****************************************************************//**
2424 Retrieve the document and match the phrase tokens.
2425 @return DB_SUCCESS or error code */
2426 MY_ATTRIBUTE((nonnull(1,2,3,6), warn_unused_result))
2427 static
2428 dberr_t
fts_query_match_document(ib_vector_t * tokens,fts_get_doc_t * get_doc,fts_match_t * match,ulint distance,st_mysql_ftparser * parser,ibool * found)2429 fts_query_match_document(
2430 /*=====================*/
2431 ib_vector_t* tokens, /*!< in: phrase tokens */
2432 fts_get_doc_t* get_doc, /*!< in: table and prepared statements */
2433 fts_match_t* match, /*!< in: doc id and positions */
2434 ulint distance, /*!< in: proximity distance */
2435 st_mysql_ftparser* parser, /*!< in: fts plugin parser */
2436 ibool* found) /*!< out: TRUE if phrase found */
2437 {
2438 dberr_t error;
2439 fts_phrase_t phrase(get_doc->index_cache->index->table);
2440
2441 phrase.match = match; /* Positions to match */
2442 phrase.tokens = tokens; /* Tokens to match */
2443 phrase.distance = distance;
2444 phrase.charset = get_doc->index_cache->charset;
2445 phrase.heap = mem_heap_create(512);
2446 phrase.parser = parser;
2447
2448 *found = phrase.found = FALSE;
2449
2450 error = fts_doc_fetch_by_doc_id(
2451 get_doc, match->doc_id, NULL, FTS_FETCH_DOC_BY_ID_EQUAL,
2452 fts_query_fetch_document, &phrase);
2453
2454 if (UNIV_UNLIKELY(error != DB_SUCCESS)) {
2455 ib::error() << "(" << error << ") matching document.";
2456 } else {
2457 *found = phrase.found;
2458 }
2459
2460 mem_heap_free(phrase.heap);
2461
2462 return(error);
2463 }
2464
2465 /*****************************************************************//**
2466 This function fetches the original documents and count the
2467 words in between matching words to see that is in specified distance
2468 @return DB_SUCCESS if all OK */
2469 static MY_ATTRIBUTE((nonnull, warn_unused_result))
2470 bool
fts_query_is_in_proximity_range(const fts_query_t * query,fts_match_t ** match,fts_proximity_t * qualified_pos)2471 fts_query_is_in_proximity_range(
2472 /*============================*/
2473 const fts_query_t* query, /*!< in: query instance */
2474 fts_match_t** match, /*!< in: query instance */
2475 fts_proximity_t* qualified_pos) /*!< in: position info for
2476 qualified ranges */
2477 {
2478 fts_get_doc_t get_doc;
2479 fts_cache_t* cache = query->index->table->fts->cache;
2480 dberr_t err;
2481
2482 memset(&get_doc, 0x0, sizeof(get_doc));
2483
2484 rw_lock_x_lock(&cache->lock);
2485 get_doc.index_cache = fts_find_index_cache(cache, query->index);
2486 rw_lock_x_unlock(&cache->lock);
2487 ut_a(get_doc.index_cache != NULL);
2488
2489 fts_phrase_t phrase(get_doc.index_cache->index->table);
2490
2491 phrase.distance = query->distance;
2492 phrase.charset = get_doc.index_cache->charset;
2493 phrase.heap = mem_heap_create(512);
2494 phrase.proximity_pos = qualified_pos;
2495 phrase.found = FALSE;
2496
2497 err = fts_doc_fetch_by_doc_id(
2498 &get_doc, match[0]->doc_id, NULL, FTS_FETCH_DOC_BY_ID_EQUAL,
2499 fts_query_fetch_document, &phrase);
2500
2501 if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
2502 ib::error() << "(" << err << ") in verification"
2503 " phase of proximity search";
2504 }
2505
2506 /* Free the prepared statement. */
2507 if (get_doc.get_document_graph) {
2508 fts_que_graph_free(get_doc.get_document_graph);
2509 get_doc.get_document_graph = NULL;
2510 }
2511
2512 mem_heap_free(phrase.heap);
2513
2514 return(err == DB_SUCCESS && phrase.found);
2515 }
2516
2517 /*****************************************************************//**
2518 Iterate over the matched document ids and search the for the
2519 actual phrase in the text.
2520 @return DB_SUCCESS if all OK */
2521 static MY_ATTRIBUTE((nonnull, warn_unused_result))
2522 dberr_t
fts_query_search_phrase(fts_query_t * query,ib_vector_t * orig_tokens,ib_vector_t * tokens)2523 fts_query_search_phrase(
2524 /*====================*/
2525 fts_query_t* query, /*!< in: query instance */
2526 ib_vector_t* orig_tokens, /*!< in: tokens to search,
2527 with any stopwords in the
2528 original phrase */
2529 ib_vector_t* tokens) /*!< in: tokens that does
2530 not include stopwords and
2531 can be used to calculate
2532 ranking */
2533 {
2534 ulint i;
2535 fts_get_doc_t get_doc;
2536 ulint n_matched;
2537 fts_cache_t* cache = query->index->table->fts->cache;
2538
2539 n_matched = ib_vector_size(query->matched);
2540
2541 /* Setup the doc retrieval infrastructure. */
2542 memset(&get_doc, 0x0, sizeof(get_doc));
2543
2544 rw_lock_x_lock(&cache->lock);
2545
2546 get_doc.index_cache = fts_find_index_cache(cache, query->index);
2547
2548 /* Must find the index cache */
2549 ut_a(get_doc.index_cache != NULL);
2550
2551 rw_lock_x_unlock(&cache->lock);
2552
2553 #ifdef FTS_INTERNAL_DIAG_PRINT
2554 ib::info() << "Start phrase search";
2555 #endif
2556
2557 /* Read the document from disk and do the actual
2558 match, matching documents will be added to the current
2559 doc id set. */
2560 for (i = 0; i < n_matched && query->error == DB_SUCCESS; ++i) {
2561 fts_match_t* match;
2562 ibool found = FALSE;
2563
2564 match = static_cast<fts_match_t*>(
2565 ib_vector_get(query->matched, i));
2566
2567 /* Skip the document ids that were filtered out by
2568 an earlier pass. */
2569 if (match->doc_id != 0) {
2570
2571 query->error = fts_query_match_document(
2572 orig_tokens, &get_doc, match,
2573 query->distance, query->parser, &found);
2574
2575 if (query->error == DB_SUCCESS && found) {
2576 ulint z;
2577
2578 query->error = fts_query_process_doc_id(query,
2579 match->doc_id, 0);
2580 if (query->error != DB_SUCCESS) {
2581 goto func_exit;
2582 }
2583
2584 for (z = 0; z < ib_vector_size(tokens); z++) {
2585 fts_string_t* token;
2586 token = static_cast<fts_string_t*>(
2587 ib_vector_get(tokens, z));
2588 fts_query_add_word_to_document(
2589 query, match->doc_id, token);
2590 }
2591 }
2592 }
2593 }
2594
2595 func_exit:
2596 /* Free the prepared statement. */
2597 if (get_doc.get_document_graph) {
2598 fts_que_graph_free(get_doc.get_document_graph);
2599 get_doc.get_document_graph = NULL;
2600 }
2601
2602 return(query->error);
2603 }
2604
/** Split the phrase into tokens
Tokens that survive the stopword filter are appended to "tokens";
every token from the first non-stopword onwards is also appended to
"orig_tokens" so that phrase positions can be matched exactly.
@param[in,out]	query		query instance
@param[in]	node		query node to search
@param[in,out]	tokens		token vector
@param[in,out]	orig_tokens	original node tokens include stopword
@param[in,out]	heap		mem heap */
static
void
fts_query_phrase_split(
	fts_query_t*		query,
	const fts_ast_node_t*	node,
	ib_vector_t*		tokens,
	ib_vector_t*		orig_tokens,
	mem_heap_t*		heap)
{
	fts_string_t		phrase;
	ulint			len = 0;
	ulint			cur_pos = 0;
	fts_ast_node_t*		term_node = NULL;

	if (node->type == FTS_AST_TEXT) {
		/* Raw text phrase: tokenize it ourselves below. */
		phrase.f_str = node->text.ptr->str;
		phrase.f_len = node->text.ptr->len;
		len = phrase.f_len;
	} else {
		/* Already tokenized by a plugin parser: walk the
		term list instead. */
		ut_ad(node->type == FTS_AST_PARSER_PHRASE_LIST);
		phrase.f_str = NULL;
		phrase.f_len = 0;
		term_node = node->list.head;
	}

	while (true) {
		fts_cache_t*	cache = query->index->table->fts->cache;
		ulint		cur_len;
		fts_string_t	result_str;

		if (node->type == FTS_AST_TEXT) {
			if (cur_pos >= len) {
				break;
			}

			/* Extract the next token from the raw text. */
			cur_len = innobase_mysql_fts_get_token(
				query->fts_index_table.charset,
				reinterpret_cast<const byte*>(phrase.f_str)
				+ cur_pos,
				reinterpret_cast<const byte*>(phrase.f_str)
				+ len,
				&result_str);

			if (cur_len == 0) {
				break;
			}

			cur_pos += cur_len;
		} else {
			ut_ad(node->type == FTS_AST_PARSER_PHRASE_LIST);
			/* Term node in parser phrase list */
			if (term_node == NULL) {
				break;
			}

			ut_a(term_node->type == FTS_AST_TERM);
			result_str.f_str = term_node->term.ptr->str;
			result_str.f_len = term_node->term.ptr->len;
			result_str.f_n_char = fts_get_token_size(
				query->fts_index_table.charset,
				reinterpret_cast<char*>(result_str.f_str),
				result_str.f_len);

			term_node = term_node->next;
		}

		/* Skip zero-character tokens entirely. */
		if (result_str.f_n_char == 0) {
			continue;
		}

		fts_string_t*	token = static_cast<fts_string_t*>(
			ib_vector_push(tokens, NULL));
		fts_string_dup(token, &result_str, heap);

		if (fts_check_token(
			&result_str,
			cache->stopword_info.cached_stopword,
			query->fts_index_table.charset)) {
			/* Add the word to the RB tree so that we can
			calculate it's frequencey within a document. */
			fts_query_add_word_freq(query, token);
		} else {
			/* Stopword: remove it again from "tokens". */
			ib_vector_pop(tokens);
		}

		/* we will start to store all words including stopwords
		in the "orig_tokens" vector, but skip any leading words
		that are stopwords */
		if (!ib_vector_is_empty(tokens)) {
			fts_string_t*	orig_token = static_cast<fts_string_t*>(
				ib_vector_push(orig_tokens, NULL));

			orig_token->f_str = token->f_str;
			orig_token->f_len = token->f_len;
		}
	}
}
2708
2709 /*****************************************************************//**
2710 Text/Phrase search.
2711 @return DB_SUCCESS or error code */
2712 static MY_ATTRIBUTE((warn_unused_result))
2713 dberr_t
fts_query_phrase_search(fts_query_t * query,const fts_ast_node_t * node)2714 fts_query_phrase_search(
2715 /*====================*/
2716 fts_query_t* query, /*!< in: query instance */
2717 const fts_ast_node_t* node) /*!< in: node to search */
2718 {
2719 ib_vector_t* tokens;
2720 ib_vector_t* orig_tokens;
2721 mem_heap_t* heap = mem_heap_create(sizeof(fts_string_t));
2722 ib_alloc_t* heap_alloc;
2723 ulint num_token;
2724
2725 heap_alloc = ib_heap_allocator_create(heap);
2726
2727 tokens = ib_vector_create(heap_alloc, sizeof(fts_string_t), 4);
2728 orig_tokens = ib_vector_create(heap_alloc, sizeof(fts_string_t), 4);
2729
2730 if (query->distance != ULINT_UNDEFINED && query->distance > 0) {
2731 query->flags = FTS_PROXIMITY;
2732 } else {
2733 query->flags = FTS_PHRASE;
2734 }
2735
2736 /* Split the phrase into tokens. */
2737 fts_query_phrase_split(query, node, tokens, orig_tokens, heap);
2738
2739 num_token = ib_vector_size(tokens);
2740 if (num_token > MAX_PROXIMITY_ITEM) {
2741 query->error = DB_FTS_TOO_MANY_WORDS_IN_PHRASE;
2742 goto func_exit;
2743 }
2744
2745 ut_ad(ib_vector_size(orig_tokens) >= num_token);
2746
2747 /* Ignore empty strings. */
2748 if (num_token > 0) {
2749 fts_string_t* token = NULL;
2750 fts_fetch_t fetch;
2751 trx_t* trx = query->trx;
2752 fts_ast_oper_t oper = query->oper;
2753 que_t* graph = NULL;
2754 ulint i;
2755 dberr_t error;
2756
2757 /* Create the vector for storing matching document ids
2758 and the positions of the first token of the phrase. */
2759 if (!query->matched) {
2760 ib_alloc_t* heap_alloc;
2761
2762 heap_alloc = ib_heap_allocator_create(heap);
2763
2764 if (!(query->flags & FTS_PROXIMITY)
2765 && !(query->flags & FTS_PHRASE)) {
2766 query->matched = ib_vector_create(
2767 heap_alloc, sizeof(fts_match_t),
2768 64);
2769 } else {
2770 ut_a(num_token <= MAX_PROXIMITY_ITEM);
2771 query->match_array =
2772 (ib_vector_t**) mem_heap_alloc(
2773 heap,
2774 num_token *
2775 sizeof(query->matched));
2776
2777 for (i = 0; i < num_token; i++) {
2778 query->match_array[i] =
2779 ib_vector_create(
2780 heap_alloc, sizeof(fts_match_t),
2781 64);
2782 }
2783
2784 query->matched = query->match_array[0];
2785 }
2786 }
2787
2788 /* Setup the callback args for filtering and consolidating
2789 the ilist. */
2790 fetch.read_arg = query;
2791 fetch.read_record = fts_query_index_fetch_nodes;
2792
2793 for (i = 0; i < num_token; i++) {
2794 /* Search for the first word from the phrase. */
2795 token = static_cast<fts_string_t*>(
2796 ib_vector_get(tokens, i));
2797
2798 if (query->flags & FTS_PROXIMITY
2799 || query->flags & FTS_PHRASE) {
2800 query->matched = query->match_array[i];
2801 }
2802
2803 error = fts_index_fetch_nodes(
2804 trx, &graph, &query->fts_index_table,
2805 token, &fetch);
2806
2807 /* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */
2808 ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS));
2809 if (error != DB_SUCCESS) {
2810 query->error = error;
2811 }
2812
2813 fts_que_graph_free(graph);
2814 graph = NULL;
2815
2816 fts_query_cache(query, token);
2817
2818 if (!(query->flags & FTS_PHRASE)
2819 && !(query->flags & FTS_PROXIMITY)) {
2820 break;
2821 }
2822
2823 /* If any of the token can't be found,
2824 no need to continue match */
2825 if (ib_vector_is_empty(query->match_array[i])
2826 || query->error != DB_SUCCESS) {
2827 goto func_exit;
2828 }
2829 }
2830
2831 /* Just a single word, no need to fetch the original
2832 documents to do phrase matching */
2833 if (ib_vector_size(orig_tokens) == 1
2834 && !ib_vector_is_empty(query->match_array[0])) {
2835 fts_match_t* match;
2836 ulint n_matched;
2837
2838 n_matched = ib_vector_size(query->match_array[0]);
2839
2840 for (i = 0; i < n_matched; i++) {
2841 match = static_cast<fts_match_t*>(
2842 ib_vector_get(
2843 query->match_array[0], i));
2844
2845 query->error = fts_query_process_doc_id(
2846 query, match->doc_id, 0);
2847 if (query->error != DB_SUCCESS) {
2848 goto func_exit;
2849 }
2850
2851 fts_query_add_word_to_document(
2852 query, match->doc_id, token);
2853 }
2854 query->oper = oper;
2855 goto func_exit;
2856 }
2857
2858 /* If we are doing proximity search, verify the distance
2859 between all words, and check they are in specified distance. */
2860 if (query->flags & FTS_PROXIMITY) {
2861 fts_phrase_or_proximity_search(query, tokens);
2862 } else {
2863 ibool matched;
2864
2865 /* Phrase Search case:
2866 We filter out the doc ids that don't contain
2867 all the tokens in the phrase. It's cheaper to
2868 search the ilist than bringing the documents in
2869 and then doing a search through the text. Isolated
2870 testing shows this also helps in mitigating disruption
2871 of the buffer cache. */
2872 matched = fts_phrase_or_proximity_search(query, tokens);
2873 query->matched = query->match_array[0];
2874
2875 /* Read the actual text in and search for the phrase. */
2876 if (matched) {
2877 ut_ad(query->error == DB_SUCCESS);
2878 query->error = fts_query_search_phrase(
2879 query, orig_tokens, tokens);
2880 }
2881 }
2882
2883 /* Restore original operation. */
2884 query->oper = oper;
2885
2886 if (query->error != DB_SUCCESS) {
2887 goto func_exit;
2888 }
2889 }
2890
2891 func_exit:
2892 mem_heap_free(heap);
2893
2894 /* Don't need it anymore. */
2895 query->matched = NULL;
2896
2897 return(query->error);
2898 }
2899
2900 /*****************************************************************//**
2901 Find the word and evaluate.
2902 @return DB_SUCCESS if all go well */
2903 static MY_ATTRIBUTE((nonnull, warn_unused_result))
2904 dberr_t
fts_query_execute(fts_query_t * query,fts_string_t * token)2905 fts_query_execute(
2906 /*==============*/
2907 fts_query_t* query, /*!< in: query instance */
2908 fts_string_t* token) /*!< in: token to search */
2909 {
2910 switch (query->oper) {
2911 case FTS_NONE:
2912 case FTS_NEGATE:
2913 case FTS_INCR_RATING:
2914 case FTS_DECR_RATING:
2915 query->error = fts_query_union(query, token);
2916 break;
2917
2918 case FTS_EXIST:
2919 query->error = fts_query_intersect(query, token);
2920 break;
2921
2922 case FTS_IGNORE:
2923 query->error = fts_query_difference(query, token);
2924 break;
2925
2926 default:
2927 ut_error;
2928 }
2929
2930 return(query->error);
2931 }
2932
2933 /*****************************************************************//**
2934 Create a wildcard string. It's the responsibility of the caller to
2935 free the byte* pointer. It's allocated using ut_malloc_nokey().
2936 @return ptr to allocated memory */
2937 static
2938 byte*
fts_query_get_token(fts_ast_node_t * node,fts_string_t * token)2939 fts_query_get_token(
2940 /*================*/
2941 fts_ast_node_t* node, /*!< in: the current sub tree */
2942 fts_string_t* token) /*!< in: token to create */
2943 {
2944 ulint str_len;
2945 byte* new_ptr = NULL;
2946
2947 str_len = node->term.ptr->len;
2948
2949 ut_a(node->type == FTS_AST_TERM);
2950
2951 token->f_len = str_len;
2952 token->f_str = node->term.ptr->str;
2953
2954 if (node->term.wildcard) {
2955
2956 token->f_str = static_cast<byte*>(ut_malloc_nokey(str_len + 2));
2957 token->f_len = str_len + 1;
2958
2959 memcpy(token->f_str, node->term.ptr->str, str_len);
2960
2961 token->f_str[str_len] = '%';
2962 token->f_str[token->f_len] = 0;
2963
2964 new_ptr = token->f_str;
2965 }
2966
2967 return(new_ptr);
2968 }
2969
2970 static dberr_t fts_ast_visit_sub_exp(fts_ast_node_t*, fts_ast_callback, void*);
2971
2972 /*****************************************************************//**
2973 Visit every node of the AST. */
2974 static
2975 dberr_t
fts_query_visitor(fts_ast_oper_t oper,fts_ast_node_t * node,void * arg)2976 fts_query_visitor(
2977 /*==============*/
2978 fts_ast_oper_t oper, /*!< in: current operator */
2979 fts_ast_node_t* node, /*!< in: The root of the current subtree*/
2980 void* arg) /*!< in: callback arg*/
2981 {
2982 byte* ptr;
2983 fts_string_t token;
2984 fts_query_t* query = static_cast<fts_query_t*>(arg);
2985
2986 ut_a(node);
2987 DBUG_ENTER("fts_query_visitor");
2988 DBUG_PRINT("fts", ("nodetype: %s", fts_ast_node_type_get(node->type)));
2989
2990 token.f_n_char = 0;
2991 query->oper = oper;
2992 query->cur_node = node;
2993
2994 switch (node->type) {
2995 case FTS_AST_TEXT:
2996 case FTS_AST_PARSER_PHRASE_LIST:
2997
2998 if (query->oper == FTS_EXIST) {
2999 ut_ad(query->intersection == NULL);
3000 query->intersection = rbt_create(
3001 sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
3002
3003 query->total_size += SIZEOF_RBT_CREATE;
3004 }
3005
3006 /* Set the current proximity distance. */
3007 query->distance = node->text.distance;
3008
3009 /* Force collection of doc ids and the positions. */
3010 query->collect_positions = TRUE;
3011
3012 query->error = fts_query_phrase_search(query, node);
3013
3014 query->collect_positions = FALSE;
3015
3016 if (query->oper == FTS_EXIST) {
3017 fts_query_free_doc_ids(query, query->doc_ids);
3018 query->doc_ids = query->intersection;
3019 query->intersection = NULL;
3020 }
3021
3022 break;
3023
3024 case FTS_AST_TERM:
3025 token.f_str = node->term.ptr->str;
3026 token.f_len = node->term.ptr->len;
3027
3028 /* Collect wildcard words for QUERY EXPANSION. */
3029 if (node->term.wildcard && query->wildcard_words != NULL) {
3030 ib_rbt_bound_t parent;
3031
3032 if (rbt_search(query->wildcard_words, &parent, &token)
3033 != 0) {
3034 fts_string_t word;
3035
3036 fts_string_dup(&word, &token, query->heap);
3037 rbt_add_node(query->wildcard_words, &parent,
3038 &word);
3039 }
3040 }
3041
3042 /* Add the word to our RB tree that will be used to
3043 calculate this terms per document frequency. */
3044 fts_query_add_word_freq(query, &token);
3045
3046 ptr = fts_query_get_token(node, &token);
3047 query->error = fts_query_execute(query, &token);
3048
3049 if (ptr) {
3050 ut_free(ptr);
3051 }
3052
3053 break;
3054
3055 case FTS_AST_SUBEXP_LIST:
3056 query->error = fts_ast_visit_sub_exp(node, fts_query_visitor, arg);
3057 break;
3058
3059 default:
3060 ut_error;
3061 }
3062
3063 if (query->oper == FTS_EXIST) {
3064 query->multi_exist = true;
3065 }
3066
3067 DBUG_RETURN(query->error);
3068 }
3069
/** Process (nested) sub-expression, create a new result set to store the
sub-expression result by processing nodes under current sub-expression
list. Merge the sub-expression result with that of parent expression list.
@param[in,out]	node	current root node
@param[in,out]	visitor	callback function
@param[in,out]	arg	argument for callback
@return DB_SUCCESS if all go well */
static
dberr_t
fts_ast_visit_sub_exp(
	fts_ast_node_t*		node,
	fts_ast_callback	visitor,
	void*			arg)
{
	fts_ast_oper_t		cur_oper;
	fts_query_t*		query = static_cast<fts_query_t*>(arg);
	ib_rbt_t*		parent_doc_ids;
	ib_rbt_t*		subexpr_doc_ids;
	dberr_t			error = DB_SUCCESS;
	bool			will_be_ignored = false;
	bool			multi_exist;

	DBUG_ENTER("fts_ast_visit_sub_exp");

	ut_a(node->type == FTS_AST_SUBEXP_LIST);

	/* To avoid stack overflow, we limit the mutual recursion
	depth between fts_ast_visit(), fts_query_visitor() and
	fts_ast_visit_sub_exp(). */
	if (query->visiting_sub_exp++ > 31) {
		query->error = DB_OUT_OF_MEMORY;
		DBUG_RETURN(query->error);
	}

	cur_oper = query->oper;

	/* Save current result set */
	parent_doc_ids = query->doc_ids;

	/* Create new result set to store the sub-expression result. We
	will merge this result set with the parent after processing. */
	query->doc_ids = rbt_create(sizeof(fts_ranking_t),
				    fts_ranking_doc_id_cmp);

	query->total_size += SIZEOF_RBT_CREATE;

	/* The sub-expression starts with a clean multi_exist state;
	the parent's state is restored below. */
	multi_exist = query->multi_exist;
	query->multi_exist = false;
	/* Process nodes in current sub-expression and store its
	result set in query->doc_ids we created above. */
	error = fts_ast_visit(FTS_NONE, node, visitor,
			      arg, &will_be_ignored);

	/* Reinstate parent node state */
	query->multi_exist = multi_exist;
	query->oper = cur_oper;
	query->visiting_sub_exp--;

	/* Merge the sub-expression result with the parent result set. */
	subexpr_doc_ids = query->doc_ids;
	query->doc_ids = parent_doc_ids;
	if (error == DB_SUCCESS) {
		error = fts_merge_doc_ids(query, subexpr_doc_ids);
	}

	/* Free current result set. Result already merged into parent. */
	fts_query_free_doc_ids(query, subexpr_doc_ids);

	DBUG_RETURN(error);
}
3140
3141 #if 0
3142 /*****************************************************************//***
3143 Check if the doc id exists in the ilist.
3144 @return TRUE if doc id found */
3145 static
3146 ulint
3147 fts_query_find_doc_id(
3148 /*==================*/
3149 fts_select_t* select, /*!< in/out: contains the doc id to
3150 find, we update the word freq if
3151 document found */
3152 void* data, /*!< in: doc id ilist */
3153 ulint len) /*!< in: doc id ilist size */
3154 {
3155 byte* ptr = data;
3156 doc_id_t doc_id = 0;
3157 ulint decoded = 0;
3158
3159 /* Decode the ilist and search for selected doc_id. We also
3160 calculate the frequency of the word in the document if found. */
3161 while (decoded < len && !select->found) {
3162 ulint freq = 0;
3163 ulint min_pos = 0;
3164 ulint last_pos = 0;
3165 ulint pos = fts_decode_vlc(&ptr);
3166
3167 /* Add the delta. */
3168 doc_id += pos;
3169
3170 while (*ptr) {
3171 ++freq;
3172 last_pos += fts_decode_vlc(&ptr);
3173
3174 /* Only if min_pos is not set and the current
3175 term exists in a position greater than the
3176 min_pos of the previous term. */
3177 if (min_pos == 0 && last_pos > select->min_pos) {
3178 min_pos = last_pos;
3179 }
3180 }
3181
3182 /* Skip the end of word position marker. */
3183 ++ptr;
3184
3185 /* Bytes decoded so far. */
3186 decoded = ptr - (byte*) data;
3187
3188 /* A word may exist in the document but we only consider a
3189 match if it exists in a position that is greater than the
3190 position of the previous term. */
3191 if (doc_id == select->doc_id && min_pos > 0) {
3192 fts_doc_freq_t* doc_freq;
3193
3194 /* Add the doc id to the doc freq rb tree, if
3195 the doc id doesn't exist it will be created. */
3196 doc_freq = fts_query_add_doc_freq(
3197 select->word_freq->doc_freqs, doc_id);
3198
3199 /* Avoid duplicating the frequency tally */
3200 if (doc_freq->freq == 0) {
3201 doc_freq->freq = freq;
3202 }
3203
3204 select->found = TRUE;
3205 select->min_pos = min_pos;
3206 }
3207 }
3208
3209 return(select->found);
3210 }
3211 #endif
3212
3213 /*****************************************************************//**
3214 Read and filter nodes.
3215 @return DB_SUCCESS if all go well,
3216 or return DB_FTS_EXCEED_RESULT_CACHE_LIMIT */
3217 static
3218 dberr_t
fts_query_filter_doc_ids(fts_query_t * query,const fts_string_t * word,fts_word_freq_t * word_freq,const fts_node_t * node,void * data,ulint len,ibool calc_doc_count)3219 fts_query_filter_doc_ids(
3220 /*=====================*/
3221 fts_query_t* query, /*!< in: query instance */
3222 const fts_string_t* word, /*!< in: the current word */
3223 fts_word_freq_t* word_freq, /*!< in/out: word frequency */
3224 const fts_node_t* node, /*!< in: current FTS node */
3225 void* data, /*!< in: doc id ilist */
3226 ulint len, /*!< in: doc id ilist size */
3227 ibool calc_doc_count) /*!< in: whether to remember doc count */
3228 {
3229 const byte* ptr = static_cast<byte*>(data);
3230 doc_id_t doc_id = 0;
3231 ulint decoded = 0;
3232 ib_rbt_t* doc_freqs = word_freq->doc_freqs;
3233
3234 /* Decode the ilist and add the doc ids to the query doc_id set. */
3235 while (decoded < len) {
3236 ulint freq = 0;
3237 fts_doc_freq_t* doc_freq;
3238 fts_match_t* match = NULL;
3239 doc_id_t last_pos = 0;
3240 doc_id_t pos = fts_decode_vlc(&ptr);
3241
3242 /* Some sanity checks. */
3243 if (doc_id == 0) {
3244 ut_a(pos == node->first_doc_id);
3245 }
3246
3247 /* Add the delta. */
3248 doc_id += pos;
3249
3250 if (calc_doc_count) {
3251 word_freq->doc_count++;
3252 }
3253
3254 /* We simply collect the matching instances here. */
3255 if (query->collect_positions) {
3256 ib_alloc_t* heap_alloc;
3257
3258 /* Create a new fts_match_t instance. */
3259 match = static_cast<fts_match_t*>(
3260 ib_vector_push(query->matched, NULL));
3261
3262 match->start = 0;
3263 match->doc_id = doc_id;
3264 heap_alloc = ib_vector_allocator(query->matched);
3265
3266 /* Allocate from the same heap as the
3267 parent container. */
3268 match->positions = ib_vector_create(
3269 heap_alloc, sizeof(ulint), 64);
3270
3271 query->total_size += sizeof(fts_match_t)
3272 + sizeof(ib_vector_t)
3273 + sizeof(ulint) * 64;
3274 }
3275
3276 /* Unpack the positions within the document. */
3277 while (*ptr) {
3278 last_pos += fts_decode_vlc(&ptr);
3279
3280 /* Collect the matching word positions, for phrase
3281 matching later. */
3282 if (query->collect_positions) {
3283 ib_vector_push(match->positions, &last_pos);
3284 }
3285
3286 ++freq;
3287 }
3288
3289 /* End of list marker. */
3290 last_pos = (ulint) -1;
3291
3292 if (query->collect_positions) {
3293 ut_a(match != NULL);
3294 ib_vector_push(match->positions, &last_pos);
3295 }
3296
3297 /* Add the doc id to the doc freq rb tree, if the doc id
3298 doesn't exist it will be created. */
3299 doc_freq = fts_query_add_doc_freq(query, doc_freqs, doc_id);
3300
3301 /* Avoid duplicating frequency tally. */
3302 if (doc_freq->freq == 0) {
3303 doc_freq->freq = freq;
3304 }
3305
3306 /* Skip the end of word position marker. */
3307 ++ptr;
3308
3309 /* Bytes decoded so far */
3310 decoded = ulint(ptr - (byte*) data);
3311
3312 /* We simply collect the matching documents and the
3313 positions here and match later. */
3314 if (!query->collect_positions) {
3315 /* We ignore error here and will check it later */
3316 fts_query_process_doc_id(query, doc_id, 0);
3317
3318 /* Add the word to the document's matched RB tree. */
3319 fts_query_add_word_to_document(query, doc_id, word);
3320 }
3321 }
3322
3323 /* Some sanity checks. */
3324 ut_a(doc_id == node->last_doc_id);
3325
3326 if (query->total_size > fts_result_cache_limit) {
3327 return(DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
3328 } else {
3329 return(DB_SUCCESS);
3330 }
3331 }
3332
3333 /*****************************************************************//**
3334 Read the FTS INDEX row.
3335 @return DB_SUCCESS if all go well. */
3336 static
3337 dberr_t
fts_query_read_node(fts_query_t * query,const fts_string_t * word,que_node_t * exp)3338 fts_query_read_node(
3339 /*================*/
3340 fts_query_t* query, /*!< in: query instance */
3341 const fts_string_t* word, /*!< in: current word */
3342 que_node_t* exp) /*!< in: query graph node */
3343 {
3344 int i;
3345 int ret;
3346 fts_node_t node;
3347 ib_rbt_bound_t parent;
3348 fts_word_freq_t* word_freq;
3349 ibool skip = FALSE;
3350 fts_string_t term;
3351 byte buf[FTS_MAX_WORD_LEN + 1];
3352 dberr_t error = DB_SUCCESS;
3353
3354 ut_a(query->cur_node->type == FTS_AST_TERM
3355 || query->cur_node->type == FTS_AST_TEXT
3356 || query->cur_node->type == FTS_AST_PARSER_PHRASE_LIST);
3357
3358 memset(&node, 0, sizeof(node));
3359 term.f_str = buf;
3360
3361 /* Need to consider the wildcard search case, the word frequency
3362 is created on the search string not the actual word. So we need
3363 to assign the frequency on search string behalf. */
3364 if (query->cur_node->type == FTS_AST_TERM
3365 && query->cur_node->term.wildcard) {
3366
3367 term.f_len = query->cur_node->term.ptr->len;
3368 ut_ad(FTS_MAX_WORD_LEN >= term.f_len);
3369 memcpy(term.f_str, query->cur_node->term.ptr->str, term.f_len);
3370 } else {
3371 term.f_len = word->f_len;
3372 ut_ad(FTS_MAX_WORD_LEN >= word->f_len);
3373 memcpy(term.f_str, word->f_str, word->f_len);
3374 }
3375
3376 /* Lookup the word in our rb tree, it must exist. */
3377 ret = rbt_search(query->word_freqs, &parent, &term);
3378
3379 ut_a(ret == 0);
3380
3381 word_freq = rbt_value(fts_word_freq_t, parent.last);
3382
3383 /* Start from 1 since the first column has been read by the caller.
3384 Also, we rely on the order of the columns projected, to filter
3385 out ilists that are out of range and we always want to read
3386 the doc_count irrespective of the suitablility of the row. */
3387
3388 for (i = 1; exp && !skip; exp = que_node_get_next(exp), ++i) {
3389
3390 dfield_t* dfield = que_node_get_val(exp);
3391 byte* data = static_cast<byte*>(
3392 dfield_get_data(dfield));
3393 ulint len = dfield_get_len(dfield);
3394
3395 ut_a(len != UNIV_SQL_NULL);
3396
3397 /* Note: The column numbers below must match the SELECT. */
3398
3399 switch (i) {
3400 case 1: /* DOC_COUNT */
3401 word_freq->doc_count += mach_read_from_4(data);
3402 break;
3403
3404 case 2: /* FIRST_DOC_ID */
3405 node.first_doc_id = fts_read_doc_id(data);
3406
3407 /* Skip nodes whose doc ids are out range. */
3408 if (query->oper == FTS_EXIST
3409 && query->upper_doc_id > 0
3410 && node.first_doc_id > query->upper_doc_id) {
3411 skip = TRUE;
3412 }
3413 break;
3414
3415 case 3: /* LAST_DOC_ID */
3416 node.last_doc_id = fts_read_doc_id(data);
3417
3418 /* Skip nodes whose doc ids are out range. */
3419 if (query->oper == FTS_EXIST
3420 && query->lower_doc_id > 0
3421 && node.last_doc_id < query->lower_doc_id) {
3422 skip = TRUE;
3423 }
3424 break;
3425
3426 case 4: /* ILIST */
3427
3428 error = fts_query_filter_doc_ids(
3429 query, &word_freq->word, word_freq,
3430 &node, data, len, FALSE);
3431
3432 break;
3433
3434 default:
3435 ut_error;
3436 }
3437 }
3438
3439 if (!skip) {
3440 /* Make sure all columns were read. */
3441
3442 ut_a(i == 5);
3443 }
3444
3445 return error;
3446 }
3447
3448 /*****************************************************************//**
3449 Callback function to fetch the rows in an FTS INDEX record.
3450 @return always returns TRUE */
3451 static
3452 ibool
fts_query_index_fetch_nodes(void * row,void * user_arg)3453 fts_query_index_fetch_nodes(
3454 /*========================*/
3455 void* row, /*!< in: sel_node_t* */
3456 void* user_arg) /*!< in: pointer to fts_fetch_t */
3457 {
3458 fts_string_t key;
3459 sel_node_t* sel_node = static_cast<sel_node_t*>(row);
3460 fts_fetch_t* fetch = static_cast<fts_fetch_t*>(user_arg);
3461 fts_query_t* query = static_cast<fts_query_t*>(fetch->read_arg);
3462 que_node_t* exp = sel_node->select_list;
3463 dfield_t* dfield = que_node_get_val(exp);
3464 void* data = dfield_get_data(dfield);
3465 ulint dfield_len = dfield_get_len(dfield);
3466
3467 key.f_str = static_cast<byte*>(data);
3468 key.f_len = dfield_len;
3469
3470 ut_a(dfield_len <= FTS_MAX_WORD_LEN);
3471
3472 /* Note: we pass error out by 'query->error' */
3473 query->error = fts_query_read_node(query, &key, que_node_get_next(exp));
3474
3475 if (query->error != DB_SUCCESS) {
3476 ut_ad(query->error == DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
3477 return(FALSE);
3478 } else {
3479 return(TRUE);
3480 }
3481 }
3482
3483 /*****************************************************************//**
3484 Calculate the inverse document frequency (IDF) for all the terms. */
3485 static
3486 void
fts_query_calculate_idf(fts_query_t * query)3487 fts_query_calculate_idf(
3488 /*====================*/
3489 fts_query_t* query) /*!< in: Query state */
3490 {
3491 const ib_rbt_node_t* node;
3492 ib_uint64_t total_docs = query->total_docs;
3493
3494 /* We need to free any instances of fts_doc_freq_t that we
3495 may have allocated. */
3496 for (node = rbt_first(query->word_freqs);
3497 node;
3498 node = rbt_next(query->word_freqs, node)) {
3499
3500 fts_word_freq_t* word_freq;
3501
3502 word_freq = rbt_value(fts_word_freq_t, node);
3503
3504 if (word_freq->doc_count > 0) {
3505 if (total_docs == word_freq->doc_count) {
3506 /* QP assume ranking > 0 if we find
3507 a match. Since Log10(1) = 0, we cannot
3508 make IDF a zero value if do find a
3509 word in all documents. So let's make
3510 it an arbitrary very small number */
3511 word_freq->idf = log10(1.0001);
3512 } else {
3513 word_freq->idf = log10(
3514 static_cast<double>(total_docs)
3515 / static_cast<double>(
3516 word_freq->doc_count));
3517 }
3518 }
3519 }
3520 }
3521
3522 /*****************************************************************//**
3523 Calculate the ranking of the document. */
3524 static
3525 void
fts_query_calculate_ranking(const fts_query_t * query,fts_ranking_t * ranking)3526 fts_query_calculate_ranking(
3527 /*========================*/
3528 const fts_query_t* query, /*!< in: query state */
3529 fts_ranking_t* ranking) /*!< in: Document to rank */
3530 {
3531 ulint pos = 0;
3532 fts_string_t word;
3533
3534 /* At this stage, ranking->rank should not exceed the 1.0
3535 bound */
3536 ut_ad(ranking->rank <= 1.0 && ranking->rank >= -1.0);
3537 ut_ad(rbt_size(query->word_map) == query->word_vector->size());
3538
3539 while (fts_ranking_words_get_next(query, ranking, &pos, &word)) {
3540 int ret;
3541 ib_rbt_bound_t parent;
3542 double weight;
3543 fts_doc_freq_t* doc_freq;
3544 fts_word_freq_t* word_freq;
3545
3546 ret = rbt_search(query->word_freqs, &parent, &word);
3547
3548 /* It must exist. */
3549 ut_a(ret == 0);
3550
3551 word_freq = rbt_value(fts_word_freq_t, parent.last);
3552
3553 ret = rbt_search(
3554 word_freq->doc_freqs, &parent, &ranking->doc_id);
3555
3556 /* It must exist. */
3557 ut_a(ret == 0);
3558
3559 doc_freq = rbt_value(fts_doc_freq_t, parent.last);
3560
3561 weight = (double) doc_freq->freq * word_freq->idf;
3562
3563 ranking->rank += (fts_rank_t) (weight * word_freq->idf);
3564 }
3565 }
3566
3567 /*****************************************************************//**
3568 Add ranking to the result set. */
3569 static
3570 void
fts_query_add_ranking(fts_query_t * query,ib_rbt_t * ranking_tree,const fts_ranking_t * new_ranking)3571 fts_query_add_ranking(
3572 /*==================*/
3573 fts_query_t* query, /*!< in: query state */
3574 ib_rbt_t* ranking_tree, /*!< in: ranking tree */
3575 const fts_ranking_t* new_ranking) /*!< in: ranking of a document */
3576 {
3577 ib_rbt_bound_t parent;
3578
3579 /* Lookup the ranking in our rb tree and add if it doesn't exist. */
3580 if (rbt_search(ranking_tree, &parent, new_ranking) == 0) {
3581 fts_ranking_t* ranking;
3582
3583 ranking = rbt_value(fts_ranking_t, parent.last);
3584
3585 ranking->rank += new_ranking->rank;
3586
3587 ut_a(ranking->words == NULL);
3588 } else {
3589 rbt_add_node(ranking_tree, &parent, new_ranking);
3590
3591 query->total_size += SIZEOF_RBT_NODE_ADD
3592 + sizeof(fts_ranking_t);
3593 }
3594 }
3595
3596 /*****************************************************************//**
3597 Retrieve the FTS Relevance Ranking result for doc with doc_id
3598 @return the relevance ranking value, 0 if no ranking value
3599 present. */
3600 float
fts_retrieve_ranking(fts_result_t * result,doc_id_t doc_id)3601 fts_retrieve_ranking(
3602 /*=================*/
3603 fts_result_t* result, /*!< in: FTS result structure */
3604 doc_id_t doc_id) /*!< in: doc_id of the item to retrieve */
3605 {
3606 ib_rbt_bound_t parent;
3607 fts_ranking_t new_ranking;
3608
3609 DBUG_ENTER("fts_retrieve_ranking");
3610
3611 if (!result || !result->rankings_by_id) {
3612 DBUG_RETURN(0);
3613 }
3614
3615 new_ranking.doc_id = doc_id;
3616
3617 /* Lookup the ranking in our rb tree */
3618 if (rbt_search(result->rankings_by_id, &parent, &new_ranking) == 0) {
3619 fts_ranking_t* ranking;
3620
3621 ranking = rbt_value(fts_ranking_t, parent.last);
3622
3623 DBUG_RETURN(ranking->rank);
3624 }
3625
3626 DBUG_RETURN(0);
3627 }
3628
3629 /*****************************************************************//**
3630 Create the result and copy the data to it. */
3631 static
3632 fts_result_t*
fts_query_prepare_result(fts_query_t * query,fts_result_t * result)3633 fts_query_prepare_result(
3634 /*=====================*/
3635 fts_query_t* query, /*!< in: Query state */
3636 fts_result_t* result) /*!< in: result this can contain
3637 data from a previous search on
3638 another FTS index */
3639 {
3640 const ib_rbt_node_t* node;
3641 bool result_is_null = false;
3642
3643 DBUG_ENTER("fts_query_prepare_result");
3644
3645 if (result == NULL) {
3646 result = static_cast<fts_result_t*>(
3647 ut_zalloc_nokey(sizeof(*result)));
3648
3649 result->rankings_by_id = rbt_create(
3650 sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
3651
3652 query->total_size += sizeof(fts_result_t) + SIZEOF_RBT_CREATE;
3653 result_is_null = true;
3654 }
3655
3656 if (query->flags == FTS_OPT_RANKING) {
3657 fts_word_freq_t* word_freq;
3658 ulint size = ib_vector_size(query->deleted->doc_ids);
3659 doc_id_t* updates =
3660 (doc_id_t*) query->deleted->doc_ids->data;
3661
3662 node = rbt_first(query->word_freqs);
3663 ut_ad(node);
3664 word_freq = rbt_value(fts_word_freq_t, node);
3665
3666 for (node = rbt_first(word_freq->doc_freqs);
3667 node;
3668 node = rbt_next(word_freq->doc_freqs, node)) {
3669 fts_doc_freq_t* doc_freq;
3670 fts_ranking_t ranking;
3671
3672 doc_freq = rbt_value(fts_doc_freq_t, node);
3673
3674 /* Don't put deleted docs into result */
3675 if (fts_bsearch(updates, 0, static_cast<int>(size),
3676 doc_freq->doc_id) >= 0) {
3677 /* one less matching doc count */
3678 --word_freq->doc_count;
3679 continue;
3680 }
3681
3682 ranking.doc_id = doc_freq->doc_id;
3683 ranking.rank = static_cast<fts_rank_t>(doc_freq->freq);
3684 ranking.words = NULL;
3685
3686 fts_query_add_ranking(query, result->rankings_by_id,
3687 &ranking);
3688
3689 if (query->total_size > fts_result_cache_limit) {
3690 query->error = DB_FTS_EXCEED_RESULT_CACHE_LIMIT;
3691 fts_query_free_result(result);
3692 DBUG_RETURN(NULL);
3693 }
3694 }
3695
3696 /* Calculate IDF only after we exclude the deleted items */
3697 fts_query_calculate_idf(query);
3698
3699 node = rbt_first(query->word_freqs);
3700 word_freq = rbt_value(fts_word_freq_t, node);
3701
3702 /* Calculate the ranking for each doc */
3703 for (node = rbt_first(result->rankings_by_id);
3704 node != NULL;
3705 node = rbt_next(result->rankings_by_id, node)) {
3706
3707 fts_ranking_t* ranking;
3708
3709 ranking = rbt_value(fts_ranking_t, node);
3710
3711 ranking->rank = static_cast<fts_rank_t>(
3712 ranking->rank * word_freq->idf * word_freq->idf);
3713 }
3714
3715 DBUG_RETURN(result);
3716 }
3717
3718 ut_a(rbt_size(query->doc_ids) > 0);
3719
3720 for (node = rbt_first(query->doc_ids);
3721 node;
3722 node = rbt_next(query->doc_ids, node)) {
3723
3724 fts_ranking_t* ranking;
3725
3726 ranking = rbt_value(fts_ranking_t, node);
3727 fts_query_calculate_ranking(query, ranking);
3728
3729 // FIXME: I think we may requre this information to improve the
3730 // ranking of doc ids which have more word matches from
3731 // different FTS indexes.
3732
3733 /* We don't need these anymore free the resources. */
3734 ranking->words = NULL;
3735
3736 if (!result_is_null) {
3737 fts_query_add_ranking(query, result->rankings_by_id, ranking);
3738
3739 if (query->total_size > fts_result_cache_limit) {
3740 query->error = DB_FTS_EXCEED_RESULT_CACHE_LIMIT;
3741 fts_query_free_result(result);
3742 DBUG_RETURN(NULL);
3743 }
3744 }
3745 }
3746
3747 if (result_is_null) {
3748 /* Use doc_ids directly */
3749 rbt_free(result->rankings_by_id);
3750 result->rankings_by_id = query->doc_ids;
3751 query->doc_ids = NULL;
3752 }
3753
3754 DBUG_RETURN(result);
3755 }
3756
3757 /*****************************************************************//**
3758 Get the result of the query. Calculate the similarity coefficient. */
3759 static
3760 fts_result_t*
fts_query_get_result(fts_query_t * query,fts_result_t * result)3761 fts_query_get_result(
3762 /*=================*/
3763 fts_query_t* query, /*!< in: query instance */
3764 fts_result_t* result) /*!< in: result */
3765 {
3766 DBUG_ENTER("fts_query_get_result");
3767
3768 if (rbt_size(query->doc_ids) > 0 || query->flags == FTS_OPT_RANKING) {
3769 /* Copy the doc ids to the result. */
3770 result = fts_query_prepare_result(query, result);
3771 } else {
3772 /* Create an empty result instance. */
3773 result = static_cast<fts_result_t*>(
3774 ut_zalloc_nokey(sizeof(*result)));
3775 }
3776
3777 DBUG_RETURN(result);
3778 }
3779
3780 /*****************************************************************//**
3781 FTS Query free resources and reset. */
3782 static
3783 void
fts_query_free(fts_query_t * query)3784 fts_query_free(
3785 /*===========*/
3786 fts_query_t* query) /*!< in: query instance to free*/
3787 {
3788
3789 if (query->read_nodes_graph) {
3790 fts_que_graph_free(query->read_nodes_graph);
3791 }
3792
3793 if (query->root) {
3794 fts_ast_free_node(query->root);
3795 }
3796
3797 if (query->deleted) {
3798 fts_doc_ids_free(query->deleted);
3799 }
3800
3801 if (query->intersection) {
3802 fts_query_free_doc_ids(query, query->intersection);
3803 }
3804
3805 if (query->doc_ids) {
3806 fts_query_free_doc_ids(query, query->doc_ids);
3807 }
3808
3809 if (query->word_freqs) {
3810 const ib_rbt_node_t* node;
3811
3812 /* We need to free any instances of fts_doc_freq_t that we
3813 may have allocated. */
3814 for (node = rbt_first(query->word_freqs);
3815 node;
3816 node = rbt_next(query->word_freqs, node)) {
3817
3818 fts_word_freq_t* word_freq;
3819
3820 word_freq = rbt_value(fts_word_freq_t, node);
3821
3822 /* We need to cast away the const. */
3823 rbt_free(word_freq->doc_freqs);
3824 }
3825
3826 rbt_free(query->word_freqs);
3827 }
3828
3829 if (query->wildcard_words != NULL) {
3830 rbt_free(query->wildcard_words);
3831 }
3832
3833 ut_a(!query->intersection);
3834
3835 if (query->word_map) {
3836 rbt_free(query->word_map);
3837 }
3838
3839 if (query->word_vector != NULL) {
3840 UT_DELETE(query->word_vector);
3841 }
3842
3843 if (query->heap) {
3844 mem_heap_free(query->heap);
3845 }
3846
3847 memset(query, 0, sizeof(*query));
3848 }
3849
3850 /*****************************************************************//**
3851 Parse the query using flex/bison or plugin parser.
3852 @return parse tree node. */
3853 static
3854 fts_ast_node_t*
fts_query_parse(fts_query_t * query,byte * query_str,ulint query_len)3855 fts_query_parse(
3856 /*============*/
3857 fts_query_t* query, /*!< in: query instance */
3858 byte* query_str, /*!< in: query string */
3859 ulint query_len) /*!< in: query string length */
3860 {
3861 int error;
3862 fts_ast_state_t state;
3863 bool mode = query->boolean_mode;
3864 DBUG_ENTER("fts_query_parse");
3865
3866 memset(&state, 0x0, sizeof(state));
3867
3868 state.charset = query->fts_index_table.charset;
3869
3870 DBUG_EXECUTE_IF("fts_instrument_query_disable_parser",
3871 query->parser = NULL;);
3872
3873 if (query->parser) {
3874 state.root = state.cur_node =
3875 fts_ast_create_node_list(&state, NULL);
3876 error = fts_parse_by_parser(mode, query_str, query_len,
3877 query->parser, &state);
3878 } else {
3879 /* Setup the scanner to use, this depends on the mode flag. */
3880 state.lexer = fts_lexer_create(mode, query_str, query_len);
3881 state.charset = query->fts_index_table.charset;
3882 error = fts_parse(&state);
3883 fts_lexer_free(state.lexer);
3884 state.lexer = NULL;
3885 }
3886
3887 /* Error during parsing ? */
3888 if (error) {
3889 /* Free the nodes that were allocated during parsing. */
3890 fts_ast_state_free(&state);
3891 } else {
3892 query->root = state.root;
3893
3894 if (UNIV_UNLIKELY(fts_enable_diag_print) && query->root) {
3895 fts_ast_node_print(query->root);
3896 }
3897 }
3898
3899 DBUG_RETURN(state.root);
3900 }
3901
3902 /*******************************************************************//**
3903 FTS Query optimization
3904 Set FTS_OPT_RANKING if it is a simple term query */
3905 static
3906 void
fts_query_can_optimize(fts_query_t * query,uint flags)3907 fts_query_can_optimize(
3908 /*===================*/
3909 fts_query_t* query, /*!< in/out: query instance */
3910 uint flags) /*!< In: FTS search mode */
3911 {
3912 fts_ast_node_t* node = query->root;
3913
3914 if (flags & FTS_EXPAND) {
3915 return;
3916 }
3917
3918 /* Check if it has only a term without oper */
3919 ut_ad(node->type == FTS_AST_LIST);
3920 node = node->list.head;
3921 if (node != NULL && node->type == FTS_AST_TERM && node->next == NULL) {
3922 query->flags = FTS_OPT_RANKING;
3923 }
3924 }
3925
/** FTS Query entry point.
Sets up a dedicated transaction and query state, fetches the deleted
doc ids (for filtering), lower-cases and parses the query string,
executes the AST, optionally expands the query, and packages the
ranked result.
@param[in,out]	trx		transaction
@param[in]	index		fts index to search
@param[in]	flags		FTS search mode
@param[in]	query_str	FTS query
@param[in]	query_len	FTS query string len in bytes
@param[in,out]	result		result doc ids
@return DB_SUCCESS if successful otherwise error code */
dberr_t
fts_query(
	trx_t*		trx,
	dict_index_t*	index,
	uint		flags,
	const byte*	query_str,
	ulint		query_len,
	fts_result_t**	result)
{
	fts_query_t	query;
	dberr_t		error = DB_SUCCESS;
	byte*		lc_query_str;
	ulint		lc_query_str_len;
	ulint		result_len;
	bool		boolean_mode;
	trx_t*		query_trx; /* FIXME: use provided trx */
	CHARSET_INFO*	charset;
	ulint		start_time_ms;
	bool		will_be_ignored = false;

	boolean_mode = flags & FTS_BOOL;

	*result = NULL;
	memset(&query, 0x0, sizeof(query));
	/* Use an internal transaction for reading the auxiliary
	tables; the caller's trx is only checked for interruption. */
	query_trx = trx_create();
	query_trx->op_info = "FTS query";

	start_time_ms = ut_time_ms();

	query.trx = query_trx;
	query.index = index;
	query.boolean_mode = boolean_mode;
	query.deleted = fts_doc_ids_create();
	query.cur_node = NULL;

	query.fts_common_table.type = FTS_COMMON_TABLE;
	query.fts_common_table.table_id = index->table->id;
	query.fts_common_table.table = index->table;

	charset = fts_index_get_charset(index);

	query.fts_index_table.type = FTS_INDEX_TABLE;
	query.fts_index_table.index_id = index->id;
	query.fts_index_table.table_id = index->table->id;
	query.fts_index_table.charset = charset;
	query.fts_index_table.table = index->table;

	/* Map from word to its position in query.word_vector. */
	query.word_map = rbt_create_arg_cmp(
		sizeof(fts_string_t), innobase_fts_text_cmp, (void*)charset);
	query.word_vector = UT_NEW_NOKEY(word_vector_t());
	query.error = DB_SUCCESS;

	/* Setup the RB tree that will be used to collect per term
	statistics. */
	query.word_freqs = rbt_create_arg_cmp(
		sizeof(fts_word_freq_t), innobase_fts_text_cmp,
		(void*) charset);

	if (flags & FTS_EXPAND) {
		/* Tracks words already seen, so query expansion does
		not search the same word twice. */
		query.wildcard_words = rbt_create_arg_cmp(
			sizeof(fts_string_t), innobase_fts_text_cmp, (void *)charset);
	}

	query.total_size += SIZEOF_RBT_CREATE;

	query.total_docs = dict_table_get_n_rows(index->table);

	query.fts_common_table.suffix = "DELETED";

	/* Read the deleted doc_ids, we need these for filtering. */
	error = fts_table_fetch_doc_ids(
		NULL, &query.fts_common_table, query.deleted);

	if (error != DB_SUCCESS) {
		goto func_exit;
	}

	query.fts_common_table.suffix = "DELETED_CACHE";

	error = fts_table_fetch_doc_ids(
		NULL, &query.fts_common_table, query.deleted);

	if (error != DB_SUCCESS) {
		goto func_exit;
	}

	/* Get the deleted doc ids that are in the cache. */
	fts_cache_append_deleted_doc_ids(
		index->table->fts->cache, query.deleted->doc_ids);
	DEBUG_SYNC_C("fts_deleted_doc_ids_append");

	/* Sort the vector so that we can do a binary search over the ids. */
	ib_vector_sort(query.deleted->doc_ids, fts_doc_id_cmp);

	/* Convert the query string to lower case before parsing. We own
	the ut_malloc'ed result and so remember to free it before return. */

	lc_query_str_len = query_len * charset->casedn_multiply + 1;
	lc_query_str = static_cast<byte*>(ut_malloc_nokey(lc_query_str_len));

	/* For binary collations, a case sensitive search is
	performed. Hence don't convert to lower case. */
	if (my_binary_compare(charset)) {
		memcpy(lc_query_str, query_str, query_len);
		lc_query_str[query_len]= 0;
		result_len= query_len;
	} else {
		result_len = innobase_fts_casedn_str(
			charset, (char*)( query_str), query_len,
			(char*)(lc_query_str), lc_query_str_len);
	}

	ut_ad(result_len < lc_query_str_len);

	lc_query_str[result_len] = 0;

	query.heap = mem_heap_create(128);

	/* Create the rb tree for the doc id (current) set. */
	query.doc_ids = rbt_create(
		sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
	query.parser = index->parser;

	query.total_size += SIZEOF_RBT_CREATE;

	/* Parse the input query string. */
	if (fts_query_parse(&query, lc_query_str, result_len)) {
		fts_ast_node_t*	ast = query.root;
		ast->trx = trx;

		/* Optimize query to check if it's a single term */
		fts_query_can_optimize(&query, flags);

		DBUG_EXECUTE_IF("fts_instrument_result_cache_limit",
			        fts_result_cache_limit = 2048;
		);

		/* Traverse the Abstract Syntax Tree (AST) and execute
		the query. */
		query.error = fts_ast_visit(
			FTS_NONE, ast, fts_query_visitor,
			&query, &will_be_ignored);
		if (query.error == DB_INTERRUPTED) {
			error = DB_INTERRUPTED;
			ut_free(lc_query_str);
			goto func_exit;
		}

		/* If query expansion is requested, extend the search
		with first search pass result */
		if (query.error == DB_SUCCESS && (flags & FTS_EXPAND)) {
			query.error = fts_expand_query(index, &query);
		}

		/* Calculate the inverse document frequency of the terms. */
		if (query.error == DB_SUCCESS
		    && query.flags != FTS_OPT_RANKING) {
			/* FTS_OPT_RANKING computes IDF later, after
			deleted docs are excluded. */
			fts_query_calculate_idf(&query);
		}

		/* Copy the result from the query state, so that we can
		return it to the caller. */
		if (query.error == DB_SUCCESS) {
			*result = fts_query_get_result(&query, *result);
		}

		error = query.error;
	} else {
		/* still return an empty result set */
		*result = static_cast<fts_result_t*>(
			ut_zalloc_nokey(sizeof(**result)));
	}

	if (trx_is_interrupted(trx)) {
		error = DB_INTERRUPTED;
		ut_free(lc_query_str);
		if (*result) {
			fts_query_free_result(*result);
		}
		goto func_exit;
	}

	ut_free(lc_query_str);

	if (UNIV_UNLIKELY(fts_enable_diag_print) && (*result)) {
		ulint	diff_time = ut_time_ms() - start_time_ms;

		ib::info() << "FTS Search Processing time: "
			<< diff_time / 1000 << " secs: " << diff_time % 1000
			<< " millisec: row(s) "
			<< ((*result)->rankings_by_id
			    ? lint(rbt_size((*result)->rankings_by_id))
			    : -1);

		/* Log memory consumption & result size */
		ib::info() << "Full Search Memory: " << query.total_size
			<< " (bytes),  Row: "
			<< ((*result)->rankings_by_id
			    ? rbt_size((*result)->rankings_by_id)
			    : 0)
			<< ".";
	}

func_exit:
	fts_query_free(&query);

	query_trx->free();

	return(error);
}
4144
4145 /*****************************************************************//**
4146 FTS Query free result, returned by fts_query(). */
4147 void
fts_query_free_result(fts_result_t * result)4148 fts_query_free_result(
4149 /*==================*/
4150 fts_result_t* result) /*!< in: result instance to free.*/
4151 {
4152 if (result) {
4153 if (result->rankings_by_id != NULL) {
4154 rbt_free(result->rankings_by_id);
4155 result->rankings_by_id = NULL;
4156 }
4157 if (result->rankings_by_rank != NULL) {
4158 rbt_free(result->rankings_by_rank);
4159 result->rankings_by_rank = NULL;
4160 }
4161
4162 ut_free(result);
4163 result = NULL;
4164 }
4165 }
4166
4167 /*****************************************************************//**
4168 FTS Query sort result, returned by fts_query() on fts_ranking_t::rank. */
4169 void
fts_query_sort_result_on_rank(fts_result_t * result)4170 fts_query_sort_result_on_rank(
4171 /*==========================*/
4172 fts_result_t* result) /*!< out: result instance to sort.*/
4173 {
4174 const ib_rbt_node_t* node;
4175 ib_rbt_t* ranked;
4176
4177 ut_a(result->rankings_by_id != NULL);
4178 if (result->rankings_by_rank) {
4179 rbt_free(result->rankings_by_rank);
4180 }
4181
4182 ranked = rbt_create(sizeof(fts_ranking_t), fts_query_compare_rank);
4183
4184 /* We need to free any instances of fts_doc_freq_t that we
4185 may have allocated. */
4186 for (node = rbt_first(result->rankings_by_id);
4187 node;
4188 node = rbt_next(result->rankings_by_id, node)) {
4189
4190 fts_ranking_t* ranking;
4191
4192 ranking = rbt_value(fts_ranking_t, node);
4193
4194 ut_a(ranking->words == NULL);
4195
4196 rbt_insert(ranked, ranking, ranking);
4197 }
4198
4199 /* Reset the current node too. */
4200 result->current = NULL;
4201 result->rankings_by_rank = ranked;
4202 }
4203
4204 /*******************************************************************//**
4205 A debug function to print result doc_id set. */
4206 static
4207 void
fts_print_doc_id(fts_query_t * query)4208 fts_print_doc_id(
4209 /*=============*/
4210 fts_query_t* query) /*!< in : tree that stores doc_ids.*/
4211 {
4212 const ib_rbt_node_t* node;
4213
4214 /* Iterate each member of the doc_id set */
4215 for (node = rbt_first(query->doc_ids);
4216 node;
4217 node = rbt_next(query->doc_ids, node)) {
4218 fts_ranking_t* ranking;
4219 ranking = rbt_value(fts_ranking_t, node);
4220
4221 ib::info() << "doc_ids info, doc_id: " << ranking->doc_id;
4222
4223 ulint pos = 0;
4224 fts_string_t word;
4225
4226 while (fts_ranking_words_get_next(query, ranking, &pos, &word)) {
4227 ib::info() << "doc_ids info, value: " << word.f_str;
4228 }
4229 }
4230 }
4231
/*************************************************************//**
This function implements a simple "blind" query expansion search:
words in documents found in the first search pass will be used as
search arguments to search the document again, thus "expand"
the search result set. Memory consumed for the expanded token set is
accounted in query->total_size and checked against
fts_result_cache_limit; exceeding it aborts the expansion.
@return DB_SUCCESS if success, otherwise the error code */
static MY_ATTRIBUTE((nonnull, warn_unused_result))
dberr_t
fts_expand_query(
/*=============*/
	dict_index_t*	index,		/*!< in: FTS index to search */
	fts_query_t*	query)		/*!< in: FTS query instance */
{
	const ib_rbt_node_t*	node;
	const ib_rbt_node_t*	token_node;
	fts_doc_t		result_doc;
	dberr_t			error = DB_SUCCESS;
	const fts_index_cache_t*index_cache;

	/* If no doc is found in the first search pass, there is
	nothing to expand: return success immediately. */
	if (!rbt_size(query->doc_ids)) {
		return(error);
	}

	/* Init "result_doc", to hold words from the first search pass */
	fts_doc_init(&result_doc);

	/* The index cache lookup must be done under the FTS cache
	X-lock; the cache pointer remains usable after unlock. */
	rw_lock_x_lock(&index->table->fts->cache->lock);
	index_cache = fts_find_index_cache(index->table->fts->cache, index);
	rw_lock_x_unlock(&index->table->fts->cache->lock);

	ut_a(index_cache);

	/* Token tree compares with the index charset's collation. */
	result_doc.tokens = rbt_create_arg_cmp(
		sizeof(fts_token_t), innobase_fts_text_cmp,
		(void*) index_cache->charset);

	result_doc.charset = index_cache->charset;
	result_doc.parser = index_cache->index->parser;

	/* Account for the tree just created. */
	query->total_size += SIZEOF_RBT_CREATE;

	if (UNIV_UNLIKELY(fts_enable_diag_print)) {
		fts_print_doc_id(query);
	}

	/* First pass over the matched doc ids: re-fetch and re-parse
	each document to collect its words into result_doc.tokens. */
	for (node = rbt_first(query->doc_ids);
	     node;
	     node = rbt_next(query->doc_ids, node)) {

		fts_ranking_t*	ranking;
		ulint		prev_token_size;
		ulint		estimate_size;

		prev_token_size = rbt_size(result_doc.tokens);

		ranking = rbt_value(fts_ranking_t, node);

		/* Fetch the documents with the doc_id from the
		result of the first search pass. Since we do not
		store document-to-word mapping, we need to
		fetch the original document and parse them.
		Future optimization could be done here if we
		support some forms of document-to-word mapping */
		fts_doc_fetch_by_doc_id(NULL, ranking->doc_id, index,
					FTS_FETCH_DOC_BY_ID_EQUAL,
					fts_query_expansion_fetch_doc,
					&result_doc);

		/* Estimate memory used, see fts_process_token and fts_token_t.
		   We ignore token size here. Only newly added tokens
		   (delta of the tree size) are charged. */
		estimate_size = (rbt_size(result_doc.tokens) - prev_token_size)
			* (SIZEOF_RBT_NODE_ADD + sizeof(fts_token_t)
			   + sizeof(ib_vector_t) + sizeof(ulint) * 32);
		query->total_size += estimate_size;

		/* Abort expansion if the result cache budget is blown. */
		if (query->total_size > fts_result_cache_limit) {
			error = DB_FTS_EXCEED_RESULT_CACHE_LIMIT;
			goto func_exit;
		}
	}

	/* Remove words that have already been searched in the first pass */
	for (ulint i = 0; i < query->word_vector->size(); i++) {
		fts_string_t	word = query->word_vector->at(i);
		ib_rbt_bound_t	parent;

		if (query->wildcard_words
		    && rbt_search(query->wildcard_words, &parent, &word) == 0) {
			/* If it's a wildcard word, remove words having
			it as prefix. */
			while (rbt_search_cmp(result_doc.tokens,
					      &parent, &word, NULL,
					      innobase_fts_text_cmp_prefix)
			       == 0) {
				ut_free(rbt_remove_node(result_doc.tokens,
							parent.last));
			}
		} else {
			/* We don't check return value, because the word may
			have been deleted by a previous wildcard word as its
			prefix, e.g. ('g * good'). */
			rbt_delete(result_doc.tokens, &word);
		}
	}

	/* Search the table the second time with expanded search list */
	for (token_node = rbt_first(result_doc.tokens);
	     token_node;
	     token_node = rbt_next(result_doc.tokens, token_node)) {
		fts_token_t*	mytoken;
		mytoken = rbt_value(fts_token_t, token_node);

		/* '%' in the end is treated as prefix search,
		it can cause assert failure, so we skip it. */
		if (mytoken->text.f_str[mytoken->text.f_len - 1] == '%') {
			continue;
		}

		/* Token strings are expected to be NUL-terminated. */
		ut_ad(mytoken->text.f_str[mytoken->text.f_len] == 0);
		fts_query_add_word_freq(query, &mytoken->text);
		/* OR the expanded word's doc set into the result. */
		error = fts_query_union(query, &mytoken->text);

		if (error != DB_SUCCESS) {
			break;
		}
	}

func_exit:
	/* Release the temporary document (and its token tree). */
	fts_doc_free(&result_doc);

	return(error);
}
/*************************************************************//**
This function finds documents that contain all words in a
phrase or proximity search. And if proximity search, verify
the words are close enough to each other, as in specified distance.
This function is called for phrase and proximity search.

The per-token match lists in query->match_array are walked in a
merge fashion: for each doc id in the first token's list we scan the
remaining tokens' lists for the same doc id.
@return TRUE if documents are found, FALSE if otherwise */
static
ibool
fts_phrase_or_proximity_search(
/*===========================*/
	fts_query_t*	query,	/*!< in/out: query instance.
				query->doc_ids might be instantiated
				with qualified doc IDs */
	ib_vector_t*	tokens)	/*!< in: Tokens contain words */
{
	ulint		n_matched;
	ulint		i;
	ibool		matched = FALSE;
	ulint		num_token = ib_vector_size(tokens);
	fts_match_t*	match[MAX_PROXIMITY_ITEM];
	ibool		end_list = FALSE;

	/* Number of matched documents for the first token */
	n_matched = ib_vector_size(query->match_array[0]);

	/* We have a set of match list for each word, we shall
	walk through the list and find common documents that
	contain all the matching words. */
	for (i = 0; i < n_matched; i++) {
		ulint		j;
		ulint		k = 0;
		fts_proximity_t	qualified_pos;

		match[0] = static_cast<fts_match_t*>(
			ib_vector_get(query->match_array[0], i));

		/* For remaining match list for the token(word), we
		try to see if there is a document with the same
		doc id */
		for (j = 1; j < num_token; j++) {
			match[j] = static_cast<fts_match_t*>(
				ib_vector_get(query->match_array[j], k));

			/* Advance in token j's (doc-id ordered) list
			until we reach or pass match[0]'s doc id. */
			while (match[j]->doc_id < match[0]->doc_id
			       && k < ib_vector_size(query->match_array[j])) {
				match[j] = static_cast<fts_match_t*>(
					ib_vector_get(
						query->match_array[j], k));
				k++;
			}

			if (match[j]->doc_id > match[0]->doc_id) {
				/* no match */
				if (query->flags & FTS_PHRASE) {
					match[0]->doc_id = 0;
				}
				break;
			}

			/* Token j's list is exhausted: no later doc id
			of token 0 can match any more. */
			if (k == ib_vector_size(query->match_array[j])) {
				end_list = TRUE;

				if (query->flags & FTS_PHRASE) {
					ulint	s;
					/* Since i is the last doc id in the
					match_array[j], remove all doc ids > i
					from the match_array[0]. */
					fts_match_t*	match_temp;
					for (s = i + 1; s < n_matched; s++) {
						match_temp = static_cast<
						fts_match_t*>(ib_vector_get(
						query->match_array[0], s));
						match_temp->doc_id = 0;
					}

					if (match[j]->doc_id !=
						match[0]->doc_id) {
						/* no match */
						match[0]->doc_id = 0;
					}
				}

				if (match[j]->doc_id != match[0]->doc_id) {
					goto func_exit;
				}
			}

			/* FIXME: A better solution will be a counter array
			remember each run's last position. So we don't
			reset it here every time */
			k = 0;
		}

		/* Inner loop broke early: some token missed this doc,
		try the next doc id of the first token. */
		if (j != num_token) {
			continue;
		}

		/* For this matching doc, we need to further
		verify whether the words in the doc are close
		to each other, and within the distance specified
		in the proximity search */
		if (query->flags & FTS_PHRASE) {
			matched = TRUE;
		} else if (fts_proximity_get_positions(
			match, num_token, ULINT_MAX, &qualified_pos)) {

			/* Fetch the original documents and count the
			words in between matching words to see that is in
			specified distance */
			if (fts_query_is_in_proximity_range(
				query, match, &qualified_pos)) {
				/* If so, mark we find a matching doc */
				query->error = fts_query_process_doc_id(
					query, match[0]->doc_id, 0);
				if (query->error != DB_SUCCESS) {
					matched = FALSE;
					goto func_exit;
				}

				matched = TRUE;
				/* Record every query token against the
				matching document for ranking. */
				for (ulint z = 0; z < num_token; z++) {
					fts_string_t*	token;
					token = static_cast<fts_string_t*>(
						ib_vector_get(tokens, z));
					fts_query_add_word_to_document(
						query, match[0]->doc_id, token);
				}
			}
		}

		if (end_list) {
			break;
		}
	}

func_exit:
	return(matched);
}
4503
/*************************************************************//**
This function checks whether words in result documents are close to
each other (within proximity range as specified by "distance").
If "distance" is MAX_ULINT, then it will find all combinations of
positions of matching words and store min and max positions
in the "qualified_pos" for later verification.
@return true if words are close to each other, false if otherwise */
static
bool
fts_proximity_get_positions(
/*========================*/
	fts_match_t**		match,		/*!< in: query instance */
	ulint			num_match,	/*!< in: number of matching
						items */
	ulint			distance,	/*!< in: distance value
						for proximity search */
	fts_proximity_t*	qualified_pos)	/*!< out: the position info
						records ranges containing
						all matching words. */
{
	ulint	i;
	ulint	idx[MAX_PROXIMITY_ITEM];
	ulint	num_pos[MAX_PROXIMITY_ITEM];
	ulint	min_idx;

	/* Start with an empty set of qualified ranges. */
	qualified_pos->n_pos = 0;

	ut_a(num_match <= MAX_PROXIMITY_ITEM);

	/* Each word could appear multiple times in a doc. So
	we need to walk through each word's position list, and find
	closest distance between different words to see if
	they are in the proximity distance. */

	/* Assume each word's position list is sorted, we
	will just do a walk through to all words' lists
	similar to the merge phase of a merge sort */
	for (i = 0; i < num_match; i++) {
		/* idx is the current position we are checking
		for a particular word */
		idx[i] = 0;

		/* Number of positions for this word */
		num_pos[i] = ib_vector_size(match[i]->positions);
	}

	/* Start with the first word */
	min_idx = 0;

	/* Keep going while the list that last supplied the smallest
	position still has entries to consume. */
	while (idx[min_idx] < num_pos[min_idx]) {
		ulint	position[MAX_PROXIMITY_ITEM];
		ulint	min_pos = ULINT_MAX;
		ulint	max_pos = 0;

		/* Check positions in each word position list, and
		record the max/min position */
		for (i = 0; i < num_match; i++) {
			position[i] = *(ulint*) ib_vector_get_const(
				match[i]->positions, idx[i]);

			/* ULINT_UNDEFINED marks the end of a list. */
			if (position[i] == ULINT_UNDEFINED) {
				break;
			}

			if (position[i] < min_pos) {
				min_pos = position[i];
				min_idx = i;
			}

			if (position[i] > max_pos) {
				max_pos = position[i];
			}
		}

		/* If max and min position are within range, we
		find a good match. The second condition ensures the
		loop above completed (i == num_match) or did not stop
		on an end-of-list marker. */
		if (max_pos - min_pos <= distance
		    && (i >= num_match || position[i] != ULINT_UNDEFINED)) {
			/* The charset has variable character
			length encoding, record the min_pos and
			max_pos, we will need to verify the actual
			number of characters */
			qualified_pos->min_pos.push_back(min_pos);
			qualified_pos->max_pos.push_back(max_pos);
			qualified_pos->n_pos++;
		}

		/* Otherwise, move to the next position in the
		list for the word with the smallest position */
		idx[min_idx]++;
	}

	return(qualified_pos->n_pos != 0);
}
4598