1 /*****************************************************************************
2
3 Copyright (c) 2007, 2020, Oracle and/or its affiliates. All Rights Reserved.
4 Copyright (c) 2017, 2021, MariaDB Corporation.
5
6 This program is free software; you can redistribute it and/or modify it under
7 the terms of the GNU General Public License as published by the Free Software
8 Foundation; version 2 of the License.
9
10 This program is distributed in the hope that it will be useful, but WITHOUT
11 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License along with
15 this program; if not, write to the Free Software Foundation, Inc.,
16 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
17
18 *****************************************************************************/
19
20 /**************************************************//**
21 @file fts/fts0que.cc
22 Full Text Search functionality.
23
24 Created 2007/03/27 Sunny Bains
25 Completed 2011/7/10 Sunny and Jimmy Yang
26 *******************************************************/
27
28 #include "dict0dict.h"
29 #include "ut0rbt.h"
30 #include "row0sel.h"
31 #include "fts0fts.h"
32 #include "fts0priv.h"
33 #include "fts0ast.h"
34 #include "fts0pars.h"
35 #include "fts0types.h"
36 #include "fts0plugin.h"
37 #include "fts0vlc.h"
38
39 #include <iomanip>
40 #include <vector>
41
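/* Access element (i, j) of a row-major 2-D table t with n columns,
stored as a flat array (used by the LCS code below). */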
42 #define FTS_ELEM(t, n, i, j) (t[(i) * n + (j)])
43
44 #define RANK_DOWNGRADE (-1.0F)
45 #define RANK_UPGRADE (1.0F)
46
47 /* Maximum number of words supported in a phrase or proximity search. */
48 #define MAX_PROXIMITY_ITEM 128
49
50 /* Memory used by rbt itself for create and node add */
51 #define SIZEOF_RBT_CREATE sizeof(ib_rbt_t) + sizeof(ib_rbt_node_t) * 2
52 #define SIZEOF_RBT_NODE_ADD sizeof(ib_rbt_node_t)
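
/* These estimates are accumulated in fts_query_t::total_size, which
fts_query_process_doc_id() compares against fts_result_cache_limit. */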
53
54 /* Initial byte length for 'words' in fts_ranking_t */
55 #define RANKING_WORDS_INIT_LEN 4
56
57 // FIXME: Need to have a generic iterator that traverses the ilist.
58
59 typedef std::vector<fts_string_t, ut_allocator<fts_string_t> > word_vector_t;
60
61 struct fts_word_freq_t;
62
63 /** State of an FTS query. */
64 struct fts_query_t {
65 mem_heap_t* heap; /*!< Heap to use for allocations */
66
67 trx_t* trx; /*!< The query transaction */
68
69 dict_index_t* index; /*!< The FTS index to search */
70 /*!< FTS auxiliary common table def */
71
72 fts_table_t fts_common_table;
73
74 fts_table_t fts_index_table;/*!< FTS auxiliary index table def */
75
76 size_t total_size; /*!< total memory size used by query */
77
78 fts_doc_ids_t* deleted; /*!< Deleted doc ids that need to be
79 filtered from the output */
80
81 fts_ast_node_t* root; /*!< Abstract syntax tree */
82
83 fts_ast_node_t* cur_node; /*!< Current tree node */
84
85 ib_rbt_t* word_map; /*!< Matched word map for
86 searching by word*/
87
88 word_vector_t* word_vector; /*!< Matched word vector for
89 searching by index */
90
91 ib_rbt_t* doc_ids; /*!< The current set of matching
92 doc ids, elements are of
93 type fts_ranking_t */
94
95 ib_rbt_t* intersection; /*!< The doc ids that were found in
96 doc_ids, this tree will become
97 the new doc_ids, elements are of type
98 fts_ranking_t */
99
100 /*!< Prepared statement to read the
101 nodes from the FTS INDEX */
102 que_t* read_nodes_graph;
103
104 fts_ast_oper_t oper; /*!< Current boolean mode operator */
105
106 /*!< TRUE if we want to collect the
107 word positions within the document */
108 ibool collect_positions;
109
110 ulint flags; /*!< Specify the full text search type,
111 such as boolean search, phrase
112 search, proximity search etc. */
113
114 ulint distance; /*!< The proximity distance of a
115 phrase search. */
116
117 /*!< These doc ids are used as a
118 boundary condition when searching the
119 FTS index rows */
120
121 doc_id_t lower_doc_id; /*!< Lowest doc id in doc_ids */
122
123 doc_id_t upper_doc_id; /*!< Highest doc id in doc_ids */
124
125 bool boolean_mode; /*!< TRUE if boolean mode query */
126
127 ib_vector_t* matched; /*!< Array of matching documents
128 (fts_match_t) to search for a phrase */
129
130 ib_vector_t** match_array; /*!< Used for proximity search, contains
131 position info for each matched word
132 in the word list */
133
134 ib_uint64_t total_docs; /*!< The total number of documents */
135
136 ulint total_words; /*!< The total number of words */
137
138 dberr_t error; /*!< Error code if any, that is
139 encountered during query processing */
140
141 ib_rbt_t* word_freqs; /*!< RB tree of word frequencies per
142 document, its elements are of type
143 fts_word_freq_t */
144
145 ib_rbt_t* wildcard_words; /*!< words with wildcard */
146
147 bool multi_exist; /*!< multiple FTS_EXIST oper */
148 byte visiting_sub_exp; /*!< count of nested
149 fts_ast_visit_sub_exp() */
150
151 st_mysql_ftparser* parser; /*!< fts plugin parser */
152 };
153
154 /** For phrase matching, first we collect the documents and the positions
155 then we match. */
156 struct fts_match_t {
157 doc_id_t doc_id; /*!< Document id */
158
159 ulint start; /*!< Start the phrase match from
160 this offset within the positions
161 vector. */
162
163 ib_vector_t* positions; /*!< Offsets of a word in a
164 document */
165 };
166
167 /** For matching tokens in a phrase search. We use this data structure in
168 the callback that determines whether a document should be accepted or
169 rejected for a phrase search. */
170 struct fts_select_t {
171 doc_id_t doc_id; /*!< The document id to match */
172
173 ulint min_pos; /*!< For found to be TRUE at least
174 one position must be greater than
175 min_pos. */
176
177 ibool found; /*!< TRUE if found */
178
179 fts_word_freq_t*
180 word_freq; /*!< Word frequency instance of the
181 current word being looked up in
182 the FTS index */
183 };
184
185 typedef std::vector<ulint, ut_allocator<ulint> > pos_vector_t;
186
187 /** This structure defines a set of ranges for the original documents, each of
188 which has a minimum position and a maximum position. Text in such a range
189 should contain all the words of the proximity search. We then need to count
190 the words in such a range to make sure the count is within the specified
191 distance of the proximity search */
192 struct fts_proximity_t {
193 ulint n_pos; /*!< number of position set, defines
194 a range (min to max) containing all
195 matching words */
196 pos_vector_t min_pos; /*!< the minimum position (in bytes)
197 of the range */
198 pos_vector_t max_pos; /*!< the maximum position (in bytes)
199 of the range */
200 };
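
/* For example, for the proximity query '"apple pear" @3', each pair
(min_pos[i], max_pos[i]) brackets one candidate region of the document text
that contains all the searched words; fts_proximity_is_word_in_range() then
counts the words inside that region against the distance. */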
201
202 /** The match positions and tokens to match */
203 struct fts_phrase_t {
204 fts_phrase_t(const dict_table_t* table)
205 :
206 found(false),
207 match(NULL),
208 tokens(NULL),
209 distance(0),
210 charset(NULL),
211 heap(NULL),
212 zip_size(table->space->zip_size()),
213 proximity_pos(NULL),
214 parser(NULL)
215 {
216 }
217
218 /** Match result */
219 ibool found;
220
221 /** Positions within text */
222 const fts_match_t* match;
223
224 /** Tokens to match */
225 const ib_vector_t* tokens;
226
227 /** For matching on proximity distance. Can be 0 for exact match */
228 ulint distance;
229
230 /** Phrase match charset */
231 CHARSET_INFO* charset;
232
233 /** Heap for word processing */
234 mem_heap_t* heap;
235
236 /** ROW_FORMAT=COMPRESSED page size, or 0 */
237 const ulint zip_size;
238
239 /** Position info for proximity search verification. Records the
240 min and max position of words matched */
241 fts_proximity_t* proximity_pos;
242
243 /** FTS plugin parser */
244 st_mysql_ftparser* parser;
245 };
246
247 /** Parameter passed to fts phrase match by parser */
248 struct fts_phrase_param_t {
249 fts_phrase_t* phrase; /*!< Match phrase instance */
250 ulint token_index; /*!< Index of token to match next */
251 mem_heap_t* heap; /*!< Heap for word processing */
252 };
253
254 /** For storing the frequency of a word/term in a document */
255 struct fts_doc_freq_t {
256 doc_id_t doc_id; /*!< Document id */
257 ulint freq; /*!< Frequency of a word in a document */
258 };
259
260 /** To determine the word frequency per document. */
261 struct fts_word_freq_t {
262 fts_string_t word; /*!< Word for which we need the freq,
263 it's allocated on the query heap */
264
265 ib_rbt_t* doc_freqs; /*!< RB Tree for storing per document
266 word frequencies. The elements are
267 of type fts_doc_freq_t */
268 ib_uint64_t doc_count; /*!< Total number of documents that
269 contain this word */
270 double idf; /*!< Inverse document frequency */
271 };
272
273 /********************************************************************
274 Callback function to fetch the rows in an FTS INDEX record.
275 @return always TRUE */
276 static
277 ibool
278 fts_query_index_fetch_nodes(
279 /*========================*/
280 void* row, /*!< in: sel_node_t* */
281 void* user_arg); /*!< in: pointer to ib_vector_t */
282
283 /********************************************************************
284 Read and filter nodes.
285 @return fts_node_t instance */
286 static
287 dberr_t
288 fts_query_filter_doc_ids(
289 /*=====================*/
290 fts_query_t* query, /*!< in: query instance */
291 const fts_string_t* word, /*!< in: the current word */
292 fts_word_freq_t* word_freq, /*!< in/out: word frequency */
293 const fts_node_t* node, /*!< in: current FTS node */
294 void* data, /*!< in: doc id ilist */
295 ulint len, /*!< in: doc id ilist size */
296 ibool calc_doc_count);/*!< in: whether to remember doc
297 count */
298
299 /** Process (nested) sub-expression, create a new result set to store the
300 sub-expression result by processing nodes under current sub-expression
301 list. Merge the sub-expression result with that of parent expression list.
302 @param[in,out] node current root node
303 @param[in,out] visitor callback function
304 @param[in,out] arg argument for callback
305 @return DB_SUCCESS if all go well */
306 static
307 dberr_t
308 fts_ast_visit_sub_exp(
309 fts_ast_node_t* node,
310 fts_ast_callback visitor,
311 void* arg);
312
313 #if 0
314 /*****************************************************************//***
315 Find a doc_id in a word's ilist.
316 @return TRUE if found. */
317 static
318 ibool
319 fts_query_find_doc_id(
320 /*==================*/
321 fts_select_t* select, /*!< in/out: search the doc id selected,
322 update the frequency if found. */
323 void* data, /*!< in: doc id ilist */
324 ulint len); /*!< in: doc id ilist size */
325 #endif
326
327 /*************************************************************//**
328 This function implements a simple "blind" query expansion search:
329 words in documents found in the first search pass will be used as
330 search arguments to search the document again, thus "expanding"
331 the search result set.
332 @return DB_SUCCESS if success, otherwise the error code */
333 static
334 dberr_t
335 fts_expand_query(
336 /*=============*/
337 dict_index_t* index, /*!< in: FTS index to search */
338 fts_query_t* query) /*!< in: query result, to be freed
339 by the client */
340 MY_ATTRIBUTE((nonnull, warn_unused_result));
341 /*************************************************************//**
342 This function finds documents that contain all words in a
343 phrase or proximity search. For a proximity search, it also verifies that
344 the words are close enough to each other, within the specified distance.
345 This function is called for phrase and proximity search.
346 @return TRUE if documents are found, FALSE if otherwise */
347 static
348 ibool
349 fts_phrase_or_proximity_search(
350 /*===========================*/
351 fts_query_t* query, /*!< in/out: query instance
352 query->doc_ids might be instantiated
353 with qualified doc IDs */
354 ib_vector_t* tokens); /*!< in: Tokens contain words */
355 /*************************************************************//**
356 This function checks whether words in result documents are close to
357 each other (within proximity range as specified by "distance").
358 If "distance" is MAX_ULINT, then it will find all combinations of
359 positions of matching words and store min and max positions
360 in the "qualified_pos" for later verification.
361 @return true if words are close to each other, false if otherwise */
362 static
363 bool
364 fts_proximity_get_positions(
365 /*========================*/
366 fts_match_t** match, /*!< in: query instance */
367 ulint num_match, /*!< in: number of matching
368 items */
369 ulint distance, /*!< in: distance value
370 for proximity search */
371 fts_proximity_t* qualified_pos); /*!< out: the position info
372 records ranges containing
373 all matching words. */
374 #if 0
375 /********************************************************************
376 Get the total number of words in a documents. */
377 static
378 ulint
379 fts_query_terms_in_document(
380 /*========================*/
381 /*!< out: DB_SUCCESS if all go well
382 else error code */
383 fts_query_t* query, /*!< in: FTS query state */
384 doc_id_t doc_id, /*!< in: the word to check */
385 ulint* total); /*!< out: total words in document */
386 #endif
387
388 /********************************************************************
389 Compare two fts_doc_freq_t doc_ids.
390 @return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
391 UNIV_INLINE
392 int
393 fts_freq_doc_id_cmp(
394 /*================*/
395 const void* p1, /*!< in: id1 */
396 const void* p2) /*!< in: id2 */
397 {
398 const fts_doc_freq_t* fq1 = (const fts_doc_freq_t*) p1;
399 const fts_doc_freq_t* fq2 = (const fts_doc_freq_t*) p2;
400
401 return((int) (fq1->doc_id - fq2->doc_id));
402 }
403
404 #if 0
405 /*******************************************************************//**
406 Print the table used for calculating LCS. */
407 static
408 void
409 fts_print_lcs_table(
410 /*================*/
411 const ulint* table, /*!< in: array to print */
412 ulint n_rows, /*!< in: total no. of rows */
413 ulint n_cols) /*!< in: total no. of cols */
414 {
415 ulint i;
416
417 for (i = 0; i < n_rows; ++i) {
418 ulint j;
419
420 printf("\n");
421
422 for (j = 0; j < n_cols; ++j) {
423
424 printf("%2lu ", FTS_ELEM(table, n_cols, i, j));
425 }
426 }
427 }
428
429 /********************************************************************
430 Find the longest common subsequence between the query string and
431 the document. */
432 static
433 ulint
434 fts_query_lcs(
435 /*==========*/
436 /*!< out: LCS (length) between
437 two ilists */
438 const ulint* p1, /*!< in: word positions of query */
439 ulint len_p1, /*!< in: no. of elements in p1 */
440 const ulint* p2, /*!< in: word positions within document */
441 ulint len_p2) /*!< in: no. of elements in p2 */
442 {
443 int i;
444 ulint len = 0;
445 ulint r = len_p1;
446 ulint c = len_p2;
447 ulint size = (r + 1) * (c + 1) * sizeof(ulint);
448 ulint* table = (ulint*) ut_malloc_nokey(size);
449
450 /* Traverse the table backwards, from the last row to the first and
451 also from the last column to the first. We compute the smaller
452 common subsequences first, then use the calculated values to determine
453 the longest common subsequence. The result will be in TABLE[0][0]. */
454 for (i = r; i >= 0; --i) {
455 int j;
456
457 for (j = c; j >= 0; --j) {
458
459 if (p1[i] == (ulint) -1 || p2[j] == (ulint) -1) {
460
461 FTS_ELEM(table, c, i, j) = 0;
462
463 } else if (p1[i] == p2[j]) {
464
465 FTS_ELEM(table, c, i, j) = FTS_ELEM(
466 table, c, i + 1, j + 1) + 1;
467
468 } else {
469
470 ulint value;
471
472 value = ut_max(
473 FTS_ELEM(table, c, i + 1, j),
474 FTS_ELEM(table, c, i, j + 1));
475
476 FTS_ELEM(table, c, i, j) = value;
477 }
478 }
479 }
480
481 len = FTS_ELEM(table, c, 0, 0);
482
483 fts_print_lcs_table(table, r, c);
484 printf("\nLen=" ULINTPF "\n", len);
485
486 ut_free(table);
487
488 return(len);
489 }
490 #endif
491
492 /*******************************************************************//**
493 Compare two fts_ranking_t instances on their rank value and doc ids in
494 descending order on the rank and ascending order on doc id.
495 @return 0 if p1 == p2, < 0 if p1 < p2, > 0 if p1 > p2 */
496 static
497 int
498 fts_query_compare_rank(
499 /*===================*/
500 const void* p1, /*!< in: pointer to elem */
501 const void* p2) /*!< in: pointer to elem */
502 {
503 const fts_ranking_t* r1 = (const fts_ranking_t*) p1;
504 const fts_ranking_t* r2 = (const fts_ranking_t*) p2;
505
506 if (r2->rank < r1->rank) {
507 return(-1);
508 } else if (r2->rank == r1->rank) {
509
510 if (r1->doc_id < r2->doc_id) {
511 return(-1);
512 } else if (r1->doc_id > r2->doc_id) {
513 return(1);
514 }
515
516 return(0);
517 }
518
519 return(1);
520 }
521
522 /*******************************************************************//**
523 Create words in ranking */
524 static
525 void
526 fts_ranking_words_create(
527 /*=====================*/
528 fts_query_t* query, /*!< in: query instance */
529 fts_ranking_t* ranking) /*!< in: ranking instance */
530 {
531 ranking->words = static_cast<byte*>(
532 mem_heap_zalloc(query->heap, RANKING_WORDS_INIT_LEN));
533 ranking->words_len = RANKING_WORDS_INIT_LEN;
534 }
535
536 /*
537 The optimization here is to use a char array (bitmap) to replace the words
538 rb tree in fts_ranking_t.
539
540 It can save a lot of memory, except in some cases of QUERY EXPANSION.
541
542 'word_map' is used as a word dictionary, in which the key is a word and the
543 value is a number. In 'fts_ranking_words_add', we first check whether the word is in 'word_map'.
544 If not, we add it to 'word_map' and give it a position (actually a number),
545 then we set the corresponding bit to '1' at that position in the char array 'words'.
546
547 'word_vector' is a useful backup of 'word_map': we can get a word by its position
548 more quickly than searching by value in 'word_map'. We use 'word_vector'
549 in 'fts_query_calculate_ranking' and 'fts_expand_query'. In those two functions, we need
550 to scan the bitmap 'words', get a word when a bit is '1', and then look up the word_freq
551 by that word.
552 */
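
/*
Illustrative example of the scheme above: if 'word_map' assigns
"apple" -> 0, "banana" -> 1 and "cherry" -> 2, then a document that matched
"apple" and "cherry" has bits 0 and 2 set, i.e. the first byte of its
'words' bitmap is 0x05. fts_ranking_words_add() sets the bits and
fts_ranking_words_get_next() scans them back out.
*/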
553
554 /*******************************************************************//**
555 Add a word into ranking */
556 static
557 void
558 fts_ranking_words_add(
559 /*==================*/
560 fts_query_t* query, /*!< in: query instance */
561 fts_ranking_t* ranking, /*!< in: ranking instance */
562 const fts_string_t* word) /*!< in: term/word to add */
563 {
564 ulint pos;
565 ulint byte_offset;
566 ulint bit_offset;
567 ib_rbt_bound_t parent;
568
569 /* Note: we suppose the word map and vector are append-only. */
570 ut_ad(query->word_vector->size() == rbt_size(query->word_map));
571
572 /* We use ib_rbt to simulate a map, f_n_char means position. */
573 if (rbt_search(query->word_map, &parent, word) == 0) {
574 fts_string_t* result_word;
575
576 result_word = rbt_value(fts_string_t, parent.last);
577 pos = result_word->f_n_char;
578 ut_ad(pos < rbt_size(query->word_map));
579 } else {
580 /* Add the word to map. */
581 fts_string_t new_word;
582
583 pos = rbt_size(query->word_map);
584
585 fts_string_dup(&new_word, word, query->heap);
586 new_word.f_n_char = pos;
587
588 rbt_add_node(query->word_map, &parent, &new_word);
589 ut_ad(rbt_validate(query->word_map));
590 query->word_vector->push_back(new_word);
591 }
592
593 /* Check words len */
594 byte_offset = pos / CHAR_BIT;
595 if (byte_offset >= ranking->words_len) {
596 byte* words = ranking->words;
597 ulint words_len = ranking->words_len;
598
599 while (byte_offset >= words_len) {
600 words_len *= 2;
601 }
602
603 ranking->words = static_cast<byte*>(
604 mem_heap_zalloc(query->heap, words_len));
605 ut_memcpy(ranking->words, words, ranking->words_len);
606 ranking->words_len = words_len;
607 }
608
609 /* Set ranking words */
610 ut_ad(byte_offset < ranking->words_len);
611 bit_offset = pos % CHAR_BIT;
612 ranking->words[byte_offset] |= 1 << bit_offset;
613 }
614
615 /*******************************************************************//**
616 Get a word from a ranking
617 @return true if it's successful */
618 static
619 bool
620 fts_ranking_words_get_next(
621 /*=======================*/
622 const fts_query_t* query, /*!< in: query instance */
623 fts_ranking_t* ranking,/*!< in: ranking instance */
624 ulint* pos, /*!< in/out: word start pos */
625 fts_string_t* word) /*!< in/out: term/word to add */
626 {
627 bool ret = false;
628 ulint max_pos = ranking->words_len * CHAR_BIT;
629
630 /* Search for next word */
631 while (*pos < max_pos) {
632 ulint byte_offset = *pos / CHAR_BIT;
633 ulint bit_offset = *pos % CHAR_BIT;
634
635 if (ranking->words[byte_offset] & (1 << bit_offset)) {
636 ret = true;
637 break;
638 }
639
640 *pos += 1;
641 };
642
643 /* Get next word from word vector */
644 if (ret) {
645 ut_ad(*pos < query->word_vector->size());
646 *word = query->word_vector->at((size_t)*pos);
647 *pos += 1;
648 }
649
650 return ret;
651 }
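
/*
Typical iteration over a ranking's words using the helpers above, as done
in fts_merge_doc_ids() later in this file:

	ulint		pos = 0;
	fts_string_t	word;

	while (fts_ranking_words_get_next(query, ranking, &pos, &word)) {
		process 'word', e.g. via fts_query_add_word_to_document()
	}
*/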
652
653 /*******************************************************************//**
654 Add a word, if it doesn't already exist, to the term freq RB tree. The word
655 is copied onto the query heap.
656 @return pointer to the word freq instance */
657 static
658 fts_word_freq_t*
659 fts_query_add_word_freq(
660 /*====================*/
661 fts_query_t* query, /*!< in: query instance */
662 const fts_string_t* word) /*!< in: term/word to add */
663 {
664 ib_rbt_bound_t parent;
665
666 /* Lookup the word in our rb tree and add if it doesn't exist. */
667 if (rbt_search(query->word_freqs, &parent, word) != 0) {
668 fts_word_freq_t word_freq;
669
670 memset(&word_freq, 0, sizeof(word_freq));
671
672 fts_string_dup(&word_freq.word, word, query->heap);
673
674 word_freq.doc_count = 0;
675
676 word_freq.doc_freqs = rbt_create(
677 sizeof(fts_doc_freq_t), fts_freq_doc_id_cmp);
678
679 parent.last = rbt_add_node(
680 query->word_freqs, &parent, &word_freq);
681
682 query->total_size += word->f_len
683 + SIZEOF_RBT_CREATE
684 + SIZEOF_RBT_NODE_ADD
685 + sizeof(fts_word_freq_t);
686 }
687
688 return(rbt_value(fts_word_freq_t, parent.last));
689 }
690
691 /*******************************************************************//**
692 Add a doc id, if it doesn't already exist, to the doc freq RB tree.
693 @return pointer to the doc freq instance */
694 static
695 fts_doc_freq_t*
696 fts_query_add_doc_freq(
697 /*===================*/
698 fts_query_t* query, /*!< in: query instance */
699 ib_rbt_t* doc_freqs, /*!< in: rb tree of fts_doc_freq_t */
700 doc_id_t doc_id) /*!< in: doc id to add */
701 {
702 ib_rbt_bound_t parent;
703
704 /* Lookup the doc id in our rb tree and add if it doesn't exist. */
705 if (rbt_search(doc_freqs, &parent, &doc_id) != 0) {
706 fts_doc_freq_t doc_freq;
707
708 memset(&doc_freq, 0, sizeof(doc_freq));
709
710 doc_freq.freq = 0;
711 doc_freq.doc_id = doc_id;
712
713 parent.last = rbt_add_node(doc_freqs, &parent, &doc_freq);
714
715 query->total_size += SIZEOF_RBT_NODE_ADD
716 + sizeof(fts_doc_freq_t);
717 }
718
719 return(rbt_value(fts_doc_freq_t, parent.last));
720 }
721
722 /*******************************************************************//**
723 Add the doc id to the query set only if it's not in the
724 deleted array. */
725 static
726 void
727 fts_query_union_doc_id(
728 /*===================*/
729 fts_query_t* query, /*!< in: query instance */
730 doc_id_t doc_id, /*!< in: the doc id to add */
731 fts_rank_t rank) /*!< in: if non-zero, it is the
732 rank associated with the doc_id */
733 {
734 ib_rbt_bound_t parent;
735 ulint size = ib_vector_size(query->deleted->doc_ids);
736 doc_id_t* updates = (doc_id_t*) query->deleted->doc_ids->data;
737
738 /* Check that the doc id is not deleted and is not already in our set. */
739 if (fts_bsearch(updates, 0, static_cast<int>(size), doc_id) < 0
740 && rbt_search(query->doc_ids, &parent, &doc_id) != 0) {
741
742 fts_ranking_t ranking;
743
744 ranking.rank = rank;
745 ranking.doc_id = doc_id;
746 fts_ranking_words_create(query, &ranking);
747
748 rbt_add_node(query->doc_ids, &parent, &ranking);
749
750 query->total_size += SIZEOF_RBT_NODE_ADD
751 + sizeof(fts_ranking_t) + RANKING_WORDS_INIT_LEN;
752 }
753 }
754
755 /*******************************************************************//**
756 Remove the doc id from the query set only if it's not in the
757 deleted set. */
758 static
759 void
760 fts_query_remove_doc_id(
761 /*====================*/
762 fts_query_t* query, /*!< in: query instance */
763 doc_id_t doc_id) /*!< in: the doc id to remove */
764 {
765 ib_rbt_bound_t parent;
766 ulint size = ib_vector_size(query->deleted->doc_ids);
767 doc_id_t* updates = (doc_id_t*) query->deleted->doc_ids->data;
768
769 /* Check that the doc id is not deleted and is in our set. */
770 if (fts_bsearch(updates, 0, static_cast<int>(size), doc_id) < 0
771 && rbt_search(query->doc_ids, &parent, &doc_id) == 0) {
772 ut_free(rbt_remove_node(query->doc_ids, parent.last));
773
774 ut_ad(query->total_size >=
775 SIZEOF_RBT_NODE_ADD + sizeof(fts_ranking_t));
776 query->total_size -= SIZEOF_RBT_NODE_ADD
777 + sizeof(fts_ranking_t);
778 }
779 }
780
781 /*******************************************************************//**
782 Find the doc id in the query set (but not in the deleted set) and artificially
783 downgrade or upgrade its ranking by a fixed value, which may push its ranking
784 below or above the normal range of 0 to 1. This is used for Boolean Search
785 operators such as the negation operator, which makes a word's contribution to
786 the row's relevance negative */
787 static
788 void
789 fts_query_change_ranking(
790 /*====================*/
791 fts_query_t* query, /*!< in: query instance */
792 doc_id_t doc_id, /*!< in: the doc id whose rank to adjust */
793 ibool downgrade) /*!< in: Whether to downgrade ranking */
794 {
795 ib_rbt_bound_t parent;
796 ulint size = ib_vector_size(query->deleted->doc_ids);
797 doc_id_t* updates = (doc_id_t*) query->deleted->doc_ids->data;
798
799 /* Check that the doc id is not deleted and is in our set. */
800 if (fts_bsearch(updates, 0, static_cast<int>(size), doc_id) < 0
801 && rbt_search(query->doc_ids, &parent, &doc_id) == 0) {
802
803 fts_ranking_t* ranking;
804
805 ranking = rbt_value(fts_ranking_t, parent.last);
806
807 ranking->rank += downgrade ? RANK_DOWNGRADE : RANK_UPGRADE;
808
809 /* Clamp the rank to the valid range after applying
810 RANK_DOWNGRADE (-1.0) or RANK_UPGRADE (1.0) */
811 if (ranking->rank >= 1.0F) {
812 ranking->rank = 1.0F;
813 } else if (ranking->rank <= -1.0F) {
814 ranking->rank = -1.0F;
815 }
816 }
817 }
818
819 /*******************************************************************//**
820 Check the doc id in the query set only if it's not in the
821 deleted array. The doc ids that were found are stored in
822 another rb tree (fts_query_t::intersection). */
823 static
824 void
825 fts_query_intersect_doc_id(
826 /*=======================*/
827 fts_query_t* query, /*!< in: query instance */
828 doc_id_t doc_id, /*!< in: the doc id to add */
829 fts_rank_t rank) /*!< in: if non-zero, it is the
830 rank associated with the doc_id */
831 {
832 ib_rbt_bound_t parent;
833 ulint size = ib_vector_size(query->deleted->doc_ids);
834 doc_id_t* updates = (doc_id_t*) query->deleted->doc_ids->data;
835 fts_ranking_t* ranking= NULL;
836
837 /* There are three types of intersect:
838 1. '+a': doc_ids is empty; add a doc into intersect if it matches 'a'.
839 2. 'a +b': docs matching 'a' are in doc_ids; add a doc into intersect
840 if it matches 'b'. If the doc is also in doc_ids, then change the
841 doc's rank, and add 'a' to the doc's words.
842 3. '+a +b': docs matching '+a' are in doc_ids; add a doc into intersect
843 if it matches 'b' and it's in doc_ids (multi_exist = true). */
844
845 /* Check that the doc id is not deleted */
846 if (fts_bsearch(updates, 0, static_cast<int>(size), doc_id) < 0) {
847 fts_ranking_t new_ranking;
848
849 if (rbt_search(query->doc_ids, &parent, &doc_id) != 0) {
850 if (query->multi_exist) {
851 return;
852 } else {
853 new_ranking.words = NULL;
854 }
855 } else {
856 ranking = rbt_value(fts_ranking_t, parent.last);
857
858 /* We've just checked the doc id before */
859 if (ranking->words == NULL) {
860 ut_ad(rbt_search(query->intersection, &parent,
861 ranking) == 0);
862 return;
863 }
864
865 /* Merge rank */
866 rank += ranking->rank;
867 if (rank >= 1.0F) {
868 rank = 1.0F;
869 } else if (rank <= -1.0F) {
870 rank = -1.0F;
871 }
872
873 /* Take words */
874 new_ranking.words = ranking->words;
875 new_ranking.words_len = ranking->words_len;
876 }
877
878 new_ranking.rank = rank;
879 new_ranking.doc_id = doc_id;
880
881 if (rbt_search(query->intersection, &parent,
882 &new_ranking) != 0) {
883 if (new_ranking.words == NULL) {
884 fts_ranking_words_create(query, &new_ranking);
885
886 query->total_size += RANKING_WORDS_INIT_LEN;
887 } else {
888 /* Note that the intersection has taken
889 ownership of the ranking data. */
890 ranking->words = NULL;
891 }
892
893 rbt_add_node(query->intersection,
894 &parent, &new_ranking);
895
896 query->total_size += SIZEOF_RBT_NODE_ADD
897 + sizeof(fts_ranking_t);
898 }
899 }
900 }
901
902 /*******************************************************************//**
903 Free the document ranking rb tree. */
904 static
905 void
906 fts_query_free_doc_ids(
907 /*===================*/
908 fts_query_t* query, /*!< in: query instance */
909 ib_rbt_t* doc_ids) /*!< in: rb tree to free */
910 {
911 const ib_rbt_node_t* node;
912
913 for (node = rbt_first(doc_ids); node; node = rbt_first(doc_ids)) {
914
915 fts_ranking_t* ranking;
916
917 ranking = rbt_value(fts_ranking_t, node);
918
919 if (ranking->words) {
920 ranking->words = NULL;
921 }
922
923 ut_free(rbt_remove_node(doc_ids, node));
924
925 ut_ad(query->total_size >=
926 SIZEOF_RBT_NODE_ADD + sizeof(fts_ranking_t));
927 query->total_size -= SIZEOF_RBT_NODE_ADD
928 + sizeof(fts_ranking_t);
929 }
930
931 rbt_free(doc_ids);
932
933 ut_ad(query->total_size >= SIZEOF_RBT_CREATE);
934 query->total_size -= SIZEOF_RBT_CREATE;
935 }
936
937 /*******************************************************************//**
938 Add the word to the document's "list" of matching words from
939 the query. We make a copy of the word from the query heap. */
940 static
941 void
942 fts_query_add_word_to_document(
943 /*===========================*/
944 fts_query_t* query, /*!< in: query to update */
945 doc_id_t doc_id, /*!< in: the document to update */
946 const fts_string_t* word) /*!< in: the token to add */
947 {
948 ib_rbt_bound_t parent;
949 fts_ranking_t* ranking = NULL;
950
951 if (query->flags == FTS_OPT_RANKING) {
952 return;
953 }
954
955 /* First we search the intersection RB tree as it could have
956 taken ownership of the words rb tree instance. */
957 if (query->intersection
958 && rbt_search(query->intersection, &parent, &doc_id) == 0) {
959
960 ranking = rbt_value(fts_ranking_t, parent.last);
961 }
962
963 if (ranking == NULL
964 && rbt_search(query->doc_ids, &parent, &doc_id) == 0) {
965
966 ranking = rbt_value(fts_ranking_t, parent.last);
967 }
968
969 if (ranking != NULL) {
970 fts_ranking_words_add(query, ranking, word);
971 }
972 }
973
974 /*******************************************************************//**
975 Check the node ilist. */
976 static
977 void
978 fts_query_check_node(
979 /*=================*/
980 fts_query_t* query, /*!< in: query to update */
981 const fts_string_t* token, /*!< in: the token to search */
982 const fts_node_t* node) /*!< in: node to check */
983 {
984 /* Skip nodes whose doc ids are out of range. */
985 if (query->oper == FTS_EXIST
986 && ((query->upper_doc_id > 0
987 && node->first_doc_id > query->upper_doc_id)
988 || (query->lower_doc_id > 0
989 && node->last_doc_id < query->lower_doc_id))) {
990
991 /* Ignore */
992
993 } else {
994 int ret;
995 ib_rbt_bound_t parent;
996 ulint ilist_size = node->ilist_size;
997 fts_word_freq_t*word_freqs;
998
999 /* The word must exist. */
1000 ret = rbt_search(query->word_freqs, &parent, token);
1001 ut_a(ret == 0);
1002
1003 word_freqs = rbt_value(fts_word_freq_t, parent.last);
1004
1005 query->error = fts_query_filter_doc_ids(
1006 query, token, word_freqs, node,
1007 node->ilist, ilist_size, TRUE);
1008 }
1009 }
1010
1011 /*****************************************************************//**
1012 Search index cache for word with wildcard match.
1013 @return number of words matched */
1014 static
1015 ulint
1016 fts_cache_find_wildcard(
1017 /*====================*/
1018 fts_query_t* query, /*!< in: query instance */
1019 const fts_index_cache_t*index_cache, /*!< in: cache to search */
1020 const fts_string_t* token) /*!< in: token to search */
1021 {
1022 ib_rbt_bound_t parent;
1023 const ib_vector_t* nodes = NULL;
1024 fts_string_t srch_text;
1025 byte term[FTS_MAX_WORD_LEN + 1];
1026 ulint num_word = 0;
1027
1028 srch_text.f_len = (token->f_str[token->f_len - 1] == '%')
1029 ? token->f_len - 1
1030 : token->f_len;
1031
1032 strncpy((char*) term, (char*) token->f_str, srch_text.f_len);
1033 term[srch_text.f_len] = '\0';
1034 srch_text.f_str = term;
1035
1036 /* Lookup the word in the rb tree */
1037 if (rbt_search_cmp(index_cache->words, &parent, &srch_text, NULL,
1038 innobase_fts_text_cmp_prefix) == 0) {
1039 const fts_tokenizer_word_t* word;
1040 ulint i;
1041 const ib_rbt_node_t* cur_node;
1042 ibool forward = FALSE;
1043
1044 word = rbt_value(fts_tokenizer_word_t, parent.last);
1045 cur_node = parent.last;
1046
1047 while (innobase_fts_text_cmp_prefix(
1048 index_cache->charset, &srch_text, &word->text) == 0) {
1049
1050 nodes = word->nodes;
1051
1052 for (i = 0; nodes && i < ib_vector_size(nodes); ++i) {
1053 int ret;
1054 const fts_node_t* node;
1055 ib_rbt_bound_t freq_parent;
1056 fts_word_freq_t* word_freqs;
1057
1058 node = static_cast<const fts_node_t*>(
1059 ib_vector_get_const(nodes, i));
1060
1061 ret = rbt_search(query->word_freqs,
1062 &freq_parent,
1063 &srch_text);
1064
1065 ut_a(ret == 0);
1066
1067 word_freqs = rbt_value(
1068 fts_word_freq_t,
1069 freq_parent.last);
1070
1071 query->error = fts_query_filter_doc_ids(
1072 query, &srch_text,
1073 word_freqs, node,
1074 node->ilist, node->ilist_size, TRUE);
1075
1076 if (query->error != DB_SUCCESS) {
1077 return(0);
1078 }
1079 }
1080
1081 num_word++;
1082
1083 if (!forward) {
1084 cur_node = rbt_prev(
1085 index_cache->words, cur_node);
1086 } else {
1087 cont_search:
1088 cur_node = rbt_next(
1089 index_cache->words, cur_node);
1090 }
1091
1092 if (!cur_node) {
1093 break;
1094 }
1095
1096 word = rbt_value(fts_tokenizer_word_t, cur_node);
1097 }
1098
1099 if (!forward) {
1100 forward = TRUE;
1101 cur_node = parent.last;
1102 goto cont_search;
1103 }
1104 }
1105
1106 return(num_word);
1107 }
1108
1109 /*****************************************************************//**
1110 Set difference.
1111 @return DB_SUCCESS if all go well */
1112 static MY_ATTRIBUTE((nonnull, warn_unused_result))
1113 dberr_t
1114 fts_query_difference(
1115 /*=================*/
1116 fts_query_t* query, /*!< in: query instance */
1117 const fts_string_t* token) /*!< in: token to search */
1118 {
1119 ulint n_doc_ids= 0;
1120 trx_t* trx = query->trx;
1121 dict_table_t* table = query->index->table;
1122
1123 ut_a(query->oper == FTS_IGNORE);
1124
1125 #ifdef FTS_INTERNAL_DIAG_PRINT
1126 {
1127 ib::info out;
1128 out << "DIFFERENCE: Searching: '";
1129 out.write(token->f_str, token->f_len);
1130 out << "'";
1131 }
1132 #endif
1133
1134 if (query->doc_ids) {
1135 n_doc_ids = rbt_size(query->doc_ids);
1136 }
1137
1138 /* There is nothing we can subtract from an empty set. */
1139 if (query->doc_ids && !rbt_empty(query->doc_ids)) {
1140 ulint i;
1141 fts_fetch_t fetch;
1142 const ib_vector_t* nodes;
1143 const fts_index_cache_t*index_cache;
1144 que_t* graph = NULL;
1145 fts_cache_t* cache = table->fts->cache;
1146 dberr_t error;
1147
1148 rw_lock_x_lock(&cache->lock);
1149
1150 index_cache = fts_find_index_cache(cache, query->index);
1151
1152 /* Must find the index cache */
1153 ut_a(index_cache != NULL);
1154
1155 /* Search the cache for a matching word first. */
1156 if (query->cur_node->term.wildcard
1157 && query->flags != FTS_PROXIMITY
1158 && query->flags != FTS_PHRASE) {
1159 fts_cache_find_wildcard(query, index_cache, token);
1160 } else {
1161 nodes = fts_cache_find_word(index_cache, token);
1162
1163 for (i = 0; nodes && i < ib_vector_size(nodes)
1164 && query->error == DB_SUCCESS; ++i) {
1165 const fts_node_t* node;
1166
1167 node = static_cast<const fts_node_t*>(
1168 ib_vector_get_const(nodes, i));
1169
1170 fts_query_check_node(query, token, node);
1171 }
1172 }
1173
1174 rw_lock_x_unlock(&cache->lock);
1175
1176 /* error is passed by 'query->error' */
1177 if (query->error != DB_SUCCESS) {
1178 ut_ad(query->error == DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
1179 return(query->error);
1180 }
1181
1182 /* Setup the callback args for filtering and
1183 consolidating the ilist. */
1184 fetch.read_arg = query;
1185 fetch.read_record = fts_query_index_fetch_nodes;
1186
1187 error = fts_index_fetch_nodes(
1188 trx, &graph, &query->fts_index_table, token, &fetch);
1189
1190 /* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */
1191 ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS));
1192 if (error != DB_SUCCESS) {
1193 query->error = error;
1194 }
1195
1196 fts_que_graph_free(graph);
1197 }
1198
1199 /* The size can't increase. */
1200 ut_a(rbt_size(query->doc_ids) <= n_doc_ids);
1201
1202 return(query->error);
1203 }
1204
1205 /*****************************************************************//**
1206 Intersect the token doc ids with the current set.
1207 @return DB_SUCCESS if all go well */
1208 static MY_ATTRIBUTE((nonnull, warn_unused_result))
1209 dberr_t
1210 fts_query_intersect(
1211 /*================*/
1212 fts_query_t* query, /*!< in: query instance */
1213 const fts_string_t* token) /*!< in: the token to search */
1214 {
1215 trx_t* trx = query->trx;
1216 dict_table_t* table = query->index->table;
1217
1218 ut_a(query->oper == FTS_EXIST);
1219
1220 #ifdef FTS_INTERNAL_DIAG_PRINT
1221 {
1222 ib::info out;
1223 out << "INTERSECT: Searching: '";
1224 out.write(token->f_str, token->f_len);
1225 out << "'";
1226 }
1227 #endif
1228
1229 /* If the current doc id set is empty and multi_exist is true,
1230 we know in advance that the intersection will be empty, so skip the search. */
1231 if (!(rbt_empty(query->doc_ids) && query->multi_exist)) {
1232 ulint n_doc_ids = 0;
1233 ulint i;
1234 fts_fetch_t fetch;
1235 const ib_vector_t* nodes;
1236 const fts_index_cache_t*index_cache;
1237 que_t* graph = NULL;
1238 fts_cache_t* cache = table->fts->cache;
1239 dberr_t error;
1240
1241 ut_a(!query->intersection);
1242
1243 n_doc_ids = rbt_size(query->doc_ids);
1244
1245 /* Create the rb tree that will hold the doc ids of
1246 the intersection. */
1247 query->intersection = rbt_create(
1248 sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
1249
1250 query->total_size += SIZEOF_RBT_CREATE;
1251
1252 /* This is to avoid decompressing the ilist if the
1253 node's ilist doc ids are out of range. */
1254 if (!rbt_empty(query->doc_ids) && query->multi_exist) {
1255 const ib_rbt_node_t* node;
1256 doc_id_t* doc_id;
1257
1258 node = rbt_first(query->doc_ids);
1259 doc_id = rbt_value(doc_id_t, node);
1260 query->lower_doc_id = *doc_id;
1261
1262 node = rbt_last(query->doc_ids);
1263 doc_id = rbt_value(doc_id_t, node);
1264 query->upper_doc_id = *doc_id;
1265
1266 } else {
1267 query->lower_doc_id = 0;
1268 query->upper_doc_id = 0;
1269 }
1270
1271 /* Search the cache for a matching word first. */
1272
1273 rw_lock_x_lock(&cache->lock);
1274
1275 /* Search for the index specific cache. */
1276 index_cache = fts_find_index_cache(cache, query->index);
1277
1278 /* Must find the index cache. */
1279 ut_a(index_cache != NULL);
1280
1281 if (query->cur_node->term.wildcard) {
1282 /* Wildcard search the index cache */
1283 fts_cache_find_wildcard(query, index_cache, token);
1284 } else {
1285 nodes = fts_cache_find_word(index_cache, token);
1286
1287 for (i = 0; nodes && i < ib_vector_size(nodes)
1288 && query->error == DB_SUCCESS; ++i) {
1289 const fts_node_t* node;
1290
1291 node = static_cast<const fts_node_t*>(
1292 ib_vector_get_const(nodes, i));
1293
1294 fts_query_check_node(query, token, node);
1295 }
1296 }
1297
1298 rw_lock_x_unlock(&cache->lock);
1299
1300 /* error is passed by 'query->error' */
1301 if (query->error != DB_SUCCESS) {
1302 ut_ad(query->error == DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
1303 return(query->error);
1304 }
1305
1306 /* Setup the callback args for filtering and
1307 consolidating the ilist. */
1308 fetch.read_arg = query;
1309 fetch.read_record = fts_query_index_fetch_nodes;
1310
1311 error = fts_index_fetch_nodes(
1312 trx, &graph, &query->fts_index_table, token, &fetch);
1313
1314 /* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */
1315 ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS));
1316 if (error != DB_SUCCESS) {
1317 query->error = error;
1318 }
1319
1320 fts_que_graph_free(graph);
1321
1322 if (query->error == DB_SUCCESS) {
1323 /* Make the intersection (rb tree) the current doc id
1324 set and free the old set. */
1325 fts_query_free_doc_ids(query, query->doc_ids);
1326 query->doc_ids = query->intersection;
1327 query->intersection = NULL;
1328
1329 ut_a(!query->multi_exist || (query->multi_exist
1330 && rbt_size(query->doc_ids) <= n_doc_ids));
1331 }
1332 }
1333
1334 return(query->error);
1335 }
1336
1337 /*****************************************************************//**
1338 Query index cache.
1339 @return DB_SUCCESS if all go well */
1340 static
1341 dberr_t
1342 fts_query_cache(
1343 /*============*/
1344 fts_query_t* query, /*!< in/out: query instance */
1345 const fts_string_t* token) /*!< in: token to search */
1346 {
1347 const fts_index_cache_t*index_cache;
1348 dict_table_t* table = query->index->table;
1349 fts_cache_t* cache = table->fts->cache;
1350
1351 /* Search the cache for a matching word first. */
1352 rw_lock_x_lock(&cache->lock);
1353
1354 /* Search for the index specific cache. */
1355 index_cache = fts_find_index_cache(cache, query->index);
1356
1357 /* Must find the index cache. */
1358 ut_a(index_cache != NULL);
1359
1360 if (query->cur_node->term.wildcard
1361 && query->flags != FTS_PROXIMITY
1362 && query->flags != FTS_PHRASE) {
1363 /* Wildcard search the index cache */
1364 fts_cache_find_wildcard(query, index_cache, token);
1365 } else {
1366 const ib_vector_t* nodes;
1367 ulint i;
1368
1369 nodes = fts_cache_find_word(index_cache, token);
1370
1371 for (i = 0; nodes && i < ib_vector_size(nodes)
1372 && query->error == DB_SUCCESS; ++i) {
1373 const fts_node_t* node;
1374
1375 node = static_cast<const fts_node_t*>(
1376 ib_vector_get_const(nodes, i));
1377
1378 fts_query_check_node(query, token, node);
1379 }
1380 }
1381
1382 rw_lock_x_unlock(&cache->lock);
1383
1384 return(query->error);
1385 }
1386
1387 /*****************************************************************//**
1388 Set union.
1389 @return DB_SUCCESS if all go well */
1390 static MY_ATTRIBUTE((nonnull, warn_unused_result))
1391 dberr_t
1392 fts_query_union(
1393 /*============*/
1394 fts_query_t* query, /*!< in: query instance */
1395 fts_string_t* token) /*!< in: token to search */
1396 {
1397 fts_fetch_t fetch;
1398 ulint n_doc_ids = 0;
1399 trx_t* trx = query->trx;
1400 que_t* graph = NULL;
1401 dberr_t error;
1402
1403 ut_a(query->oper == FTS_NONE || query->oper == FTS_DECR_RATING ||
1404 query->oper == FTS_NEGATE || query->oper == FTS_INCR_RATING);
1405
1406 #ifdef FTS_INTERNAL_DIAG_PRINT
1407 {
1408 ib::info out;
1409 out << "UNION: Searching: '";
1410 out.write(token->f_str, token->f_len);
1411 out << "'";
1412 }
1413 #endif
1414
1415 if (query->doc_ids) {
1416 n_doc_ids = rbt_size(query->doc_ids);
1417 }
1418
1419 if (token->f_len == 0) {
1420 return(query->error);
1421 }
1422
1423 fts_query_cache(query, token);
1424
1425 /* Setup the callback args for filtering and
1426 consolidating the ilist. */
1427 fetch.read_arg = query;
1428 fetch.read_record = fts_query_index_fetch_nodes;
1429
1430 /* Read the nodes from disk. */
1431 error = fts_index_fetch_nodes(
1432 trx, &graph, &query->fts_index_table, token, &fetch);
1433
1434 /* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */
1435 ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS));
1436 if (error != DB_SUCCESS) {
1437 query->error = error;
1438 }
1439
1440 fts_que_graph_free(graph);
1441
1442 if (query->error == DB_SUCCESS) {
1443
1444 /* The size can't decrease. */
1445 ut_a(rbt_size(query->doc_ids) >= n_doc_ids);
1446
1447 /* Calculate the number of doc ids that were added to
1448 the current doc id set. */
1449 if (query->doc_ids) {
1450 n_doc_ids = rbt_size(query->doc_ids) - n_doc_ids;
1451 }
1452 }
1453
1454 return(query->error);
1455 }
1456
1457 /*****************************************************************//**
1458 Depending upon the current query operator, process the doc id.
1459 @return DB_SUCCESS if all goes well,
1460 or DB_FTS_EXCEED_RESULT_CACHE_LIMIT */
1461 static
1462 dberr_t
1463 fts_query_process_doc_id(
1464 /*=====================*/
1465 fts_query_t* query, /*!< in: query instance */
1466 doc_id_t doc_id, /*!< in: doc id to process */
1467 fts_rank_t rank) /*!< in: if non-zero, it is the
1468 rank associated with the doc_id */
1469 {
1470 if (query->flags == FTS_OPT_RANKING) {
1471 return(DB_SUCCESS);
1472 }
1473
1474 switch (query->oper) {
1475 case FTS_NONE:
1476 fts_query_union_doc_id(query, doc_id, rank);
1477 break;
1478
1479 case FTS_EXIST:
1480 fts_query_intersect_doc_id(query, doc_id, rank);
1481 break;
1482
1483 case FTS_IGNORE:
1484 fts_query_remove_doc_id(query, doc_id);
1485 break;
1486
1487 case FTS_NEGATE:
1488 fts_query_change_ranking(query, doc_id, TRUE);
1489 break;
1490
1491 case FTS_DECR_RATING:
1492 fts_query_union_doc_id(query, doc_id, rank);
1493 fts_query_change_ranking(query, doc_id, TRUE);
1494 break;
1495
1496 case FTS_INCR_RATING:
1497 fts_query_union_doc_id(query, doc_id, rank);
1498 fts_query_change_ranking(query, doc_id, FALSE);
1499 break;
1500
1501 default:
1502 ut_error;
1503 }
1504
1505 if (query->total_size > fts_result_cache_limit) {
1506 return(DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
1507 } else {
1508 return(DB_SUCCESS);
1509 }
1510 }
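
/*
Summary of fts_query_process_doc_id(): how each operator updates the
current result set (see the helpers above):

	FTS_NONE	union: add the doc id with the given rank
	FTS_EXIST	intersection: add the doc id to query->intersection
	FTS_IGNORE	difference: remove the doc id from the set
	FTS_NEGATE	keep the set, downgrade the doc's rank
	FTS_DECR_RATING	union, then downgrade the rank
	FTS_INCR_RATING	union, then upgrade the rank
*/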
1511
1512 /*****************************************************************//**
1513 Merge two result sets. */
1514 static
1515 dberr_t
1516 fts_merge_doc_ids(
1517 /*==============*/
1518 fts_query_t* query, /*!< in,out: query instance */
1519 const ib_rbt_t* doc_ids) /*!< in: result set to merge */
1520 {
1521 const ib_rbt_node_t* node;
1522
1523 DBUG_ENTER("fts_merge_doc_ids");
1524
1525 ut_a(!query->intersection);
1526
1527 /* To process FTS_EXIST operation (intersection), we need
1528 to create a new result set for fts_query_intersect(). */
1529 if (query->oper == FTS_EXIST) {
1530
1531 query->intersection = rbt_create(
1532 sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
1533
1534 query->total_size += SIZEOF_RBT_CREATE;
1535 }
1536
1537 /* Merge the elements to the result set. */
1538 for (node = rbt_first(doc_ids); node; node = rbt_next(doc_ids, node)) {
1539 fts_ranking_t* ranking;
1540 ulint pos = 0;
1541 fts_string_t word;
1542
1543 ranking = rbt_value(fts_ranking_t, node);
1544
1545 query->error = fts_query_process_doc_id(
1546 query, ranking->doc_id, ranking->rank);
1547
1548 if (query->error != DB_SUCCESS) {
1549 DBUG_RETURN(query->error);
1550 }
1551
1552 /* Merge words. Don't need to take operator into account. */
1553 ut_a(ranking->words);
1554 while (fts_ranking_words_get_next(query, ranking, &pos, &word)) {
1555 fts_query_add_word_to_document(query, ranking->doc_id,
1556 &word);
1557 }
1558 }
1559
1560 /* If it is an intersection operation, reset query->doc_ids
1561 to query->intersection and free the old result list. */
1562 if (query->oper == FTS_EXIST && query->intersection != NULL) {
1563 fts_query_free_doc_ids(query, query->doc_ids);
1564 query->doc_ids = query->intersection;
1565 query->intersection = NULL;
1566 }
1567
1568 DBUG_RETURN(DB_SUCCESS);
1569 }
1570
1571 /*****************************************************************//**
1572 Skip non-whitespace in a string. Move ptr to the next word boundary.
1573 @return pointer to first whitespace character or end */
1574 UNIV_INLINE
1575 byte*
1576 fts_query_skip_word(
1577 /*================*/
1578 byte* ptr, /*!< in: start of scan */
1579 const byte* end) /*!< in: pointer to end of string */
1580 {
1581 /* TODO: Does this have to be UTF-8 too ? */
1582 while (ptr < end && !(ispunct(*ptr) || isspace(*ptr))) {
1583 ++ptr;
1584 }
1585
1586 return(ptr);
1587 }
1588
1589 /*****************************************************************//**
1590 Check whether the remaining terms in the phrase match the text.
1591 @return TRUE if matched else FALSE */
1592 static
1593 ibool
1594 fts_query_match_phrase_terms(
1595 /*=========================*/
1596 fts_phrase_t* phrase, /*!< in: phrase to match */
1597 byte** start, /*!< in/out: text to search, we can't
1598 make this const because we need to
1599 first convert the string to
1600 lowercase */
1601 const byte* end, /*!< in: pointer to the end of
1602 the string to search */
1603 mem_heap_t* heap) /*!< in: heap */
1604 {
1605 ulint i;
1606 byte* ptr = *start;
1607 const ib_vector_t* tokens = phrase->tokens;
1608 ulint distance = phrase->distance;
1609
1610 /* We check only from the second term onwards, since the first
1611 must have matched otherwise we wouldn't be here. */
1612 for (i = 1; ptr < end && i < ib_vector_size(tokens); /* No op */) {
1613 fts_string_t match;
1614 fts_string_t cmp_str;
1615 const fts_string_t* token;
1616 int result;
1617 ulint ret;
1618
1619 ret = innobase_mysql_fts_get_token(
1620 phrase->charset, ptr,
1621 const_cast<byte*>(end), &match);
1622
1623 if (match.f_len > 0) {
1624 /* Get next token to match. */
1625 token = static_cast<const fts_string_t*>(
1626 ib_vector_get_const(tokens, i));
1627
1628 fts_string_dup(&cmp_str, &match, heap);
1629
1630 result = innobase_fts_text_case_cmp(
1631 phrase->charset, token, &cmp_str);
1632
1633 /* Skip the rest of the tokens if this one doesn't
1634 match and the proximity distance is exceeded. */
1635 if (result
1636 && (distance == ULINT_UNDEFINED
1637 || distance == 0)) {
1638
1639 break;
1640 }
1641
1642 /* This token matched move to the next token. */
1643 if (result == 0) {
1644 /* Advance the text to search by the length
1645 of the last token. */
1646 ptr += ret;
1647
1648 /* Advance to the next token. */
1649 ++i;
1650 } else {
1651
1652 ut_a(distance != ULINT_UNDEFINED);
1653
1654 ptr = fts_query_skip_word(ptr, end);
1655 }
1656
1657 /* Distance can be 0 for exact matches. */
1658 if (distance != ULINT_UNDEFINED && distance > 0) {
1659 --distance;
1660 }
1661 } else {
1662 ptr += ret;
1663 }
1664 }
1665
1666 *start = ptr;
1667
1668 /* Can't be greater than the number of elements. */
1669 ut_a(i <= ib_vector_size(tokens));
1670
1671 /* This is the case for multiple words. */
1672 if (i == ib_vector_size(tokens)) {
1673 phrase->found = TRUE;
1674 }
1675
1676 return(phrase->found);
1677 }
1678
1679 /*****************************************************************//**
1680 Callback function to count the number of words in position ranges,
1681 and see whether the word count is within the specified "phrase->distance".
1682 @return true if the number of words is within the "distance" */
1683 static
1684 bool
1685 fts_proximity_is_word_in_range(
1686 /*===========================*/
1687 const fts_phrase_t*
1688 phrase, /*!< in: phrase with the search info */
1689 byte* start, /*!< in: text to search */
1690 ulint total_len) /*!< in: length of text */
1691 {
1692 fts_proximity_t* proximity_pos = phrase->proximity_pos;
1693
1694 ut_ad(proximity_pos->n_pos == proximity_pos->min_pos.size());
1695 ut_ad(proximity_pos->n_pos == proximity_pos->max_pos.size());
1696
1697 /* Search each matched position pair (with min and max positions)
1698 and count the number of words in the range */
1699 for (ulint i = 0; i < proximity_pos->n_pos; i++) {
1700 ulint cur_pos = proximity_pos->min_pos[i];
1701 ulint n_word = 0;
1702
1703 ut_ad(proximity_pos->max_pos[i] <= total_len);
1704
1705 /* Walk through words in the range and count them */
1706 while (cur_pos <= proximity_pos->max_pos[i]) {
1707 ulint len;
1708 fts_string_t str;
1709
1710 len = innobase_mysql_fts_get_token(
1711 phrase->charset,
1712 start + cur_pos,
1713 start + total_len, &str);
1714
1715 if (len == 0) {
1716 break;
1717 }
1718
1719 /* Advances position with "len" bytes */
1720 cur_pos += len;
1721
1722 /* Record the number of words */
1723 if (str.f_n_char > 0) {
1724 n_word++;
1725 }
1726
1727 if (n_word > phrase->distance) {
1728 break;
1729 }
1730 }
1731
1732 /* Check if the number of words is within the specified
1733 "distance" */
1734 if (n_word && n_word <= phrase->distance) {
1735 return(true);
1736 }
1737 }
1738
1739 return(false);
1740 }
1741
1742 /*****************************************************************//**
1743 FTS plugin parser 'mysql_add_word' callback function for phrase match.
1744 Refer to 'st_mysql_ftparser_param' for more detail.
1745 @return 0 if match, or return non-zero */
1746 static
1747 int
1748 fts_query_match_phrase_add_word_for_parser(
1749 /*=======================================*/
1750 MYSQL_FTPARSER_PARAM* param, /*!< in: parser param */
1751 const char* word, /*!< in: token */
1752 int word_len, /*!< in: token length */
1753 MYSQL_FTPARSER_BOOLEAN_INFO*)
1754 {
1755 fts_phrase_param_t* phrase_param;
1756 fts_phrase_t* phrase;
1757 const ib_vector_t* tokens;
1758 fts_string_t match;
1759 fts_string_t cmp_str;
1760 const fts_string_t* token;
1761 int result;
1762 mem_heap_t* heap;
1763
1764 phrase_param = static_cast<fts_phrase_param_t*>(param->mysql_ftparam);
1765 heap = phrase_param->heap;
1766 phrase = phrase_param->phrase;
1767 tokens = phrase->tokens;
1768
1769 /* In case plugin parser doesn't check return value */
1770 if (phrase_param->token_index == ib_vector_size(tokens)) {
1771 return(1);
1772 }
1773
1774 match.f_str = (uchar *)(word);
1775 match.f_len = ulint(word_len);
1776 match.f_n_char= fts_get_token_size(phrase->charset, word, match.f_len);
1777
1778 if (match.f_len > 0) {
1779 /* Get next token to match. */
1780 ut_a(phrase_param->token_index < ib_vector_size(tokens));
1781 token = static_cast<const fts_string_t*>(
1782 ib_vector_get_const(tokens, phrase_param->token_index));
1783
1784 fts_string_dup(&cmp_str, &match, heap);
1785
1786 result = innobase_fts_text_case_cmp(
1787 phrase->charset, token, &cmp_str);
1788
1789 if (result == 0) {
1790 phrase_param->token_index++;
1791 } else {
1792 return(1);
1793 }
1794 }
1795
1796 /* Can't be greater than the number of elements. */
1797 ut_a(phrase_param->token_index <= ib_vector_size(tokens));
1798
1799 /* This is the case for multiple words. */
1800 if (phrase_param->token_index == ib_vector_size(tokens)) {
1801 phrase->found = TRUE;
1802 }
1803
1804 return(static_cast<int>(phrase->found));
1805 }
1806
1807 /*****************************************************************//**
1808 Check whether the terms in the phrase match the text.
1809 @return TRUE if matched else FALSE */
1810 static
1811 ibool
1812 fts_query_match_phrase_terms_by_parser(
1813 /*===================================*/
1814 fts_phrase_param_t* phrase_param, /* in/out: phrase param */
1815 st_mysql_ftparser* parser, /* in: plugin fts parser */
1816 byte* text, /* in: text to check */
1817 ulint len) /* in: text length */
1818 {
1819 MYSQL_FTPARSER_PARAM param;
1820
1821 ut_a(parser);
1822
1823 	/* Set up the parameters for the plugin parser */
1824 param.mysql_parse = fts_tokenize_document_internal;
1825 param.mysql_add_word = fts_query_match_phrase_add_word_for_parser;
1826 param.mysql_ftparam = phrase_param;
1827 param.cs = phrase_param->phrase->charset;
1828 param.doc = reinterpret_cast<char*>(text);
1829 param.length = static_cast<int>(len);
1830 param.mode= MYSQL_FTPARSER_WITH_STOPWORDS;
1831
1832 	PARSER_INIT(parser, &param);
1833 	parser->parse(&param);
1834 	PARSER_DEINIT(parser, &param);
1835
1836 return(phrase_param->phrase->found);
1837 }
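
/* Illustrative sketch (assumption, not part of the build): a plugin parser
invoked through PARSER_INIT()/parse()/PARSER_DEINIT() above is expected to
split param->doc into tokens and hand each one back via
param->mysql_add_word(param, token, token_len, bool_info).  With
fts_query_match_phrase_add_word_for_parser() installed as that callback,
the tokens are compared in order against the phrase tokens, and parsing
stops as soon as a token mismatches or the whole phrase has been matched,
which is why only phrase->found needs to be returned here. */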
1838
1839 /*****************************************************************//**
1840 Search the document text for the phrase, starting at each recorded position of the first token.
1841 @return TRUE if matched else FALSE */
1842 static
1843 ibool
1844 fts_query_match_phrase(
1845 /*===================*/
1846 fts_phrase_t* phrase, /*!< in: phrase to match */
1847 byte* start, /*!< in: text to search, we can't make
1848 					this const because we need to first
1849 convert the string to lowercase */
1850 ulint cur_len, /*!< in: length of text */
1851 ulint prev_len, /*!< in: total length for searched
1852 doc fields*/
1853 	mem_heap_t*	heap)		/*!< in: heap to use for temporary allocations */
1854 {
1855 ulint i;
1856 const fts_string_t* first;
1857 const byte* end = start + cur_len;
1858 const ib_vector_t* tokens = phrase->tokens;
1859 const ib_vector_t* positions = phrase->match->positions;
1860
1861 ut_a(!phrase->found);
1862 ut_a(phrase->match->doc_id > 0);
1863 ut_a(ib_vector_size(tokens) > 0);
1864 ut_a(ib_vector_size(positions) > 0);
1865
1866 first = static_cast<const fts_string_t*>(
1867 ib_vector_get_const(tokens, 0));
1868
1869 ut_a(phrase->match->start < ib_vector_size(positions));
1870
1871 for (i = phrase->match->start; i < ib_vector_size(positions); ++i) {
1872 ulint pos;
1873 byte* ptr = start;
1874
1875 pos = *(ulint*) ib_vector_get_const(positions, i);
1876
1877 if (pos == ULINT_UNDEFINED) {
1878 break;
1879 }
1880
1881 if (pos < prev_len) {
1882 continue;
1883 }
1884
1885 /* Document positions are calculated from the beginning
1886 of the first field, need to save the length for each
1887 searched field to adjust the doc position when search
1888 phrases. */
1889 pos -= prev_len;
1890 ptr = start + pos;
1891
1892 /* Within limits ? */
1893 if (ptr >= end) {
1894 break;
1895 }
1896
1897 if (phrase->parser) {
1898 fts_phrase_param_t phrase_param;
1899
1900 phrase_param.phrase = phrase;
1901 phrase_param.token_index = 0;
1902 phrase_param.heap = heap;
1903
1904 if (fts_query_match_phrase_terms_by_parser(
1905 &phrase_param,
1906 phrase->parser,
1907 ptr,
1908 ulint(end - ptr))) {
1909 break;
1910 }
1911 } else {
1912 fts_string_t match;
1913 fts_string_t cmp_str;
1914 ulint ret;
1915
1916 match.f_str = ptr;
1917 ret = innobase_mysql_fts_get_token(
1918 phrase->charset, start + pos,
1919 const_cast<byte*>(end), &match);
1920
1921 if (match.f_len == 0) {
1922 break;
1923 }
1924
1925 fts_string_dup(&cmp_str, &match, heap);
1926
1927 if (innobase_fts_text_case_cmp(
1928 phrase->charset, first, &cmp_str) == 0) {
1929
1930 /* This is the case for the single word
1931 in the phrase. */
1932 if (ib_vector_size(phrase->tokens) == 1) {
1933 phrase->found = TRUE;
1934 break;
1935 }
1936
1937 ptr += ret;
1938
1939 /* Match the remaining terms in the phrase. */
1940 if (fts_query_match_phrase_terms(phrase, &ptr,
1941 end, heap)) {
1942 break;
1943 }
1944 }
1945 }
1946 }
1947
1948 return(phrase->found);
1949 }
1950
1951 /*****************************************************************//**
1952 Callback function to fetch and search the document.
1953 @return whether the phrase is found */
1954 static
1955 ibool
1956 fts_query_fetch_document(
1957 /*=====================*/
1958 void* row, /*!< in: sel_node_t* */
1959 void* user_arg) /*!< in: fts_doc_t* */
1960 {
1961
1962 que_node_t* exp;
1963 sel_node_t* node = static_cast<sel_node_t*>(row);
1964 fts_phrase_t* phrase = static_cast<fts_phrase_t*>(user_arg);
1965 ulint prev_len = 0;
1966 ulint total_len = 0;
1967 byte* document_text = NULL;
1968
1969 exp = node->select_list;
1970
1971 phrase->found = FALSE;
1972
1973 /* For proximity search, we will need to get the whole document
1974 from all fields, so first count the total length of the document
1975 from all the fields */
1976 if (phrase->proximity_pos) {
1977 while (exp) {
1978 ulint field_len;
1979 dfield_t* dfield = que_node_get_val(exp);
1980 byte* data = static_cast<byte*>(
1981 dfield_get_data(dfield));
1982
1983 if (dfield_is_ext(dfield)) {
1984 ulint local_len = dfield_get_len(dfield);
1985
1986 local_len -= BTR_EXTERN_FIELD_REF_SIZE;
1987
1988 field_len = mach_read_from_4(
1989 data + local_len + BTR_EXTERN_LEN + 4);
1990 } else {
1991 field_len = dfield_get_len(dfield);
1992 }
1993
1994 if (field_len != UNIV_SQL_NULL) {
1995 total_len += field_len + 1;
1996 }
1997
1998 exp = que_node_get_next(exp);
1999 }
2000
2001 document_text = static_cast<byte*>(mem_heap_zalloc(
2002 phrase->heap, total_len));
2003
2004 if (!document_text) {
2005 return(FALSE);
2006 }
2007 }
2008
2009 exp = node->select_list;
2010
2011 while (exp) {
2012 dfield_t* dfield = que_node_get_val(exp);
2013 byte* data = static_cast<byte*>(
2014 dfield_get_data(dfield));
2015 ulint cur_len;
2016
2017 if (dfield_is_ext(dfield)) {
2018 data = btr_copy_externally_stored_field(
2019 &cur_len, data, phrase->zip_size,
2020 dfield_get_len(dfield), phrase->heap);
2021 } else {
2022 cur_len = dfield_get_len(dfield);
2023 }
2024
2025 if (cur_len != UNIV_SQL_NULL && cur_len != 0) {
2026 if (phrase->proximity_pos) {
2027 ut_ad(prev_len + cur_len <= total_len);
2028 memcpy(document_text + prev_len, data, cur_len);
2029 } else {
2030 /* For phrase search */
2031 phrase->found =
2032 fts_query_match_phrase(
2033 phrase,
2034 static_cast<byte*>(data),
2035 cur_len, prev_len,
2036 phrase->heap);
2037 }
2038
2039 /* Document positions are calculated from the beginning
2040 of the first field, need to save the length for each
2041 searched field to adjust the doc position when search
2042 phrases. */
2043 prev_len += cur_len + 1;
2044 }
2045
2046 if (phrase->found) {
2047 break;
2048 }
2049
2050 exp = que_node_get_next(exp);
2051 }
2052
2053 if (phrase->proximity_pos) {
2054 ut_ad(prev_len <= total_len);
2055
2056 phrase->found = fts_proximity_is_word_in_range(
2057 phrase, document_text, total_len);
2058 }
2059
2060 return(phrase->found);
2061 }
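
/* Illustrative note (assumed example, not from the original source): word
positions in the FTS index are byte offsets into the concatenation of all
indexed columns, with one separator byte between columns (hence the "+ 1"
added to prev_len above).  If the first indexed column is "hello world"
(11 bytes) and the second is "great world", a stored position of 18 for
"world" falls in the second column and maps to byte offset
18 - (11 + 1) = 6 within that column, which is exactly the adjustment
fts_query_match_phrase() performs using prev_len. */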
2062
2063 #if 0
2064 /********************************************************************
2065 Callback function to check whether a record was found or not. */
2066 static
2067 ibool
2068 fts_query_select(
2069 /*=============*/
2070 void* row, /*!< in: sel_node_t* */
2071 void* user_arg) /*!< in: fts_doc_t* */
2072 {
2073 int i;
2074 que_node_t* exp;
2075 sel_node_t* node = row;
2076 fts_select_t* select = user_arg;
2077
2078 ut_a(select->word_freq);
2079 ut_a(select->word_freq->doc_freqs);
2080
2081 exp = node->select_list;
2082
2083 for (i = 0; exp && !select->found; ++i) {
2084 dfield_t* dfield = que_node_get_val(exp);
2085 void* data = dfield_get_data(dfield);
2086 ulint len = dfield_get_len(dfield);
2087
2088 switch (i) {
2089 case 0: /* DOC_COUNT */
2090 if (len != UNIV_SQL_NULL && len != 0) {
2091
2092 select->word_freq->doc_count +=
2093 mach_read_from_4(data);
2094 }
2095 break;
2096
2097 case 1: /* ILIST */
2098 if (len != UNIV_SQL_NULL && len != 0) {
2099
2100 fts_query_find_doc_id(select, data, len);
2101 }
2102 break;
2103
2104 default:
2105 ut_error;
2106 }
2107
2108 exp = que_node_get_next(exp);
2109 }
2110
2111 return(FALSE);
2112 }
2113
2114 /********************************************************************
2115 Read the rows from the FTS index that match the word and whose
2116 doc id is between the first and last doc id.
2117 @return DB_SUCCESS if all go well else error code */
2118 static MY_ATTRIBUTE((nonnull, warn_unused_result))
2119 dberr_t
2120 fts_query_find_term(
2121 /*================*/
2122 fts_query_t* query, /*!< in: FTS query state */
2123 que_t** graph, /*!< in: prepared statement */
2124 const fts_string_t* word, /*!< in: the word to fetch */
2125 doc_id_t doc_id, /*!< in: doc id to match */
2126 ulint* min_pos,/*!< in/out: pos found must be
2127 greater than this minimum value. */
2128 ibool* found) /*!< out: TRUE if found else FALSE */
2129 {
2130 pars_info_t* info;
2131 dberr_t error;
2132 fts_select_t select;
2133 doc_id_t match_doc_id;
2134 trx_t* trx = query->trx;
2135 char table_name[MAX_FULL_NAME_LEN];
2136
2137 trx->op_info = "fetching FTS index matching nodes";
2138
2139 if (*graph) {
2140 info = (*graph)->info;
2141 } else {
2142 ulint selected;
2143
2144 info = pars_info_create();
2145
2146 selected = fts_select_index(*word->f_str);
2147 query->fts_index_table.suffix = fts_get_suffix(selected);
2148
2149 fts_get_table_name(&query->fts_index_table, table_name);
2150 pars_info_bind_id(info, "index_table_name", table_name);
2151 }
2152
2153 select.found = FALSE;
2154 select.doc_id = doc_id;
2155 select.min_pos = *min_pos;
2156 select.word_freq = fts_query_add_word_freq(query, word->f_str);
2157
2158 pars_info_bind_function(info, "my_func", fts_query_select, &select);
2159 pars_info_bind_varchar_literal(info, "word", word->f_str, word->f_len);
2160
2161 /* Convert to "storage" byte order. */
2162 fts_write_doc_id((byte*) &match_doc_id, doc_id);
2163
2164 fts_bind_doc_id(info, "min_doc_id", &match_doc_id);
2165
2166 fts_bind_doc_id(info, "max_doc_id", &match_doc_id);
2167
2168 if (!*graph) {
2169
2170 *graph = fts_parse_sql(
2171 &query->fts_index_table,
2172 info,
2173 "DECLARE FUNCTION my_func;\n"
2174 "DECLARE CURSOR c IS"
2175 " SELECT doc_count, ilist\n"
2176 " FROM $index_table_name\n"
2177 " WHERE word LIKE :word AND"
2178 " first_doc_id <= :min_doc_id AND"
2179 " last_doc_id >= :max_doc_id\n"
2180 " ORDER BY first_doc_id;\n"
2181 "BEGIN\n"
2182 "\n"
2183 "OPEN c;\n"
2184 "WHILE 1 = 1 LOOP\n"
2185 " FETCH c INTO my_func();\n"
2186 " IF c % NOTFOUND THEN\n"
2187 " EXIT;\n"
2188 " END IF;\n"
2189 "END LOOP;\n"
2190 "CLOSE c;");
2191 }
2192
2193 for (;;) {
2194 error = fts_eval_sql(trx, *graph);
2195
2196 if (error == DB_SUCCESS) {
2197
2198 break; /* Exit the loop. */
2199 } else {
2200
2201 if (error == DB_LOCK_WAIT_TIMEOUT) {
2202 ib::warn() << "lock wait timeout reading FTS"
2203 " index. Retrying!";
2204
2205 trx->error_state = DB_SUCCESS;
2206 } else {
2207 ib::error() << error
2208 << " while reading FTS index.";
2209
2210 break; /* Exit the loop. */
2211 }
2212 }
2213 }
2214
2215 /* Value to return */
2216 *found = select.found;
2217
2218 if (*found) {
2219 *min_pos = select.min_pos;
2220 }
2221
2222 return(error);
2223 }
2224
2225 /********************************************************************
2226 Callback aggregator for int columns. */
2227 static
2228 ibool
2229 fts_query_sum(
2230 /*==========*/
2231 /*!< out: always returns TRUE */
2232 void* row, /*!< in: sel_node_t* */
2233 void* user_arg) /*!< in: ulint* */
2234 {
2235
2236 que_node_t* exp;
2237 sel_node_t* node = row;
2238 ulint* total = user_arg;
2239
2240 exp = node->select_list;
2241
2242 while (exp) {
2243 dfield_t* dfield = que_node_get_val(exp);
2244 void* data = dfield_get_data(dfield);
2245 ulint len = dfield_get_len(dfield);
2246
2247 if (len != UNIV_SQL_NULL && len != 0) {
2248 *total += mach_read_from_4(data);
2249 }
2250
2251 exp = que_node_get_next(exp);
2252 }
2253
2254 return(TRUE);
2255 }
2256
2257 /********************************************************************
2258 Calculate the total number of documents that contain a particular word (term).
2259 @return DB_SUCCESS if all go well else error code */
2260 static MY_ATTRIBUTE((nonnull, warn_unused_result))
2261 dberr_t
2262 fts_query_total_docs_containing_term(
2263 /*=================================*/
2264 fts_query_t* query, /*!< in: FTS query state */
2265 const fts_string_t* word, /*!< in: the word to check */
2266 ulint* total) /*!< out: documents containing word */
2267 {
2268 pars_info_t* info;
2269 dberr_t error;
2270 que_t* graph;
2271 ulint selected;
2272 trx_t* trx = query->trx;
2273 	char		table_name[MAX_FULL_NAME_LEN];
2274
2275 trx->op_info = "fetching FTS index document count";
2276
2277 *total = 0;
2278
2279 info = pars_info_create();
2280
2281 pars_info_bind_function(info, "my_func", fts_query_sum, total);
2282 pars_info_bind_varchar_literal(info, "word", word->f_str, word->f_len);
2283
2284 selected = fts_select_index(*word->f_str);
2285
2286 query->fts_index_table.suffix = fts_get_suffix(selected);
2287
2288 fts_get_table_name(&query->fts_index_table, table_name);
2289
2290 pars_info_bind_id(info, "index_table_name", table_name);
2291
2292 graph = fts_parse_sql(
2293 &query->fts_index_table,
2294 info,
2295 "DECLARE FUNCTION my_func;\n"
2296 "DECLARE CURSOR c IS"
2297 " SELECT doc_count\n"
2298 " FROM $index_table_name\n"
2299 " WHERE word = :word"
2300 " ORDER BY first_doc_id;\n"
2301 "BEGIN\n"
2302 "\n"
2303 "OPEN c;\n"
2304 "WHILE 1 = 1 LOOP\n"
2305 " FETCH c INTO my_func();\n"
2306 " IF c % NOTFOUND THEN\n"
2307 " EXIT;\n"
2308 " END IF;\n"
2309 "END LOOP;\n"
2310 "CLOSE c;");
2311
2312 for (;;) {
2313 error = fts_eval_sql(trx, graph);
2314
2315 if (error == DB_SUCCESS) {
2316
2317 break; /* Exit the loop. */
2318 } else {
2319
2320 if (error == DB_LOCK_WAIT_TIMEOUT) {
2321 ib::warn() << "lock wait timeout reading FTS"
2322 " index. Retrying!";
2323
2324 trx->error_state = DB_SUCCESS;
2325 } else {
2326 ib::error() << error
2327 << " while reading FTS index.";
2328
2329 break; /* Exit the loop. */
2330 }
2331 }
2332 }
2333
2334 fts_que_graph_free(graph);
2335
2336 return(error);
2337 }
2338
2339 /********************************************************************
2340 Get the total number of words in a document.
2341 @return DB_SUCCESS if all go well else error code */
2342 static MY_ATTRIBUTE((nonnull, warn_unused_result))
2343 dberr_t
2344 fts_query_terms_in_document(
2345 /*========================*/
2346 fts_query_t* query, /*!< in: FTS query state */
2347 	doc_id_t	doc_id,		/*!< in: the doc id to check */
2348 ulint* total) /*!< out: total words in document */
2349 {
2350 pars_info_t* info;
2351 dberr_t error;
2352 que_t* graph;
2353 doc_id_t read_doc_id;
2354 trx_t* trx = query->trx;
2355 char table_name[MAX_FULL_NAME_LEN];
2356
2357 trx->op_info = "fetching FTS document term count";
2358
2359 *total = 0;
2360
2361 info = pars_info_create();
2362
2363 pars_info_bind_function(info, "my_func", fts_query_sum, total);
2364
2365 /* Convert to "storage" byte order. */
2366 fts_write_doc_id((byte*) &read_doc_id, doc_id);
2367 fts_bind_doc_id(info, "doc_id", &read_doc_id);
2368
2369 query->fts_index_table.suffix = "DOC_ID";
2370
2371 fts_get_table_name(&query->fts_index_table, table_name);
2372
2373 pars_info_bind_id(info, "index_table_name", table_name);
2374
2375 graph = fts_parse_sql(
2376 &query->fts_index_table,
2377 info,
2378 "DECLARE FUNCTION my_func;\n"
2379 "DECLARE CURSOR c IS"
2380 " SELECT count\n"
2381 " FROM $index_table_name\n"
2382 		 " WHERE doc_id = :doc_id;\n"
2383 		 "BEGIN\n"
2384 "\n"
2385 "OPEN c;\n"
2386 "WHILE 1 = 1 LOOP\n"
2387 " FETCH c INTO my_func();\n"
2388 " IF c % NOTFOUND THEN\n"
2389 " EXIT;\n"
2390 " END IF;\n"
2391 "END LOOP;\n"
2392 "CLOSE c;");
2393
2394 for (;;) {
2395 error = fts_eval_sql(trx, graph);
2396
2397 if (error == DB_SUCCESS) {
2398
2399 break; /* Exit the loop. */
2400 } else {
2401
2402 if (error == DB_LOCK_WAIT_TIMEOUT) {
2403 ib::warn() << "lock wait timeout reading FTS"
2404 " doc id table. Retrying!";
2405
2406 trx->error_state = DB_SUCCESS;
2407 } else {
2408 ib::error() << error << " while reading FTS"
2409 " doc id table.";
2410
2411 break; /* Exit the loop. */
2412 }
2413 }
2414 }
2415
2416 fts_que_graph_free(graph);
2417
2418 return(error);
2419 }
2420 #endif
2421
2422 /*****************************************************************//**
2423 Retrieve the document and match the phrase tokens.
2424 @return DB_SUCCESS or error code */
2425 MY_ATTRIBUTE((nonnull(1,2,3,6), warn_unused_result))
2426 static
2427 dberr_t
2428 fts_query_match_document(
2429 /*=====================*/
2430 ib_vector_t* tokens, /*!< in: phrase tokens */
2431 fts_get_doc_t* get_doc, /*!< in: table and prepared statements */
2432 fts_match_t* match, /*!< in: doc id and positions */
2433 ulint distance, /*!< in: proximity distance */
2434 st_mysql_ftparser* parser, /*!< in: fts plugin parser */
2435 ibool* found) /*!< out: TRUE if phrase found */
2436 {
2437 dberr_t error;
2438 fts_phrase_t phrase(get_doc->index_cache->index->table);
2439
2440 phrase.match = match; /* Positions to match */
2441 phrase.tokens = tokens; /* Tokens to match */
2442 phrase.distance = distance;
2443 phrase.charset = get_doc->index_cache->charset;
2444 phrase.heap = mem_heap_create(512);
2445 phrase.parser = parser;
2446
2447 *found = phrase.found = FALSE;
2448
2449 error = fts_doc_fetch_by_doc_id(
2450 get_doc, match->doc_id, NULL, FTS_FETCH_DOC_BY_ID_EQUAL,
2451 fts_query_fetch_document, &phrase);
2452
2453 if (UNIV_UNLIKELY(error != DB_SUCCESS)) {
2454 ib::error() << "(" << error << ") matching document.";
2455 } else {
2456 *found = phrase.found;
2457 }
2458
2459 mem_heap_free(phrase.heap);
2460
2461 return(error);
2462 }
2463
2464 /*****************************************************************//**
2465 This function fetches the original document and counts the words between
2466 the matching words to check whether they are within the specified distance
2467 @return true if the words are within the specified proximity distance */
2468 static MY_ATTRIBUTE((nonnull, warn_unused_result))
2469 bool
2470 fts_query_is_in_proximity_range(
2471 /*============================*/
2472 const fts_query_t* query, /*!< in: query instance */
2473 	fts_match_t**		match,		/*!< in: matching doc id and token positions */
2474 fts_proximity_t* qualified_pos) /*!< in: position info for
2475 qualified ranges */
2476 {
2477 fts_get_doc_t get_doc;
2478 fts_cache_t* cache = query->index->table->fts->cache;
2479 dberr_t err;
2480
2481 memset(&get_doc, 0x0, sizeof(get_doc));
2482
2483 rw_lock_x_lock(&cache->lock);
2484 get_doc.index_cache = fts_find_index_cache(cache, query->index);
2485 rw_lock_x_unlock(&cache->lock);
2486 ut_a(get_doc.index_cache != NULL);
2487
2488 fts_phrase_t phrase(get_doc.index_cache->index->table);
2489
2490 phrase.distance = query->distance;
2491 phrase.charset = get_doc.index_cache->charset;
2492 phrase.heap = mem_heap_create(512);
2493 phrase.proximity_pos = qualified_pos;
2494 phrase.found = FALSE;
2495
2496 err = fts_doc_fetch_by_doc_id(
2497 &get_doc, match[0]->doc_id, NULL, FTS_FETCH_DOC_BY_ID_EQUAL,
2498 fts_query_fetch_document, &phrase);
2499
2500 if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
2501 ib::error() << "(" << err << ") in verification"
2502 " phase of proximity search";
2503 }
2504
2505 /* Free the prepared statement. */
2506 if (get_doc.get_document_graph) {
2507 fts_que_graph_free(get_doc.get_document_graph);
2508 get_doc.get_document_graph = NULL;
2509 }
2510
2511 mem_heap_free(phrase.heap);
2512
2513 return(err == DB_SUCCESS && phrase.found);
2514 }
2515
2516 /*****************************************************************//**
2517 Iterate over the matched document ids and search for the
2518 actual phrase in the text.
2519 @return DB_SUCCESS if all OK */
2520 static MY_ATTRIBUTE((nonnull, warn_unused_result))
2521 dberr_t
2522 fts_query_search_phrase(
2523 /*====================*/
2524 fts_query_t* query, /*!< in: query instance */
2525 ib_vector_t* orig_tokens, /*!< in: tokens to search,
2526 with any stopwords in the
2527 original phrase */
2528 	ib_vector_t*		tokens)		/*!< in: tokens that do
2529 not include stopwords and
2530 can be used to calculate
2531 ranking */
2532 {
2533 ulint i;
2534 fts_get_doc_t get_doc;
2535 ulint n_matched;
2536 fts_cache_t* cache = query->index->table->fts->cache;
2537
2538 n_matched = ib_vector_size(query->matched);
2539
2540 /* Setup the doc retrieval infrastructure. */
2541 memset(&get_doc, 0x0, sizeof(get_doc));
2542
2543 rw_lock_x_lock(&cache->lock);
2544
2545 get_doc.index_cache = fts_find_index_cache(cache, query->index);
2546
2547 /* Must find the index cache */
2548 ut_a(get_doc.index_cache != NULL);
2549
2550 rw_lock_x_unlock(&cache->lock);
2551
2552 #ifdef FTS_INTERNAL_DIAG_PRINT
2553 ib::info() << "Start phrase search";
2554 #endif
2555
2556 /* Read the document from disk and do the actual
2557 match, matching documents will be added to the current
2558 doc id set. */
2559 for (i = 0; i < n_matched && query->error == DB_SUCCESS; ++i) {
2560 fts_match_t* match;
2561 ibool found = FALSE;
2562
2563 match = static_cast<fts_match_t*>(
2564 ib_vector_get(query->matched, i));
2565
2566 /* Skip the document ids that were filtered out by
2567 an earlier pass. */
2568 if (match->doc_id != 0) {
2569
2570 query->error = fts_query_match_document(
2571 orig_tokens, &get_doc, match,
2572 query->distance, query->parser, &found);
2573
2574 if (query->error == DB_SUCCESS && found) {
2575 ulint z;
2576
2577 query->error = fts_query_process_doc_id(query,
2578 match->doc_id, 0);
2579 if (query->error != DB_SUCCESS) {
2580 goto func_exit;
2581 }
2582
2583 for (z = 0; z < ib_vector_size(tokens); z++) {
2584 fts_string_t* token;
2585 token = static_cast<fts_string_t*>(
2586 ib_vector_get(tokens, z));
2587 fts_query_add_word_to_document(
2588 query, match->doc_id, token);
2589 }
2590 }
2591 }
2592 }
2593
2594 func_exit:
2595 /* Free the prepared statement. */
2596 if (get_doc.get_document_graph) {
2597 fts_que_graph_free(get_doc.get_document_graph);
2598 get_doc.get_document_graph = NULL;
2599 }
2600
2601 return(query->error);
2602 }
2603
2604 /** Split the phrase into tokens
2605 @param[in,out] query query instance
2606 @param[in] node query node to search
2607 @param[in,out] tokens token vector
2608 @param[in,out] orig_tokens original node tokens include stopword
2609 @param[in,out] heap mem heap */
2610 static
2611 void
2612 fts_query_phrase_split(
2613 fts_query_t* query,
2614 const fts_ast_node_t* node,
2615 ib_vector_t* tokens,
2616 ib_vector_t* orig_tokens,
2617 mem_heap_t* heap)
2618 {
2619 fts_string_t phrase;
2620 ulint len = 0;
2621 ulint cur_pos = 0;
2622 fts_ast_node_t* term_node = NULL;
2623
2624 if (node->type == FTS_AST_TEXT) {
2625 phrase.f_str = node->text.ptr->str;
2626 phrase.f_len = node->text.ptr->len;
2627 len = phrase.f_len;
2628 } else {
2629 ut_ad(node->type == FTS_AST_PARSER_PHRASE_LIST);
2630 phrase.f_str = NULL;
2631 phrase.f_len = 0;
2632 term_node = node->list.head;
2633 }
2634
2635 while (true) {
2636 fts_cache_t* cache = query->index->table->fts->cache;
2637 ulint cur_len;
2638 fts_string_t result_str;
2639
2640 if (node->type == FTS_AST_TEXT) {
2641 if (cur_pos >= len) {
2642 break;
2643 }
2644
2645 cur_len = innobase_mysql_fts_get_token(
2646 query->fts_index_table.charset,
2647 reinterpret_cast<const byte*>(phrase.f_str)
2648 + cur_pos,
2649 reinterpret_cast<const byte*>(phrase.f_str)
2650 + len,
2651 &result_str);
2652
2653 if (cur_len == 0) {
2654 break;
2655 }
2656
2657 cur_pos += cur_len;
2658 } else {
2659 ut_ad(node->type == FTS_AST_PARSER_PHRASE_LIST);
2660 /* Term node in parser phrase list */
2661 if (term_node == NULL) {
2662 break;
2663 }
2664
2665 ut_a(term_node->type == FTS_AST_TERM);
2666 result_str.f_str = term_node->term.ptr->str;
2667 result_str.f_len = term_node->term.ptr->len;
2668 result_str.f_n_char = fts_get_token_size(
2669 query->fts_index_table.charset,
2670 reinterpret_cast<char*>(result_str.f_str),
2671 result_str.f_len);
2672
2673 term_node = term_node->next;
2674 }
2675
2676 if (result_str.f_n_char == 0) {
2677 continue;
2678 }
2679
2680 fts_string_t* token = static_cast<fts_string_t*>(
2681 ib_vector_push(tokens, NULL));
2682 fts_string_dup(token, &result_str, heap);
2683
2684 if (fts_check_token(
2685 &result_str,
2686 cache->stopword_info.cached_stopword,
2687 query->fts_index_table.charset)) {
2688 /* Add the word to the RB tree so that we can
2689 			calculate its frequency within a document. */
2690 fts_query_add_word_freq(query, token);
2691 } else {
2692 ib_vector_pop(tokens);
2693 }
2694
2695 /* we will start to store all words including stopwords
2696 in the "orig_tokens" vector, but skip any leading words
2697 that are stopwords */
2698 if (!ib_vector_is_empty(tokens)) {
2699 fts_string_t* orig_token = static_cast<fts_string_t*>(
2700 ib_vector_push(orig_tokens, NULL));
2701
2702 orig_token->f_str = token->f_str;
2703 orig_token->f_len = token->f_len;
2704 }
2705 }
2706 }
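
/* Illustrative example (assuming "the" is a cached stopword): for the
phrase "quick the fox", the loop above produces
	tokens      = [ "quick", "fox" ]	(used for ranking)
	orig_tokens = [ "quick", "the", "fox" ]	(used for exact phrase match)
whereas a leading stopword is dropped from both vectors, so the phrase
"the quick fox" yields tokens = orig_tokens = [ "quick", "fox" ]. */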
2707
2708 /*****************************************************************//**
2709 Text/Phrase search.
2710 @return DB_SUCCESS or error code */
2711 static MY_ATTRIBUTE((warn_unused_result))
2712 dberr_t
2713 fts_query_phrase_search(
2714 /*====================*/
2715 fts_query_t* query, /*!< in: query instance */
2716 const fts_ast_node_t* node) /*!< in: node to search */
2717 {
2718 ib_vector_t* tokens;
2719 ib_vector_t* orig_tokens;
2720 mem_heap_t* heap = mem_heap_create(sizeof(fts_string_t));
2721 ib_alloc_t* heap_alloc;
2722 ulint num_token;
2723
2724 heap_alloc = ib_heap_allocator_create(heap);
2725
2726 tokens = ib_vector_create(heap_alloc, sizeof(fts_string_t), 4);
2727 orig_tokens = ib_vector_create(heap_alloc, sizeof(fts_string_t), 4);
2728
2729 if (query->distance != ULINT_UNDEFINED && query->distance > 0) {
2730 query->flags = FTS_PROXIMITY;
2731 } else {
2732 query->flags = FTS_PHRASE;
2733 }
2734
2735 /* Split the phrase into tokens. */
2736 fts_query_phrase_split(query, node, tokens, orig_tokens, heap);
2737
2738 num_token = ib_vector_size(tokens);
2739 if (num_token > MAX_PROXIMITY_ITEM) {
2740 query->error = DB_FTS_TOO_MANY_WORDS_IN_PHRASE;
2741 goto func_exit;
2742 }
2743
2744 ut_ad(ib_vector_size(orig_tokens) >= num_token);
2745
2746 /* Ignore empty strings. */
2747 if (num_token > 0) {
2748 fts_string_t* token = NULL;
2749 fts_fetch_t fetch;
2750 trx_t* trx = query->trx;
2751 fts_ast_oper_t oper = query->oper;
2752 que_t* graph = NULL;
2753 ulint i;
2754 dberr_t error;
2755
2756 /* Create the vector for storing matching document ids
2757 and the positions of the first token of the phrase. */
2758 if (!query->matched) {
2759 ib_alloc_t* heap_alloc;
2760
2761 heap_alloc = ib_heap_allocator_create(heap);
2762
2763 if (!(query->flags & FTS_PROXIMITY)
2764 && !(query->flags & FTS_PHRASE)) {
2765 query->matched = ib_vector_create(
2766 heap_alloc, sizeof(fts_match_t),
2767 64);
2768 } else {
2769 ut_a(num_token <= MAX_PROXIMITY_ITEM);
2770 query->match_array =
2771 (ib_vector_t**) mem_heap_alloc(
2772 heap,
2773 num_token *
2774 sizeof(query->matched));
2775
2776 for (i = 0; i < num_token; i++) {
2777 query->match_array[i] =
2778 ib_vector_create(
2779 heap_alloc, sizeof(fts_match_t),
2780 64);
2781 }
2782
2783 query->matched = query->match_array[0];
2784 }
2785 }
2786
2787 /* Setup the callback args for filtering and consolidating
2788 the ilist. */
2789 fetch.read_arg = query;
2790 fetch.read_record = fts_query_index_fetch_nodes;
2791
2792 for (i = 0; i < num_token; i++) {
2793 /* Search for the first word from the phrase. */
2794 token = static_cast<fts_string_t*>(
2795 ib_vector_get(tokens, i));
2796
2797 if (query->flags & FTS_PROXIMITY
2798 || query->flags & FTS_PHRASE) {
2799 query->matched = query->match_array[i];
2800 }
2801
2802 error = fts_index_fetch_nodes(
2803 trx, &graph, &query->fts_index_table,
2804 token, &fetch);
2805
2806 /* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */
2807 ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS));
2808 if (error != DB_SUCCESS) {
2809 query->error = error;
2810 }
2811
2812 fts_que_graph_free(graph);
2813 graph = NULL;
2814
2815 fts_query_cache(query, token);
2816
2817 if (!(query->flags & FTS_PHRASE)
2818 && !(query->flags & FTS_PROXIMITY)) {
2819 break;
2820 }
2821
2822 			/* If any of the tokens cannot be found,
2823 			there is no need to continue the match */
2824 if (ib_vector_is_empty(query->match_array[i])
2825 || query->error != DB_SUCCESS) {
2826 goto func_exit;
2827 }
2828 }
2829
2830 /* Just a single word, no need to fetch the original
2831 documents to do phrase matching */
2832 if (ib_vector_size(orig_tokens) == 1
2833 && !ib_vector_is_empty(query->match_array[0])) {
2834 fts_match_t* match;
2835 ulint n_matched;
2836
2837 n_matched = ib_vector_size(query->match_array[0]);
2838
2839 for (i = 0; i < n_matched; i++) {
2840 match = static_cast<fts_match_t*>(
2841 ib_vector_get(
2842 query->match_array[0], i));
2843
2844 query->error = fts_query_process_doc_id(
2845 query, match->doc_id, 0);
2846 if (query->error != DB_SUCCESS) {
2847 goto func_exit;
2848 }
2849
2850 fts_query_add_word_to_document(
2851 query, match->doc_id, token);
2852 }
2853 query->oper = oper;
2854 goto func_exit;
2855 }
2856
2857 /* If we are doing proximity search, verify the distance
2858 		between all words, and check that they are within the specified distance. */
2859 if (query->flags & FTS_PROXIMITY) {
2860 fts_phrase_or_proximity_search(query, tokens);
2861 } else {
2862 ibool matched;
2863
2864 /* Phrase Search case:
2865 We filter out the doc ids that don't contain
2866 all the tokens in the phrase. It's cheaper to
2867 search the ilist than bringing the documents in
2868 and then doing a search through the text. Isolated
2869 testing shows this also helps in mitigating disruption
2870 of the buffer cache. */
2871 matched = fts_phrase_or_proximity_search(query, tokens);
2872 query->matched = query->match_array[0];
2873
2874 /* Read the actual text in and search for the phrase. */
2875 if (matched) {
2876 ut_ad(query->error == DB_SUCCESS);
2877 query->error = fts_query_search_phrase(
2878 query, orig_tokens, tokens);
2879 }
2880 }
2881
2882 /* Restore original operation. */
2883 query->oper = oper;
2884
2885 if (query->error != DB_SUCCESS) {
2886 goto func_exit;
2887 }
2888 }
2889
2890 func_exit:
2891 mem_heap_free(heap);
2892
2893 /* Don't need it anymore. */
2894 query->matched = NULL;
2895
2896 return(query->error);
2897 }
2898
2899 /*****************************************************************//**
2900 Find the word and evaluate.
2901 @return DB_SUCCESS if all go well */
2902 static MY_ATTRIBUTE((nonnull, warn_unused_result))
2903 dberr_t
2904 fts_query_execute(
2905 /*==============*/
2906 fts_query_t* query, /*!< in: query instance */
2907 fts_string_t* token) /*!< in: token to search */
2908 {
2909 switch (query->oper) {
2910 case FTS_NONE:
2911 case FTS_NEGATE:
2912 case FTS_INCR_RATING:
2913 case FTS_DECR_RATING:
2914 query->error = fts_query_union(query, token);
2915 break;
2916
2917 case FTS_EXIST:
2918 query->error = fts_query_intersect(query, token);
2919 break;
2920
2921 case FTS_IGNORE:
2922 query->error = fts_query_difference(query, token);
2923 break;
2924
2925 default:
2926 ut_error;
2927 }
2928
2929 return(query->error);
2930 }
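
/* Illustrative summary (not from the original source) of how boolean mode
operators map to the set operations dispatched above, e.g. for the query
'+apple banana -macintosh':
	apple      FTS_EXIST	-> fts_query_intersect()
	banana     FTS_NONE	-> fts_query_union()
	macintosh  FTS_IGNORE	-> fts_query_difference()
'~' (FTS_NEGATE) and '<'/'>' (FTS_DECR_RATING / FTS_INCR_RATING) also go
through fts_query_union() and only affect the ranking of the matching
documents. */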
2931
2932 /*****************************************************************//**
2933 Create a wildcard string. It's the responsibility of the caller to
2934 free the byte* pointer. It's allocated using ut_malloc_nokey().
2935 @return ptr to allocated memory */
2936 static
2937 byte*
2938 fts_query_get_token(
2939 /*================*/
2940 fts_ast_node_t* node, /*!< in: the current sub tree */
2941 fts_string_t* token) /*!< in: token to create */
2942 {
2943 ulint str_len;
2944 byte* new_ptr = NULL;
2945
2946 str_len = node->term.ptr->len;
2947
2948 ut_a(node->type == FTS_AST_TERM);
2949
2950 token->f_len = str_len;
2951 token->f_str = node->term.ptr->str;
2952
2953 if (node->term.wildcard) {
2954
2955 token->f_str = static_cast<byte*>(ut_malloc_nokey(str_len + 2));
2956 token->f_len = str_len + 1;
2957
2958 memcpy(token->f_str, node->term.ptr->str, str_len);
2959
2960 token->f_str[str_len] = '%';
2961 token->f_str[token->f_len] = 0;
2962
2963 new_ptr = token->f_str;
2964 }
2965
2966 return(new_ptr);
2967 }
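
/* Illustrative example (not from the original source): for the boolean
query 'data*', node->term.wildcard is set, so the token "data" becomes the
LIKE pattern "data%" in freshly allocated memory; the caller (see
fts_query_visitor() below) must ut_free() the returned pointer.  A term
without the trailing wildcard is returned as-is and no memory is
allocated. */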
2968
2969 static dberr_t fts_ast_visit_sub_exp(fts_ast_node_t*, fts_ast_callback, void*);
2970
2971 /*****************************************************************//**
2972 Visit every node of the AST. */
2973 static
2974 dberr_t
2975 fts_query_visitor(
2976 /*==============*/
2977 fts_ast_oper_t oper, /*!< in: current operator */
2978 fts_ast_node_t* node, /*!< in: The root of the current subtree*/
2979 void* arg) /*!< in: callback arg*/
2980 {
2981 byte* ptr;
2982 fts_string_t token;
2983 fts_query_t* query = static_cast<fts_query_t*>(arg);
2984
2985 ut_a(node);
2986 DBUG_ENTER("fts_query_visitor");
2987 DBUG_PRINT("fts", ("nodetype: %s", fts_ast_node_type_get(node->type)));
2988
2989 token.f_n_char = 0;
2990 query->oper = oper;
2991 query->cur_node = node;
2992
2993 switch (node->type) {
2994 case FTS_AST_TEXT:
2995 case FTS_AST_PARSER_PHRASE_LIST:
2996
2997 if (query->oper == FTS_EXIST) {
2998 ut_ad(query->intersection == NULL);
2999 query->intersection = rbt_create(
3000 sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
3001
3002 query->total_size += SIZEOF_RBT_CREATE;
3003 }
3004
3005 /* Set the current proximity distance. */
3006 query->distance = node->text.distance;
3007
3008 /* Force collection of doc ids and the positions. */
3009 query->collect_positions = TRUE;
3010
3011 query->error = fts_query_phrase_search(query, node);
3012
3013 query->collect_positions = FALSE;
3014
3015 if (query->oper == FTS_EXIST) {
3016 fts_query_free_doc_ids(query, query->doc_ids);
3017 query->doc_ids = query->intersection;
3018 query->intersection = NULL;
3019 }
3020
3021 break;
3022
3023 case FTS_AST_TERM:
3024 token.f_str = node->term.ptr->str;
3025 token.f_len = node->term.ptr->len;
3026
3027 /* Collect wildcard words for QUERY EXPANSION. */
3028 if (node->term.wildcard && query->wildcard_words != NULL) {
3029 ib_rbt_bound_t parent;
3030
3031 if (rbt_search(query->wildcard_words, &parent, &token)
3032 != 0) {
3033 fts_string_t word;
3034
3035 fts_string_dup(&word, &token, query->heap);
3036 rbt_add_node(query->wildcard_words, &parent,
3037 &word);
3038 }
3039 }
3040
3041 /* Add the word to our RB tree that will be used to
3042 	calculate this term's per-document frequency. */
3043 fts_query_add_word_freq(query, &token);
3044
3045 ptr = fts_query_get_token(node, &token);
3046 query->error = fts_query_execute(query, &token);
3047
3048 if (ptr) {
3049 ut_free(ptr);
3050 }
3051
3052 break;
3053
3054 case FTS_AST_SUBEXP_LIST:
3055 query->error = fts_ast_visit_sub_exp(node, fts_query_visitor, arg);
3056 break;
3057
3058 default:
3059 ut_error;
3060 }
3061
3062 if (query->oper == FTS_EXIST) {
3063 query->multi_exist = true;
3064 }
3065
3066 DBUG_RETURN(query->error);
3067 }
3068
3069 /** Process (nested) sub-expression, create a new result set to store the
3070 sub-expression result by processing nodes under current sub-expression
3071 list. Merge the sub-expression result with that of parent expression list.
3072 @param[in,out] node current root node
3073 @param[in,out] visitor callback function
3074 @param[in,out] arg argument for callback
3075 @return DB_SUCCESS if all go well */
3076 static
3077 dberr_t
3078 fts_ast_visit_sub_exp(
3079 fts_ast_node_t* node,
3080 fts_ast_callback visitor,
3081 void* arg)
3082 {
3083 fts_ast_oper_t cur_oper;
3084 fts_query_t* query = static_cast<fts_query_t*>(arg);
3085 ib_rbt_t* parent_doc_ids;
3086 ib_rbt_t* subexpr_doc_ids;
3087 dberr_t error = DB_SUCCESS;
3088 bool will_be_ignored = false;
3089 bool multi_exist;
3090
3091 DBUG_ENTER("fts_ast_visit_sub_exp");
3092
3093 ut_a(node->type == FTS_AST_SUBEXP_LIST);
3094
3095 /* To avoid stack overflow, we limit the mutual recursion
3096 depth between fts_ast_visit(), fts_query_visitor() and
3097 fts_ast_visit_sub_exp(). */
3098 if (query->visiting_sub_exp++ > 31) {
3099 query->error = DB_OUT_OF_MEMORY;
3100 DBUG_RETURN(query->error);
3101 }
3102
3103 cur_oper = query->oper;
3104
3105 /* Save current result set */
3106 parent_doc_ids = query->doc_ids;
3107
3108 /* Create new result set to store the sub-expression result. We
3109 will merge this result set with the parent after processing. */
3110 query->doc_ids = rbt_create(sizeof(fts_ranking_t),
3111 fts_ranking_doc_id_cmp);
3112
3113 query->total_size += SIZEOF_RBT_CREATE;
3114
3115 multi_exist = query->multi_exist;
3116 query->multi_exist = false;
3117 /* Process nodes in current sub-expression and store its
3118 result set in query->doc_ids we created above. */
3119 error = fts_ast_visit(FTS_NONE, node, visitor,
3120 arg, &will_be_ignored);
3121
3122 /* Reinstate parent node state */
3123 query->multi_exist = multi_exist;
3124 query->oper = cur_oper;
3125 query->visiting_sub_exp--;
3126
3127 /* Merge the sub-expression result with the parent result set. */
3128 subexpr_doc_ids = query->doc_ids;
3129 query->doc_ids = parent_doc_ids;
3130 if (error == DB_SUCCESS) {
3131 error = fts_merge_doc_ids(query, subexpr_doc_ids);
3132 }
3133
3134 /* Free current result set. Result already merged into parent. */
3135 fts_query_free_doc_ids(query, subexpr_doc_ids);
3136
3137 DBUG_RETURN(error);
3138 }
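
/* Illustrative note (example query assumed): for a boolean query such as
'+apple +(>turnover <strudel)', the parenthesised part is parsed as an
FTS_AST_SUBEXP_LIST node.  The code above evaluates it into a private
doc_ids tree, restores the saved operator (FTS_EXIST for the leading '+'),
and then fts_merge_doc_ids() folds the sub-expression result into the
parent result set under that operator. */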
3139
3140 #if 0
3141 /*****************************************************************//**
3142 Check if the doc id exists in the ilist.
3143 @return TRUE if doc id found */
3144 static
3145 ulint
3146 fts_query_find_doc_id(
3147 /*==================*/
3148 fts_select_t* select, /*!< in/out: contains the doc id to
3149 find, we update the word freq if
3150 document found */
3151 void* data, /*!< in: doc id ilist */
3152 ulint len) /*!< in: doc id ilist size */
3153 {
3154 byte* ptr = data;
3155 doc_id_t doc_id = 0;
3156 ulint decoded = 0;
3157
3158 /* Decode the ilist and search for selected doc_id. We also
3159 calculate the frequency of the word in the document if found. */
3160 while (decoded < len && !select->found) {
3161 ulint freq = 0;
3162 ulint min_pos = 0;
3163 ulint last_pos = 0;
3164 ulint pos = fts_decode_vlc(&ptr);
3165
3166 /* Add the delta. */
3167 doc_id += pos;
3168
3169 while (*ptr) {
3170 ++freq;
3171 last_pos += fts_decode_vlc(&ptr);
3172
3173 /* Only if min_pos is not set and the current
3174 term exists in a position greater than the
3175 min_pos of the previous term. */
3176 if (min_pos == 0 && last_pos > select->min_pos) {
3177 min_pos = last_pos;
3178 }
3179 }
3180
3181 /* Skip the end of word position marker. */
3182 ++ptr;
3183
3184 /* Bytes decoded so far. */
3185 decoded = ptr - (byte*) data;
3186
3187 /* A word may exist in the document but we only consider a
3188 match if it exists in a position that is greater than the
3189 position of the previous term. */
3190 if (doc_id == select->doc_id && min_pos > 0) {
3191 fts_doc_freq_t* doc_freq;
3192
3193 /* Add the doc id to the doc freq rb tree, if
3194 the doc id doesn't exist it will be created. */
3195 doc_freq = fts_query_add_doc_freq(
3196 select->word_freq->doc_freqs, doc_id);
3197
3198 /* Avoid duplicating the frequency tally */
3199 if (doc_freq->freq == 0) {
3200 doc_freq->freq = freq;
3201 }
3202
3203 select->found = TRUE;
3204 select->min_pos = min_pos;
3205 }
3206 }
3207
3208 return(select->found);
3209 }
3210 #endif
3211
3212 /*****************************************************************//**
3213 Read and filter nodes.
3214 @return DB_SUCCESS if all go well,
3215 or return DB_FTS_EXCEED_RESULT_CACHE_LIMIT */
3216 static
3217 dberr_t
3218 fts_query_filter_doc_ids(
3219 /*=====================*/
3220 fts_query_t* query, /*!< in: query instance */
3221 const fts_string_t* word, /*!< in: the current word */
3222 fts_word_freq_t* word_freq, /*!< in/out: word frequency */
3223 const fts_node_t* node, /*!< in: current FTS node */
3224 void* data, /*!< in: doc id ilist */
3225 ulint len, /*!< in: doc id ilist size */
3226 ibool calc_doc_count) /*!< in: whether to remember doc count */
3227 {
3228 const byte* ptr = static_cast<byte*>(data);
3229 doc_id_t doc_id = 0;
3230 ulint decoded = 0;
3231 ib_rbt_t* doc_freqs = word_freq->doc_freqs;
3232
3233 /* Decode the ilist and add the doc ids to the query doc_id set. */
3234 while (decoded < len) {
3235 ulint freq = 0;
3236 fts_doc_freq_t* doc_freq;
3237 fts_match_t* match = NULL;
3238 doc_id_t last_pos = 0;
3239 doc_id_t pos = fts_decode_vlc(&ptr);
3240
3241 /* Some sanity checks. */
3242 if (doc_id == 0) {
3243 ut_a(pos == node->first_doc_id);
3244 }
3245
3246 /* Add the delta. */
3247 doc_id += pos;
3248
3249 if (calc_doc_count) {
3250 word_freq->doc_count++;
3251 }
3252
3253 /* We simply collect the matching instances here. */
3254 if (query->collect_positions) {
3255 ib_alloc_t* heap_alloc;
3256
3257 /* Create a new fts_match_t instance. */
3258 match = static_cast<fts_match_t*>(
3259 ib_vector_push(query->matched, NULL));
3260
3261 match->start = 0;
3262 match->doc_id = doc_id;
3263 heap_alloc = ib_vector_allocator(query->matched);
3264
3265 /* Allocate from the same heap as the
3266 parent container. */
3267 match->positions = ib_vector_create(
3268 heap_alloc, sizeof(ulint), 64);
3269
3270 query->total_size += sizeof(fts_match_t)
3271 + sizeof(ib_vector_t)
3272 + sizeof(ulint) * 64;
3273 }
3274
3275 /* Unpack the positions within the document. */
3276 while (*ptr) {
3277 last_pos += fts_decode_vlc(&ptr);
3278
3279 /* Collect the matching word positions, for phrase
3280 matching later. */
3281 if (query->collect_positions) {
3282 ib_vector_push(match->positions, &last_pos);
3283 }
3284
3285 ++freq;
3286 }
3287
3288 /* End of list marker. */
3289 last_pos = (ulint) -1;
3290
3291 if (query->collect_positions) {
3292 ut_a(match != NULL);
3293 ib_vector_push(match->positions, &last_pos);
3294 }
3295
3296 /* Add the doc id to the doc freq rb tree, if the doc id
3297 doesn't exist it will be created. */
3298 doc_freq = fts_query_add_doc_freq(query, doc_freqs, doc_id);
3299
3300 /* Avoid duplicating frequency tally. */
3301 if (doc_freq->freq == 0) {
3302 doc_freq->freq = freq;
3303 }
3304
3305 /* Skip the end of word position marker. */
3306 ++ptr;
3307
3308 /* Bytes decoded so far */
3309 decoded = ulint(ptr - (byte*) data);
3310
3311 /* We simply collect the matching documents and the
3312 positions here and match later. */
3313 if (!query->collect_positions) {
3314 /* We ignore error here and will check it later */
3315 fts_query_process_doc_id(query, doc_id, 0);
3316
3317 /* Add the word to the document's matched RB tree. */
3318 fts_query_add_word_to_document(query, doc_id, word);
3319 }
3320 }
3321
3322 /* Some sanity checks. */
3323 ut_a(doc_id == node->last_doc_id);
3324
3325 if (query->total_size > fts_result_cache_limit) {
3326 return(DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
3327 } else {
3328 return(DB_SUCCESS);
3329 }
3330 }
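
/* Illustrative sketch of the ilist layout decoded above (values shown as
plain integers; on disk each value is variable-length coded and read by
fts_decode_vlc()):

	10  5 12 0   3  7 0

reads as: doc id delta 10 (doc id 10) with position deltas 5 and 12,
i.e. word positions {5, 17}, terminated by a 0 byte; then doc id delta 3
(doc id 13) with the single position 7, again terminated by 0.  The first
delta always equals node->first_doc_id and the accumulated doc id must end
up equal to node->last_doc_id, which is what the assertions check. */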
3331
3332 /*****************************************************************//**
3333 Read the FTS INDEX row.
3334 @return DB_SUCCESS if all go well. */
3335 static
3336 dberr_t
3337 fts_query_read_node(
3338 /*================*/
3339 fts_query_t* query, /*!< in: query instance */
3340 const fts_string_t* word, /*!< in: current word */
3341 que_node_t* exp) /*!< in: query graph node */
3342 {
3343 int i;
3344 int ret;
3345 fts_node_t node;
3346 ib_rbt_bound_t parent;
3347 fts_word_freq_t* word_freq;
3348 ibool skip = FALSE;
3349 fts_string_t term;
3350 byte buf[FTS_MAX_WORD_LEN + 1];
3351 dberr_t error = DB_SUCCESS;
3352
3353 ut_a(query->cur_node->type == FTS_AST_TERM
3354 || query->cur_node->type == FTS_AST_TEXT
3355 || query->cur_node->type == FTS_AST_PARSER_PHRASE_LIST);
3356
3357 memset(&node, 0, sizeof(node));
3358 term.f_str = buf;
3359
3360 	/* Need to consider the wildcard search case: the word frequency
3361 	is created on the search string, not the actual word, so we need
3362 	to tally the frequency on behalf of the search string. */
3363 if (query->cur_node->type == FTS_AST_TERM
3364 && query->cur_node->term.wildcard) {
3365
3366 term.f_len = query->cur_node->term.ptr->len;
3367 ut_ad(FTS_MAX_WORD_LEN >= term.f_len);
3368 memcpy(term.f_str, query->cur_node->term.ptr->str, term.f_len);
3369 } else {
3370 term.f_len = word->f_len;
3371 ut_ad(FTS_MAX_WORD_LEN >= word->f_len);
3372 memcpy(term.f_str, word->f_str, word->f_len);
3373 }
3374
3375 /* Lookup the word in our rb tree, it must exist. */
3376 ret = rbt_search(query->word_freqs, &parent, &term);
3377
3378 ut_a(ret == 0);
3379
3380 word_freq = rbt_value(fts_word_freq_t, parent.last);
3381
3382 /* Start from 1 since the first column has been read by the caller.
3383 	Also, we rely on the order of the projected columns to filter
3384 	out ilists that are out of range, and we always want to read
3385 	the doc_count irrespective of the suitability of the row. */
3386
3387 for (i = 1; exp && !skip; exp = que_node_get_next(exp), ++i) {
3388
3389 dfield_t* dfield = que_node_get_val(exp);
3390 byte* data = static_cast<byte*>(
3391 dfield_get_data(dfield));
3392 ulint len = dfield_get_len(dfield);
3393
3394 ut_a(len != UNIV_SQL_NULL);
3395
3396 /* Note: The column numbers below must match the SELECT. */
3397
3398 switch (i) {
3399 case 1: /* DOC_COUNT */
3400 word_freq->doc_count += mach_read_from_4(data);
3401 break;
3402
3403 case 2: /* FIRST_DOC_ID */
3404 node.first_doc_id = fts_read_doc_id(data);
3405
3406 /* Skip nodes whose doc ids are out range. */
3407 if (query->oper == FTS_EXIST
3408 && query->upper_doc_id > 0
3409 && node.first_doc_id > query->upper_doc_id) {
3410 skip = TRUE;
3411 }
3412 break;
3413
3414 case 3: /* LAST_DOC_ID */
3415 node.last_doc_id = fts_read_doc_id(data);
3416
3417 /* Skip nodes whose doc ids are out range. */
3418 if (query->oper == FTS_EXIST
3419 && query->lower_doc_id > 0
3420 && node.last_doc_id < query->lower_doc_id) {
3421 skip = TRUE;
3422 }
3423 break;
3424
3425 case 4: /* ILIST */
3426
3427 error = fts_query_filter_doc_ids(
3428 query, &word_freq->word, word_freq,
3429 &node, data, len, FALSE);
3430
3431 break;
3432
3433 default:
3434 ut_error;
3435 }
3436 }
3437
3438 if (!skip) {
3439 /* Make sure all columns were read. */
3440
3441 ut_a(i == 5);
3442 }
3443
3444 return error;
3445 }
3446
3447 /*****************************************************************//**
3448 Callback function to fetch the rows in an FTS INDEX record.
3449 @return TRUE to continue fetching, FALSE if an error was flagged */
3450 static
3451 ibool
3452 fts_query_index_fetch_nodes(
3453 /*========================*/
3454 void* row, /*!< in: sel_node_t* */
3455 void* user_arg) /*!< in: pointer to fts_fetch_t */
3456 {
3457 fts_string_t key;
3458 sel_node_t* sel_node = static_cast<sel_node_t*>(row);
3459 fts_fetch_t* fetch = static_cast<fts_fetch_t*>(user_arg);
3460 fts_query_t* query = static_cast<fts_query_t*>(fetch->read_arg);
3461 que_node_t* exp = sel_node->select_list;
3462 dfield_t* dfield = que_node_get_val(exp);
3463 void* data = dfield_get_data(dfield);
3464 ulint dfield_len = dfield_get_len(dfield);
3465
3466 key.f_str = static_cast<byte*>(data);
3467 key.f_len = dfield_len;
3468
3469 ut_a(dfield_len <= FTS_MAX_WORD_LEN);
3470
3471 /* Note: we pass error out by 'query->error' */
3472 query->error = fts_query_read_node(query, &key, que_node_get_next(exp));
3473
3474 if (query->error != DB_SUCCESS) {
3475 ut_ad(query->error == DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
3476 return(FALSE);
3477 } else {
3478 return(TRUE);
3479 }
3480 }
3481
3482 /*****************************************************************//**
3483 Calculate the inverse document frequency (IDF) for all the terms. */
3484 static
3485 void
3486 fts_query_calculate_idf(
3487 /*====================*/
3488 fts_query_t* query) /*!< in: Query state */
3489 {
3490 const ib_rbt_node_t* node;
3491 ib_uint64_t total_docs = query->total_docs;
3492
3493 	/* Iterate over all the search terms and calculate their
3494 	inverse document frequency (IDF). */
3495 for (node = rbt_first(query->word_freqs);
3496 node;
3497 node = rbt_next(query->word_freqs, node)) {
3498
3499 fts_word_freq_t* word_freq;
3500
3501 word_freq = rbt_value(fts_word_freq_t, node);
3502
3503 if (word_freq->doc_count > 0) {
3504 if (total_docs == word_freq->doc_count) {
3505 				/* The query processor assumes ranking > 0
3506 				if we find a match. Since log10(1) = 0, we
3507 				cannot let the IDF become zero when a word
3508 				appears in every document, so use an
3509 				arbitrarily small positive value instead. */
3510 word_freq->idf = log10(1.0001);
3511 } else {
3512 word_freq->idf = log10(
3513 total_docs
3514 / (double) word_freq->doc_count);
3515 }
3516 }
3517 }
3518 }
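
/* Worked example (numbers assumed for illustration): with
query->total_docs = 1000, a word found in 10 documents gets
idf = log10(1000 / 10) = 2, while a word found in every document gets the
floor value log10(1.0001), roughly 4.3e-5, so it still contributes a
positive (but negligible) weight to the ranking. */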
3519
3520 /*****************************************************************//**
3521 Calculate the ranking of the document. */
3522 static
3523 void
3524 fts_query_calculate_ranking(
3525 /*========================*/
3526 const fts_query_t* query, /*!< in: query state */
3527 fts_ranking_t* ranking) /*!< in: Document to rank */
3528 {
3529 ulint pos = 0;
3530 fts_string_t word;
3531
3532 /* At this stage, ranking->rank should not exceed the 1.0
3533 bound */
3534 ut_ad(ranking->rank <= 1.0 && ranking->rank >= -1.0);
3535 ut_ad(rbt_size(query->word_map) == query->word_vector->size());
3536
3537 while (fts_ranking_words_get_next(query, ranking, &pos, &word)) {
3538 int ret;
3539 ib_rbt_bound_t parent;
3540 double weight;
3541 fts_doc_freq_t* doc_freq;
3542 fts_word_freq_t* word_freq;
3543
3544 ret = rbt_search(query->word_freqs, &parent, &word);
3545
3546 /* It must exist. */
3547 ut_a(ret == 0);
3548
3549 word_freq = rbt_value(fts_word_freq_t, parent.last);
3550
3551 ret = rbt_search(
3552 word_freq->doc_freqs, &parent, &ranking->doc_id);
3553
3554 /* It must exist. */
3555 ut_a(ret == 0);
3556
3557 doc_freq = rbt_value(fts_doc_freq_t, parent.last);
3558
3559 weight = (double) doc_freq->freq * word_freq->idf;
3560
3561 ranking->rank += (fts_rank_t) (weight * word_freq->idf);
3562 }
3563 }
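
/* Worked example (numbers assumed for illustration): the loop above adds
freq * idf * idf for every query word that occurs in the document.  If
"apple" has idf = 2 and occurs 3 times in this document, and "pie" has
idf = 1 and occurs once, then
	rank += 3 * 2 * 2 + 1 * 1 * 1 = 13
on top of any upgrade/downgrade value already stored in ranking->rank. */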
3564
3565 /*****************************************************************//**
3566 Add ranking to the result set. */
3567 static
3568 void
3569 fts_query_add_ranking(
3570 /*==================*/
3571 fts_query_t* query, /*!< in: query state */
3572 ib_rbt_t* ranking_tree, /*!< in: ranking tree */
3573 const fts_ranking_t* new_ranking) /*!< in: ranking of a document */
3574 {
3575 ib_rbt_bound_t parent;
3576
3577 /* Lookup the ranking in our rb tree and add if it doesn't exist. */
3578 if (rbt_search(ranking_tree, &parent, new_ranking) == 0) {
3579 fts_ranking_t* ranking;
3580
3581 ranking = rbt_value(fts_ranking_t, parent.last);
3582
3583 ranking->rank += new_ranking->rank;
3584
3585 ut_a(ranking->words == NULL);
3586 } else {
3587 rbt_add_node(ranking_tree, &parent, new_ranking);
3588
3589 query->total_size += SIZEOF_RBT_NODE_ADD
3590 + sizeof(fts_ranking_t);
3591 }
3592 }
3593
3594 /*****************************************************************//**
3595 Retrieve the FTS Relevance Ranking result for doc with doc_id
3596 @return the relevance ranking value, 0 if no ranking value
3597 present. */
3598 float
3599 fts_retrieve_ranking(
3600 /*=================*/
3601 fts_result_t* result, /*!< in: FTS result structure */
3602 doc_id_t doc_id) /*!< in: doc_id of the item to retrieve */
3603 {
3604 ib_rbt_bound_t parent;
3605 fts_ranking_t new_ranking;
3606
3607 DBUG_ENTER("fts_retrieve_ranking");
3608
3609 if (!result || !result->rankings_by_id) {
3610 DBUG_RETURN(0);
3611 }
3612
3613 new_ranking.doc_id = doc_id;
3614
3615 /* Lookup the ranking in our rb tree */
3616 if (rbt_search(result->rankings_by_id, &parent, &new_ranking) == 0) {
3617 fts_ranking_t* ranking;
3618
3619 ranking = rbt_value(fts_ranking_t, parent.last);
3620
3621 DBUG_RETURN(ranking->rank);
3622 }
3623
3624 DBUG_RETURN(0);
3625 }
3626
3627 /*****************************************************************//**
3628 Create the result and copy the data to it. */
3629 static
3630 fts_result_t*
3631 fts_query_prepare_result(
3632 /*=====================*/
3633 fts_query_t* query, /*!< in: Query state */
3634 fts_result_t* result) /*!< in: result this can contain
3635 data from a previous search on
3636 another FTS index */
3637 {
3638 const ib_rbt_node_t* node;
3639 bool result_is_null = false;
3640
3641 DBUG_ENTER("fts_query_prepare_result");
3642
3643 if (result == NULL) {
3644 result = static_cast<fts_result_t*>(
3645 ut_zalloc_nokey(sizeof(*result)));
3646
3647 result->rankings_by_id = rbt_create(
3648 sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
3649
3650 query->total_size += sizeof(fts_result_t) + SIZEOF_RBT_CREATE;
3651 result_is_null = true;
3652 }
3653
3654 if (query->flags == FTS_OPT_RANKING) {
3655 fts_word_freq_t* word_freq;
3656 ulint size = ib_vector_size(query->deleted->doc_ids);
3657 doc_id_t* updates =
3658 (doc_id_t*) query->deleted->doc_ids->data;
3659
3660 node = rbt_first(query->word_freqs);
3661 ut_ad(node);
3662 word_freq = rbt_value(fts_word_freq_t, node);
3663
3664 for (node = rbt_first(word_freq->doc_freqs);
3665 node;
3666 node = rbt_next(word_freq->doc_freqs, node)) {
3667 fts_doc_freq_t* doc_freq;
3668 fts_ranking_t ranking;
3669
3670 doc_freq = rbt_value(fts_doc_freq_t, node);
3671
3672 /* Don't put deleted docs into result */
3673 if (fts_bsearch(updates, 0, static_cast<int>(size),
3674 doc_freq->doc_id) >= 0) {
3675 /* one less matching doc count */
3676 --word_freq->doc_count;
3677 continue;
3678 }
3679
3680 ranking.doc_id = doc_freq->doc_id;
3681 ranking.rank = static_cast<fts_rank_t>(doc_freq->freq);
3682 ranking.words = NULL;
3683
3684 fts_query_add_ranking(query, result->rankings_by_id,
3685 &ranking);
3686
3687 if (query->total_size > fts_result_cache_limit) {
3688 query->error = DB_FTS_EXCEED_RESULT_CACHE_LIMIT;
3689 fts_query_free_result(result);
3690 DBUG_RETURN(NULL);
3691 }
3692 }
3693
3694 /* Calculate IDF only after we exclude the deleted items */
3695 fts_query_calculate_idf(query);
3696
3697 node = rbt_first(query->word_freqs);
3698 word_freq = rbt_value(fts_word_freq_t, node);
3699
3700 /* Calculate the ranking for each doc */
3701 for (node = rbt_first(result->rankings_by_id);
3702 node != NULL;
3703 node = rbt_next(result->rankings_by_id, node)) {
3704
3705 fts_ranking_t* ranking;
3706
3707 ranking = rbt_value(fts_ranking_t, node);
3708
3709 ranking->rank = static_cast<fts_rank_t>(
3710 ranking->rank * word_freq->idf * word_freq->idf);
3711 }
3712
3713 DBUG_RETURN(result);
3714 }
3715
3716 ut_a(rbt_size(query->doc_ids) > 0);
3717
3718 for (node = rbt_first(query->doc_ids);
3719 node;
3720 node = rbt_next(query->doc_ids, node)) {
3721
3722 fts_ranking_t* ranking;
3723
3724 ranking = rbt_value(fts_ranking_t, node);
3725 fts_query_calculate_ranking(query, ranking);
3726
3727 		// FIXME: I think we may require this information to improve the
3728 // ranking of doc ids which have more word matches from
3729 // different FTS indexes.
3730
3731 /* We don't need these anymore free the resources. */
3732 ranking->words = NULL;
3733
3734 if (!result_is_null) {
3735 fts_query_add_ranking(query, result->rankings_by_id, ranking);
3736
3737 if (query->total_size > fts_result_cache_limit) {
3738 query->error = DB_FTS_EXCEED_RESULT_CACHE_LIMIT;
3739 fts_query_free_result(result);
3740 DBUG_RETURN(NULL);
3741 }
3742 }
3743 }
3744
3745 if (result_is_null) {
3746 /* Use doc_ids directly */
3747 rbt_free(result->rankings_by_id);
3748 result->rankings_by_id = query->doc_ids;
3749 query->doc_ids = NULL;
3750 }
3751
3752 DBUG_RETURN(result);
3753 }
3754
3755 /*****************************************************************//**
3756 Get the result of the query. Calculate the similarity coefficient. */
3757 static
3758 fts_result_t*
3759 fts_query_get_result(
3760 /*=================*/
3761 fts_query_t* query, /*!< in: query instance */
3762 fts_result_t* result) /*!< in: result */
3763 {
3764 DBUG_ENTER("fts_query_get_result");
3765
3766 if (rbt_size(query->doc_ids) > 0 || query->flags == FTS_OPT_RANKING) {
3767 /* Copy the doc ids to the result. */
3768 result = fts_query_prepare_result(query, result);
3769 } else {
3770 /* Create an empty result instance. */
3771 result = static_cast<fts_result_t*>(
3772 ut_zalloc_nokey(sizeof(*result)));
3773 }
3774
3775 DBUG_RETURN(result);
3776 }
3777
3778 /*****************************************************************//**
3779 FTS Query free resources and reset. */
3780 static
3781 void
3782 fts_query_free(
3783 /*===========*/
3784 fts_query_t* query) /*!< in: query instance to free*/
3785 {
3786
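	/* Release everything the query owns, roughly in the order it was
	created: the prepared read graph, the parse tree, the deleted doc
	id list, the doc id trees, the per-word statistics (including each
	word's doc_freqs tree), the wildcard word set, the word map and
	vector, and finally the heap. The struct is then zeroed so that
	any stale reuse is easy to spot. */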
3787 if (query->read_nodes_graph) {
3788 fts_que_graph_free(query->read_nodes_graph);
3789 }
3790
3791 if (query->root) {
3792 fts_ast_free_node(query->root);
3793 }
3794
3795 if (query->deleted) {
3796 fts_doc_ids_free(query->deleted);
3797 }
3798
3799 if (query->intersection) {
3800 fts_query_free_doc_ids(query, query->intersection);
3801 }
3802
3803 if (query->doc_ids) {
3804 fts_query_free_doc_ids(query, query->doc_ids);
3805 }
3806
3807 if (query->word_freqs) {
3808 const ib_rbt_node_t* node;
3809
3810 /* We need to free any instances of fts_doc_freq_t that we
3811 may have allocated. */
3812 for (node = rbt_first(query->word_freqs);
3813 node;
3814 node = rbt_next(query->word_freqs, node)) {
3815
3816 fts_word_freq_t* word_freq;
3817
3818 word_freq = rbt_value(fts_word_freq_t, node);
3819
3820 			/* Free the per-word doc_freqs tree. */
3821 rbt_free(word_freq->doc_freqs);
3822 }
3823
3824 rbt_free(query->word_freqs);
3825 }
3826
3827 if (query->wildcard_words != NULL) {
3828 rbt_free(query->wildcard_words);
3829 }
3830
3831 ut_a(!query->intersection);
3832
3833 if (query->word_map) {
3834 rbt_free(query->word_map);
3835 }
3836
3837 if (query->word_vector != NULL) {
3838 UT_DELETE(query->word_vector);
3839 }
3840
3841 if (query->heap) {
3842 mem_heap_free(query->heap);
3843 }
3844
3845 memset(query, 0, sizeof(*query));
3846 }
3847
3848 /*****************************************************************//**
3849 Parse the query using flex/bison or plugin parser.
3850 @return parse tree node. */
3851 static
3852 fts_ast_node_t*
3853 fts_query_parse(
3854 /*============*/
3855 fts_query_t* query, /*!< in: query instance */
3856 byte* query_str, /*!< in: query string */
3857 ulint query_len) /*!< in: query string length */
3858 {
3859 int error;
3860 fts_ast_state_t state;
3861 bool mode = query->boolean_mode;
3862 DBUG_ENTER("fts_query_parse");
3863
3864 memset(&state, 0x0, sizeof(state));
3865
3866 state.charset = query->fts_index_table.charset;
3867
3868 DBUG_EXECUTE_IF("fts_instrument_query_disable_parser",
3869 query->parser = NULL;);
3870
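	/* If the index was created with a plugin parser, let the plugin
	build the AST through fts_parse_by_parser(); otherwise use the
	built-in flex/bison grammar, creating a lexer whose behaviour
	depends on whether this is a boolean mode query. */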
3871 if (query->parser) {
3872 state.root = state.cur_node =
3873 fts_ast_create_node_list(&state, NULL);
3874 error = fts_parse_by_parser(mode, query_str, query_len,
3875 query->parser, &state);
3876 } else {
3877 /* Setup the scanner to use, this depends on the mode flag. */
3878 state.lexer = fts_lexer_create(mode, query_str, query_len);
3879 state.charset = query->fts_index_table.charset;
3880 error = fts_parse(&state);
3881 fts_lexer_free(state.lexer);
3882 state.lexer = NULL;
3883 }
3884
3885 /* Error during parsing ? */
3886 if (error) {
3887 /* Free the nodes that were allocated during parsing. */
3888 fts_ast_state_free(&state);
3889 } else {
3890 query->root = state.root;
3891
3892 if (UNIV_UNLIKELY(fts_enable_diag_print) && query->root) {
3893 fts_ast_node_print(query->root);
3894 }
3895 }
3896
3897 DBUG_RETURN(state.root);
3898 }
3899
3900 /*******************************************************************//**
3901 FTS Query optimization
3902 Set FTS_OPT_RANKING if it is a simple term query */
3903 static
3904 void
3905 fts_query_can_optimize(
3906 /*===================*/
3907 fts_query_t* query, /*!< in/out: query instance */
3908 uint flags) /*!< In: FTS search mode */
3909 {
3910 fts_ast_node_t* node = query->root;
3911
3912 if (flags & FTS_EXPAND) {
3913 return;
3914 }
3915
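	/* Illustrative example (assumed query shapes): a plain
	MATCH(col) AGAINST('word') parses into a list containing a single
	FTS_AST_TERM node and qualifies for FTS_OPT_RANKING, whereas
	MATCH(col) AGAINST('+word1 -word2' IN BOOLEAN MODE) produces
	multiple nodes and is executed through the normal path. */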
3916 	/* Check if the query consists of a single term with no operator */
3917 ut_ad(node->type == FTS_AST_LIST);
3918 node = node->list.head;
3919 if (node != NULL && node->type == FTS_AST_TERM && node->next == NULL) {
3920 query->flags = FTS_OPT_RANKING;
3921 }
3922 }
3923
3924 /** FTS Query entry point.
3925 @param[in,out] trx transaction
3926 @param[in] index fts index to search
3927 @param[in] flags FTS search mode
3928 @param[in] query_str FTS query
3929 @param[in] query_len FTS query string len in bytes
3930 @param[in,out] result result doc ids
3931 @return DB_SUCCESS if successful otherwise error code */
3932 dberr_t
3933 fts_query(
3934 trx_t* trx,
3935 dict_index_t* index,
3936 uint flags,
3937 const byte* query_str,
3938 ulint query_len,
3939 fts_result_t** result)
3940 {
3941 fts_query_t query;
3942 dberr_t error = DB_SUCCESS;
3943 byte* lc_query_str;
3944 ulint lc_query_str_len;
3945 ulint result_len;
3946 bool boolean_mode;
3947 trx_t* query_trx; /* FIXME: use provided trx */
3948 CHARSET_INFO* charset;
3949 ulint start_time_ms;
3950 bool will_be_ignored = false;
3951
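	/* Overall flow: run the search inside an internal transaction,
	collect the deleted doc ids (so they can be filtered out), lower
	case the query string unless the collation is binary, parse it
	into an AST, execute the AST with fts_ast_visit(), optionally
	expand the query (FTS_EXPAND), calculate the IDF values and
	finally copy the rankings into *result. */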
3952 boolean_mode = flags & FTS_BOOL;
3953
3954 *result = NULL;
3955 memset(&query, 0x0, sizeof(query));
3956 query_trx = trx_create();
3957 query_trx->op_info = "FTS query";
3958
3959 start_time_ms = ut_time_ms();
3960
3961 query.trx = query_trx;
3962 query.index = index;
3963 query.boolean_mode = boolean_mode;
3964 query.deleted = fts_doc_ids_create();
3965 query.cur_node = NULL;
3966
3967 query.fts_common_table.type = FTS_COMMON_TABLE;
3968 query.fts_common_table.table_id = index->table->id;
3969 query.fts_common_table.table = index->table;
3970
3971 charset = fts_index_get_charset(index);
3972
3973 query.fts_index_table.type = FTS_INDEX_TABLE;
3974 query.fts_index_table.index_id = index->id;
3975 query.fts_index_table.table_id = index->table->id;
3976 query.fts_index_table.charset = charset;
3977 query.fts_index_table.table = index->table;
3978
3979 query.word_map = rbt_create_arg_cmp(
3980 sizeof(fts_string_t), innobase_fts_text_cmp, (void*)charset);
3981 query.word_vector = UT_NEW_NOKEY(word_vector_t());
3982 query.error = DB_SUCCESS;
3983
3984 /* Setup the RB tree that will be used to collect per term
3985 statistics. */
3986 query.word_freqs = rbt_create_arg_cmp(
3987 sizeof(fts_word_freq_t), innobase_fts_text_cmp,
3988 (void*) charset);
3989
3990 if (flags & FTS_EXPAND) {
3991 query.wildcard_words = rbt_create_arg_cmp(
3992 sizeof(fts_string_t), innobase_fts_text_cmp, (void *)charset);
3993 }
3994
3995 query.total_size += SIZEOF_RBT_CREATE;
3996
3997 query.total_docs = dict_table_get_n_rows(index->table);
3998
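	/* Deleted doc ids come from three places: the DELETED auxiliary
	table, the DELETED_CACHE auxiliary table and the in-memory cache.
	They are merged into query.deleted and sorted so that the result
	filtering can use a binary search (fts_bsearch). */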
3999 query.fts_common_table.suffix = "DELETED";
4000
4001 /* Read the deleted doc_ids, we need these for filtering. */
4002 error = fts_table_fetch_doc_ids(
4003 NULL, &query.fts_common_table, query.deleted);
4004
4005 if (error != DB_SUCCESS) {
4006 goto func_exit;
4007 }
4008
4009 query.fts_common_table.suffix = "DELETED_CACHE";
4010
4011 error = fts_table_fetch_doc_ids(
4012 NULL, &query.fts_common_table, query.deleted);
4013
4014 if (error != DB_SUCCESS) {
4015 goto func_exit;
4016 }
4017
4018 /* Get the deleted doc ids that are in the cache. */
4019 fts_cache_append_deleted_doc_ids(
4020 index->table->fts->cache, query.deleted->doc_ids);
4021 DEBUG_SYNC_C("fts_deleted_doc_ids_append");
4022
4023 /* Sort the vector so that we can do a binary search over the ids. */
4024 ib_vector_sort(query.deleted->doc_ids, fts_doc_id_cmp);
4025
4026 	/* Convert the query string to lower case before parsing. We own
4027 	the ut_malloc'ed buffer and must remember to free it before returning. */
4028
4029 lc_query_str_len = query_len * charset->casedn_multiply + 1;
4030 lc_query_str = static_cast<byte*>(ut_malloc_nokey(lc_query_str_len));
4031
4032 /* For binary collations, a case sensitive search is
4033 performed. Hence don't convert to lower case. */
4034 if (my_binary_compare(charset)) {
4035 memcpy(lc_query_str, query_str, query_len);
4036 lc_query_str[query_len]= 0;
4037 result_len= query_len;
4038 } else {
4039 result_len = innobase_fts_casedn_str(
4040 charset, (char*)( query_str), query_len,
4041 (char*)(lc_query_str), lc_query_str_len);
4042 }
4043
4044 ut_ad(result_len < lc_query_str_len);
4045
4046 lc_query_str[result_len] = 0;
4047
4048 query.heap = mem_heap_create(128);
4049
4050 /* Create the rb tree for the doc id (current) set. */
4051 query.doc_ids = rbt_create(
4052 sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
4053 query.parser = index->parser;
4054
4055 query.total_size += SIZEOF_RBT_CREATE;
4056
4057 /* Parse the input query string. */
4058 if (fts_query_parse(&query, lc_query_str, result_len)) {
4059 fts_ast_node_t* ast = query.root;
4060 ast->trx = trx;
4061
4062 /* Optimize query to check if it's a single term */
4063 fts_query_can_optimize(&query, flags);
4064
4065 DBUG_EXECUTE_IF("fts_instrument_result_cache_limit",
4066 fts_result_cache_limit = 2048;
4067 );
4068
4069 /* Traverse the Abstract Syntax Tree (AST) and execute
4070 the query. */
4071 query.error = fts_ast_visit(
4072 FTS_NONE, ast, fts_query_visitor,
4073 &query, &will_be_ignored);
4074 if (query.error == DB_INTERRUPTED) {
4075 error = DB_INTERRUPTED;
4076 ut_free(lc_query_str);
4077 goto func_exit;
4078 }
4079
4080 /* If query expansion is requested, extend the search
4081 with first search pass result */
4082 if (query.error == DB_SUCCESS && (flags & FTS_EXPAND)) {
4083 query.error = fts_expand_query(index, &query);
4084 }
4085
4086 /* Calculate the inverse document frequency of the terms. */
4087 if (query.error == DB_SUCCESS
4088 && query.flags != FTS_OPT_RANKING) {
4089 fts_query_calculate_idf(&query);
4090 }
4091
4092 /* Copy the result from the query state, so that we can
4093 return it to the caller. */
4094 if (query.error == DB_SUCCESS) {
4095 *result = fts_query_get_result(&query, *result);
4096 }
4097
4098 error = query.error;
4099 } else {
4100 /* still return an empty result set */
4101 *result = static_cast<fts_result_t*>(
4102 ut_zalloc_nokey(sizeof(**result)));
4103 }
4104
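	/* If the user transaction was interrupted (e.g. by KILL QUERY),
	throw away any partially built result and report the error. */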
4105 if (trx_is_interrupted(trx)) {
4106 error = DB_INTERRUPTED;
4107 ut_free(lc_query_str);
4108 if (*result) {
4109 fts_query_free_result(*result);
4110 }
4111 goto func_exit;
4112 }
4113
4114 ut_free(lc_query_str);
4115
4116 if (UNIV_UNLIKELY(fts_enable_diag_print) && (*result)) {
4117 ulint diff_time = ut_time_ms() - start_time_ms;
4118
4119 ib::info() << "FTS Search Processing time: "
4120 << diff_time / 1000 << " secs: " << diff_time % 1000
4121 << " millisec: row(s) "
4122 << ((*result)->rankings_by_id
4123 ? lint(rbt_size((*result)->rankings_by_id))
4124 : -1);
4125
4126 /* Log memory consumption & result size */
4127 ib::info() << "Full Search Memory: " << query.total_size
4128 << " (bytes), Row: "
4129 << ((*result)->rankings_by_id
4130 ? rbt_size((*result)->rankings_by_id)
4131 : 0)
4132 << ".";
4133 }
4134
4135 func_exit:
4136 fts_query_free(&query);
4137
4138 query_trx->free();
4139
4140 return(error);
4141 }
4142
4143 /*****************************************************************//**
4144 FTS Query free result, returned by fts_query(). */
4145 void
4146 fts_query_free_result(
4147 /*==================*/
4148 fts_result_t* result) /*!< in: result instance to free.*/
4149 {
4150 if (result) {
4151 if (result->rankings_by_id != NULL) {
4152 rbt_free(result->rankings_by_id);
4153 result->rankings_by_id = NULL;
4154 }
4155 if (result->rankings_by_rank != NULL) {
4156 rbt_free(result->rankings_by_rank);
4157 result->rankings_by_rank = NULL;
4158 }
4159
4160 ut_free(result);
4161 result = NULL;
4162 }
4163 }
4164
4165 /*****************************************************************//**
4166 FTS Query sort result, returned by fts_query() on fts_ranking_t::rank. */
4167 void
4168 fts_query_sort_result_on_rank(
4169 /*==========================*/
4170 fts_result_t* result) /*!< out: result instance to sort.*/
4171 {
4172 const ib_rbt_node_t* node;
4173 ib_rbt_t* ranked;
4174
4175 ut_a(result->rankings_by_id != NULL);
4176 if (result->rankings_by_rank) {
4177 rbt_free(result->rankings_by_rank);
4178 }
4179
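	/* The new tree is keyed on fts_ranking_t::rank (via
	fts_query_compare_rank), so an in-order traversal returns the
	documents in rank order; rankings_by_id stays keyed on doc id. */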
4180 ranked = rbt_create(sizeof(fts_ranking_t), fts_query_compare_rank);
4181
4182 	/* Copy each ranking into the new tree, which is keyed
4183 	on rank rather than on doc id. */
4184 for (node = rbt_first(result->rankings_by_id);
4185 node;
4186 node = rbt_next(result->rankings_by_id, node)) {
4187
4188 fts_ranking_t* ranking;
4189
4190 ranking = rbt_value(fts_ranking_t, node);
4191
4192 ut_a(ranking->words == NULL);
4193
4194 rbt_insert(ranked, ranking, ranking);
4195 }
4196
4197 /* Reset the current node too. */
4198 result->current = NULL;
4199 result->rankings_by_rank = ranked;
4200 }
4201
4202 /*******************************************************************//**
4203 A debug function to print result doc_id set. */
4204 static
4205 void
4206 fts_print_doc_id(
4207 /*=============*/
4208 fts_query_t* query) /*!< in : tree that stores doc_ids.*/
4209 {
4210 const ib_rbt_node_t* node;
4211
4212 /* Iterate each member of the doc_id set */
4213 for (node = rbt_first(query->doc_ids);
4214 node;
4215 node = rbt_next(query->doc_ids, node)) {
4216 fts_ranking_t* ranking;
4217 ranking = rbt_value(fts_ranking_t, node);
4218
4219 ib::info() << "doc_ids info, doc_id: " << ranking->doc_id;
4220
4221 ulint pos = 0;
4222 fts_string_t word;
4223
4224 while (fts_ranking_words_get_next(query, ranking, &pos, &word)) {
4225 ib::info() << "doc_ids info, value: " << word.f_str;
4226 }
4227 }
4228 }
4229
4230 /*************************************************************//**
4231 This function implements a simple "blind" query expansion search:
4232 words in documents found in the first search pass will be used as
4233 search arguments to search the documents again, thus "expanding"
4234 the search result set.
4235 @return DB_SUCCESS if success, otherwise the error code */
4236 static MY_ATTRIBUTE((nonnull, warn_unused_result))
4237 dberr_t
4238 fts_expand_query(
4239 /*=============*/
4240 dict_index_t* index, /*!< in: FTS index to search */
4241 fts_query_t* query) /*!< in: FTS query instance */
4242 {
4243 const ib_rbt_node_t* node;
4244 const ib_rbt_node_t* token_node;
4245 fts_doc_t result_doc;
4246 dberr_t error = DB_SUCCESS;
4247 const fts_index_cache_t*index_cache;
4248
4249 /* If no doc is found in first search pass, return */
4250 if (!rbt_size(query->doc_ids)) {
4251 return(error);
4252 }
4253
4254 /* Init "result_doc", to hold words from the first search pass */
4255 fts_doc_init(&result_doc);
4256
4257 rw_lock_x_lock(&index->table->fts->cache->lock);
4258 index_cache = fts_find_index_cache(index->table->fts->cache, index);
4259 rw_lock_x_unlock(&index->table->fts->cache->lock);
4260
4261 ut_a(index_cache);
4262
4263 result_doc.tokens = rbt_create_arg_cmp(
4264 sizeof(fts_token_t), innobase_fts_text_cmp,
4265 (void*) index_cache->charset);
4266
4267 result_doc.charset = index_cache->charset;
4268 result_doc.parser = index_cache->index->parser;
4269
4270 query->total_size += SIZEOF_RBT_CREATE;
4271
4272 if (UNIV_UNLIKELY(fts_enable_diag_print)) {
4273 fts_print_doc_id(query);
4274 }
4275
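	/* First pass over the result set: fetch and re-tokenize every
	matching document, accumulating its words in result_doc.tokens.
	The estimated memory cost is charged against
	fts_result_cache_limit, so an oversized expansion is aborted with
	DB_FTS_EXCEED_RESULT_CACHE_LIMIT instead of growing without
	bound. */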
4276 for (node = rbt_first(query->doc_ids);
4277 node;
4278 node = rbt_next(query->doc_ids, node)) {
4279
4280 fts_ranking_t* ranking;
4281 ulint prev_token_size;
4282 ulint estimate_size;
4283
4284 prev_token_size = rbt_size(result_doc.tokens);
4285
4286 ranking = rbt_value(fts_ranking_t, node);
4287
4288 		/* Fetch the documents with the doc_id from the
4289 		result of the first search pass. Since we do not
4290 		store a document-to-word mapping, we need to
4291 		fetch the original document and parse it.
4292 		Future optimization could be done here if we
4293 		support some form of document-to-word mapping */
4294 fts_doc_fetch_by_doc_id(NULL, ranking->doc_id, index,
4295 FTS_FETCH_DOC_BY_ID_EQUAL,
4296 fts_query_expansion_fetch_doc,
4297 &result_doc);
4298
4299 /* Estimate memory used, see fts_process_token and fts_token_t.
4300 We ignore token size here. */
4301 estimate_size = (rbt_size(result_doc.tokens) - prev_token_size)
4302 * (SIZEOF_RBT_NODE_ADD + sizeof(fts_token_t)
4303 + sizeof(ib_vector_t) + sizeof(ulint) * 32);
4304 query->total_size += estimate_size;
4305
4306 if (query->total_size > fts_result_cache_limit) {
4307 error = DB_FTS_EXCEED_RESULT_CACHE_LIMIT;
4308 goto func_exit;
4309 }
4310 }
4311
4312 /* Remove words that have already been searched in the first pass */
4313 for (ulint i = 0; i < query->word_vector->size(); i++) {
4314 fts_string_t word = query->word_vector->at(i);
4315 ib_rbt_bound_t parent;
4316
4317 if (query->wildcard_words
4318 && rbt_search(query->wildcard_words, &parent, &word) == 0) {
4319 /* If it's a wildcard word, remove words having
4320 it as prefix. */
4321 while (rbt_search_cmp(result_doc.tokens,
4322 &parent, &word, NULL,
4323 innobase_fts_text_cmp_prefix)
4324 == 0) {
4325 ut_free(rbt_remove_node(result_doc.tokens,
4326 parent.last));
4327 }
4328 } else {
4329 			/* We don't check the return value, because the word
4330 			may have been deleted by a previous wildcard word as
4331 			its prefix, e.g. ('g * good'). */
4332 rbt_delete(result_doc.tokens, &word);
4333 }
4334 }
4335
4336 /* Search the table the second time with expanded search list */
4337 for (token_node = rbt_first(result_doc.tokens);
4338 token_node;
4339 token_node = rbt_next(result_doc.tokens, token_node)) {
4340 fts_token_t* mytoken;
4341 mytoken = rbt_value(fts_token_t, token_node);
4342
4343 		/* A trailing '%' is treated as a prefix search; it can
4344 		cause an assertion failure, so we skip such tokens. */
4345 if (mytoken->text.f_str[mytoken->text.f_len - 1] == '%') {
4346 continue;
4347 }
4348
4349 ut_ad(mytoken->text.f_str[mytoken->text.f_len] == 0);
4350 fts_query_add_word_freq(query, &mytoken->text);
4351 error = fts_query_union(query, &mytoken->text);
4352
4353 if (error != DB_SUCCESS) {
4354 break;
4355 }
4356 }
4357
4358 func_exit:
4359 fts_doc_free(&result_doc);
4360
4361 return(error);
4362 }
4363 /*************************************************************//**
4364 This function finds documents that contain all words in a
4365 phrase or proximity search. For a proximity search, it also
4366 verifies that the words are close enough to each other, i.e.
4367 within the specified distance.
4368 @return TRUE if documents are found, FALSE otherwise */
4369 static
4370 ibool
4371 fts_phrase_or_proximity_search(
4372 /*===========================*/
4373 fts_query_t* query, /*!< in/out: query instance.
4374 query->doc_ids might be instantiated
4375 with qualified doc IDs */
4376 ib_vector_t* tokens) /*!< in: Tokens contain words */
4377 {
4378 ulint n_matched;
4379 ulint i;
4380 ibool matched = FALSE;
4381 ulint num_token = ib_vector_size(tokens);
4382 fts_match_t* match[MAX_PROXIMITY_ITEM];
4383 ibool end_list = FALSE;
4384
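	/* match_array[i] holds, for token i, the fts_match_t entries
	(doc id plus word positions), which the merge below assumes are
	in ascending doc id order; the loop finds doc ids common to every
	token. Illustrative example (assumed): for the boolean proximity
	query '"red apple" @ 5' there are two tokens, and a document
	qualifies only if it occurs in both lists and the position check
	further down succeeds. */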
4385 /* Number of matched documents for the first token */
4386 n_matched = ib_vector_size(query->match_array[0]);
4387
4388 	/* We have a match list for each word; walk through the
4389 	lists and find the common documents that contain all the
4390 	matching words. */
4391 for (i = 0; i < n_matched; i++) {
4392 ulint j;
4393 ulint k = 0;
4394 fts_proximity_t qualified_pos;
4395
4396 match[0] = static_cast<fts_match_t*>(
4397 ib_vector_get(query->match_array[0], i));
4398
4399 		/* For each remaining token's match list, check
4400 		whether it contains a document with the same
4401 		doc id */
4402 for (j = 1; j < num_token; j++) {
4403 match[j] = static_cast<fts_match_t*>(
4404 ib_vector_get(query->match_array[j], k));
4405
4406 while (match[j]->doc_id < match[0]->doc_id
4407 && k < ib_vector_size(query->match_array[j])) {
4408 match[j] = static_cast<fts_match_t*>(
4409 ib_vector_get(
4410 query->match_array[j], k));
4411 k++;
4412 }
4413
4414 if (match[j]->doc_id > match[0]->doc_id) {
4415 /* no match */
4416 if (query->flags & FTS_PHRASE) {
4417 match[0]->doc_id = 0;
4418 }
4419 break;
4420 }
4421
4422 if (k == ib_vector_size(query->match_array[j])) {
4423 end_list = TRUE;
4424
4425 if (query->flags & FTS_PHRASE) {
4426 ulint s;
4427 				/* match_array[j] has been exhausted, so
4428 				entries after position i in match_array[0]
4429 				can no longer match; invalidate them. */
4430 fts_match_t* match_temp;
4431 for (s = i + 1; s < n_matched; s++) {
4432 match_temp = static_cast<
4433 fts_match_t*>(ib_vector_get(
4434 query->match_array[0], s));
4435 match_temp->doc_id = 0;
4436 }
4437
4438 if (match[j]->doc_id !=
4439 match[0]->doc_id) {
4440 /* no match */
4441 match[0]->doc_id = 0;
4442 }
4443 }
4444
4445 if (match[j]->doc_id != match[0]->doc_id) {
4446 goto func_exit;
4447 }
4448 }
4449
4450 			/* FIXME: A better solution would be a counter array
4451 			that remembers each run's last position, so we don't
4452 			have to reset it here every time. */
4453 k = 0;
4454 }
4455
4456 if (j != num_token) {
4457 continue;
4458 }
4459
4460 /* For this matching doc, we need to further
4461 verify whether the words in the doc are close
4462 to each other, and within the distance specified
4463 in the proximity search */
4464 if (query->flags & FTS_PHRASE) {
4465 matched = TRUE;
4466 } else if (fts_proximity_get_positions(
4467 match, num_token, ULINT_MAX, &qualified_pos)) {
4468
4469 			/* Fetch the original documents and count the
4470 			words between the matching words to check that they
4471 			are within the specified distance */
4472 if (fts_query_is_in_proximity_range(
4473 query, match, &qualified_pos)) {
4474 				/* If so, mark that we have found a matching doc */
4475 query->error = fts_query_process_doc_id(
4476 query, match[0]->doc_id, 0);
4477 if (query->error != DB_SUCCESS) {
4478 matched = FALSE;
4479 goto func_exit;
4480 }
4481
4482 matched = TRUE;
4483 for (ulint z = 0; z < num_token; z++) {
4484 fts_string_t* token;
4485 token = static_cast<fts_string_t*>(
4486 ib_vector_get(tokens, z));
4487 fts_query_add_word_to_document(
4488 query, match[0]->doc_id, token);
4489 }
4490 }
4491 }
4492
4493 if (end_list) {
4494 break;
4495 }
4496 }
4497
4498 func_exit:
4499 return(matched);
4500 }
4501
4502 /*************************************************************//**
4503 This function checks whether words in result documents are close to
4504 each other (within proximity range as specified by "distance").
4505 If "distance" is ULINT_MAX, then it will find all combinations of
4506 positions of matching words and store min and max positions
4507 in the "qualified_pos" for later verification.
4508 @return true if words are close to each other, false otherwise */
4509 static
4510 bool
4511 fts_proximity_get_positions(
4512 /*========================*/
4513 fts_match_t** match, /*!< in: query instance */
4514 ulint num_match, /*!< in: number of matching
4515 items */
4516 ulint distance, /*!< in: distance value
4517 for proximity search */
4518 fts_proximity_t* qualified_pos) /*!< out: the position info
4519 records ranges containing
4520 all matching words. */
4521 {
4522 ulint i;
4523 ulint idx[MAX_PROXIMITY_ITEM];
4524 ulint num_pos[MAX_PROXIMITY_ITEM];
4525 ulint min_idx;
4526
4527 qualified_pos->n_pos = 0;
4528
4529 ut_a(num_match <= MAX_PROXIMITY_ITEM);
4530
4531 	/* Each word could appear multiple times in a doc. So
4532 	we need to walk through each word's position list and find
4533 	the closest distance between different words to see if
4534 	they are within the proximity distance. */
4535
4536 	/* Assuming each word's position list is sorted, we
4537 	just walk through all the words' lists, similar
4538 	to the merge phase of a merge sort. */
4539 for (i = 0; i < num_match; i++) {
4540 /* idx is the current position we are checking
4541 for a particular word */
4542 idx[i] = 0;
4543
4544 /* Number of positions for this word */
4545 num_pos[i] = ib_vector_size(match[i]->positions);
4546 }
4547
4548 /* Start with the first word */
4549 min_idx = 0;
4550
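	/* Illustrative walk-through (assumed positions): word A at
	{3, 17} and word B at {9, 30}. The first round looks at {3, 9}:
	the span is 6, so if distance >= 6 the pair (min=3, max=9) is
	recorded. The smallest position (A's 3) is then advanced and
	{17, 9} is considered next, and so on until the list owning the
	current minimum is exhausted. */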
4551 while (idx[min_idx] < num_pos[min_idx]) {
4552 ulint position[MAX_PROXIMITY_ITEM];
4553 ulint min_pos = ULINT_MAX;
4554 ulint max_pos = 0;
4555
4556 /* Check positions in each word position list, and
4557 record the max/min position */
4558 for (i = 0; i < num_match; i++) {
4559 position[i] = *(ulint*) ib_vector_get_const(
4560 match[i]->positions, idx[i]);
4561
4562 if (position[i] == ULINT_UNDEFINED) {
4563 break;
4564 }
4565
4566 if (position[i] < min_pos) {
4567 min_pos = position[i];
4568 min_idx = i;
4569 }
4570
4571 if (position[i] > max_pos) {
4572 max_pos = position[i];
4573 }
4574 }
4575
4576 		/* If the max and min positions are within range, we
4577 		have found a good match */
4578 if (max_pos - min_pos <= distance
4579 && (i >= num_match || position[i] != ULINT_UNDEFINED)) {
4580 			/* The charset may use a variable-length
4581 			character encoding; record min_pos and
4582 			max_pos, as we will need to verify the
4583 			actual number of characters later */
4584 qualified_pos->min_pos.push_back(min_pos);
4585 qualified_pos->max_pos.push_back(max_pos);
4586 qualified_pos->n_pos++;
4587 }
4588
4589 		/* Move on to the next position in the list
4590 		for the word with the smallest position */
4591 idx[min_idx]++;
4592 }
4593
4594 return(qualified_pos->n_pos != 0);
4595 }
4596