1 /*****************************************************************************
2
3 Copyright (c) 2007, 2020, Oracle and/or its affiliates. All Rights Reserved.
4
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License, version 2.0,
7 as published by the Free Software Foundation.
8
9 This program is also distributed with certain software (including
10 but not limited to OpenSSL) that is licensed under separate terms,
11 as designated in a particular file or component or in included license
12 documentation. The authors of MySQL hereby grant you an additional
13 permission to link the program and your derivative works with the
14 separately licensed software that they have included with MySQL.
15
16 This program is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License, version 2.0, for more details.
20
21 You should have received a copy of the GNU General Public License along with
22 this program; if not, write to the Free Software Foundation, Inc.,
23 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
24
25 *****************************************************************************/
26
27 /**************************************************//**
28 @file fts/fts0que.cc
29 Full Text Search functionality.
30
31 Created 2007/03/27 Sunny Bains
32 Completed 2011/7/10 Sunny and Jimmy Yang
33 *******************************************************/
34
35 #include "dict0dict.h" /* dict_table_get_n_rows() */
36 #include "ut0rbt.h"
37 #include "row0sel.h"
38 #include "fts0fts.h"
39 #include "fts0priv.h"
40 #include "fts0ast.h"
41 #include "fts0pars.h"
42 #include "fts0types.h"
43 #include "ha_prototypes.h"
44 #include <ctype.h>
45
46 #ifndef UNIV_NONINL
47 #include "fts0types.ic"
48 #include "fts0vlc.ic"
49 #endif
50
51 #include <vector>
52
53 #define FTS_ELEM(t, n, i, j) (t[(i) * n + (j)])
54
55 #define RANK_DOWNGRADE (-1.0F)
56 #define RANK_UPGRADE (1.0F)
57
58 /* Maximum number of words supported in a phrase or proximity search. */
59 #define MAX_PROXIMITY_ITEM 128
60
61 /* Memory used by rbt itself for create and node add */
62 #define SIZEOF_RBT_CREATE sizeof(ib_rbt_t) + sizeof(ib_rbt_node_t) * 2
63 #define SIZEOF_RBT_NODE_ADD sizeof(ib_rbt_node_t)
64
65 /*Initial byte length for 'words' in fts_ranking_t */
66 #define RANKING_WORDS_INIT_LEN 4
67
68 // FIXME: Need to have a generic iterator that traverses the ilist.
69
70 typedef std::vector<fts_string_t> word_vector_t;
71
72 struct fts_word_freq_t;
73
/** State of an FTS query. Owns (via its heap and rb trees) all
intermediate state built while evaluating one full-text search. */
struct fts_query_t {
	mem_heap_t*	heap;		/*!< Heap to use for allocations */

	trx_t*		trx;		/*!< The query transaction */

	dict_index_t*	index;		/*!< The FTS index to search */

	/*!< FTS auxiliary common table def */
	fts_table_t	fts_common_table;

	fts_table_t	fts_index_table;/*!< FTS auxiliary index table def */

	ulint		total_size;	/*!< total memory size used by query,
					maintained by the add/remove helpers */

	fts_doc_ids_t*	deleted;	/*!< Deleted doc ids that need to be
					filtered from the output */

	fts_ast_node_t*	root;		/*!< Abstract syntax tree */

	fts_ast_node_t* cur_node;	/*!< Current tree node */

	ib_rbt_t*	word_map;	/*!< Matched word map for
					searching by word; maps a word to
					its position (see word_vector) */

	word_vector_t*	word_vector;	/*!< Matched word vector for
					searching by index; append-only
					mirror of word_map */

	ib_rbt_t*       doc_ids;	/*!< The current set of matching
					doc ids, elements are of
					type fts_ranking_t */

	ib_rbt_t*	intersection;	/*!< The doc ids that were found in
					doc_ids, this tree will become
					the new doc_ids, elements are of type
					fts_ranking_t */

	/*!< Prepared statement to read the
	nodes from the FTS INDEX */
	que_t*		read_nodes_graph;

	fts_ast_oper_t	oper;		/*!< Current boolean mode operator */

	/*!< TRUE if we want to collect the
	word positions within the document */
	ibool		collect_positions;

	ulint		flags;		/*!< Specify the full text search type,
					such as boolean search, phrase
					search, proximity search etc. */

	ulint		distance;	/*!< The proximity distance of a
					phrase search. */

	/*!< These doc ids are used as a
	boundary condition when searching the
	FTS index rows */

	doc_id_t	lower_doc_id;	/*!< Lowest doc id in doc_ids */

	doc_id_t	upper_doc_id;	/*!< Highest doc id in doc_ids */

	bool		boolean_mode;	/*!< TRUE if boolean mode query */

	ib_vector_t*	matched;	/*!< Array of matching documents
					(fts_match_t) to search for a phrase */

	ib_vector_t**	match_array;	/*!< Used for proximity search, contains
					position info for each matched word
					in the word list */

	ib_uint64_t	total_docs;	/*!< The total number of documents */

	ulint		total_words;	/*!< The total number of words */

	dberr_t		error;		/*!< Error code if any, that is
					encountered during query processing */

	ib_rbt_t*	word_freqs;	/*!< RB tree of word frequencies per
					document, its elements are of type
					fts_word_freq_t */

	bool		multi_exist;	/*!< multiple FTS_EXIST oper */

	ulint		nested_exp_count; /*!< number of nested sub expression
					limit */
};
159
/** For phrase matching, first we collect the documents and the positions
then we match. */
struct fts_match_t {
	doc_id_t	doc_id;		/*!< Document id */

	ulint		start;		/*!< Start the phrase match from
					this offset within the positions
					vector. */

	ib_vector_t*	positions;	/*!< Offsets of a word in a
					document */
};
172
/** For matching tokens in a phrase search. We use this data structure in
the callback that determines whether a document should be accepted or
rejected for a phrase search. */
struct fts_select_t {
	doc_id_t	doc_id;		/*!< The document id to match */

	ulint		min_pos;	/*!< For found to be TRUE at least
					one position must be greater than
					min_pos. */

	ibool		found;		/*!< TRUE if found */

	fts_word_freq_t*
			word_freq;	/*!< Word frequency instance of the
					current word being looked up in
					the FTS index */
};
190
191 typedef std::vector<ulint> pos_vector_t;
192
/** Structure that defines a set of ranges for the original documents,
each of which has a minimum position and maximum position. Text in such a
range should contain all words in the proximity search. We will need to
count the words in such a range to make sure it is less than the specified
distance of the proximity search. */
struct fts_proximity_t {
	ulint		n_pos;		/*!< number of position set, defines
					a range (min to max) containing all
					matching words */
	pos_vector_t	min_pos;	/*!< the minimum position (in bytes)
					of the range */
	pos_vector_t	max_pos;	/*!< the maximum position (in bytes)
					of the range */
};
207
/** The match positions and tokens to match */
struct fts_phrase_t {
	ibool		found;		/*!< Match result */

	const fts_match_t*
			match;		/*!< Positions within text */

	const ib_vector_t*
			tokens;		/*!< Tokens to match */

	ulint		distance;	/*!< For matching on proximity
					distance. Can be 0 for exact match */
	CHARSET_INFO*	charset;	/*!< Phrase match charset */
	mem_heap_t*	heap;		/*!< Heap for word processing */
	ulint		zip_size;	/*!< row zip size */
	fts_proximity_t*proximity_pos;	/*!< position info for proximity
					search verification. Records the min
					and max position of words matched */
};
227
/** For storing the frequency of a word/term in a document */
struct fts_doc_freq_t {
	doc_id_t	doc_id;		/*!< Document id */
	ulint		freq;		/*!< Frequency of a word in a document */
};
233
/** To determine the word frequency per document. */
struct fts_word_freq_t {
	fts_string_t	word;		/*!< Word for which we need the freq,
					it's allocated on the query heap */

	ib_rbt_t*	doc_freqs;	/*!< RB Tree for storing per document
					word frequencies. The elements are
					of type fts_doc_freq_t */
	ib_uint64_t	doc_count;	/*!< Total number of documents that
					contain this word */
	double		idf;		/*!< Inverse document frequency */
};
246
247 /********************************************************************
248 Callback function to fetch the rows in an FTS INDEX record.
249 @return always TRUE */
250 static
251 ibool
252 fts_query_index_fetch_nodes(
253 /*========================*/
254 void* row, /*!< in: sel_node_t* */
255 void* user_arg); /*!< in: pointer to ib_vector_t */
256
257 /********************************************************************
258 Read and filter nodes.
259 @return fts_node_t instance */
260 static
261 dberr_t
262 fts_query_filter_doc_ids(
263 /*=====================*/
264 fts_query_t* query, /*!< in: query instance */
265 const fts_string_t* word, /*!< in: the current word */
266 fts_word_freq_t* word_freq, /*!< in/out: word frequency */
267 const fts_node_t* node, /*!< in: current FTS node */
268 void* data, /*!< in: doc id ilist */
269 ulint len, /*!< in: doc id ilist size */
270 ibool calc_doc_count);/*!< in: whether to remember doc
271 count */
272
273 #if 0
274 /*****************************************************************//***
275 Find a doc_id in a word's ilist.
276 @return TRUE if found. */
277 static
278 ibool
279 fts_query_find_doc_id(
280 /*==================*/
281 fts_select_t* select, /*!< in/out: search the doc id selected,
282 update the frequency if found. */
283 void* data, /*!< in: doc id ilist */
284 ulint len); /*!< in: doc id ilist size */
285 #endif
286
287 /*************************************************************//**
288 This function implements a simple "blind" query expansion search:
289 words in documents found in the first search pass will be used as
290 search arguments to search the document again, thus "expand"
291 the search result set.
292 @return DB_SUCCESS if success, otherwise the error code */
293 static
294 dberr_t
295 fts_expand_query(
296 /*=============*/
297 dict_index_t* index, /*!< in: FTS index to search */
298 fts_query_t* query) /*!< in: query result, to be freed
299 by the client */
300 MY_ATTRIBUTE((nonnull, warn_unused_result));
301 /*************************************************************//**
302 This function finds documents that contain all words in a
303 phrase or proximity search. And if proximity search, verify
304 the words are close enough to each other, as in specified distance.
305 This function is called for phrase and proximity search.
306 @return TRUE if documents are found, FALSE if otherwise */
307 static
308 ibool
309 fts_phrase_or_proximity_search(
310 /*===========================*/
311 fts_query_t* query, /*!< in/out: query instance
312 query->doc_ids might be instantiated
313 with qualified doc IDs */
314 ib_vector_t* tokens); /*!< in: Tokens contain words */
315 /*************************************************************//**
316 This function checks whether words in result documents are close to
317 each other (within proximity range as specified by "distance").
318 If "distance" is MAX_ULINT, then it will find all combinations of
319 positions of matching words and store min and max positions
320 in the "qualified_pos" for later verification.
321 @return true if words are close to each other, false if otherwise */
322 static
323 bool
324 fts_proximity_get_positions(
325 /*========================*/
326 fts_match_t** match, /*!< in: query instance */
327 ulint num_match, /*!< in: number of matching
328 items */
329 ulint distance, /*!< in: distance value
330 for proximity search */
331 fts_proximity_t* qualified_pos); /*!< out: the position info
332 records ranges containing
333 all matching words. */
334 #if 0
335 /********************************************************************
336 Get the total number of words in a documents. */
337 static
338 ulint
339 fts_query_terms_in_document(
340 /*========================*/
341 /*!< out: DB_SUCCESS if all go well
342 else error code */
343 fts_query_t* query, /*!< in: FTS query state */
344 doc_id_t doc_id, /*!< in: the word to check */
345 ulint* total); /*!< out: total words in document */
346 #endif
347
348 /********************************************************************
349 Compare two fts_doc_freq_t doc_ids.
350 @return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
351 UNIV_INLINE
352 int
fts_freq_doc_id_cmp(const void * p1,const void * p2)353 fts_freq_doc_id_cmp(
354 /*================*/
355 const void* p1, /*!< in: id1 */
356 const void* p2) /*!< in: id2 */
357 {
358 const fts_doc_freq_t* fq1 = (const fts_doc_freq_t*) p1;
359 const fts_doc_freq_t* fq2 = (const fts_doc_freq_t*) p2;
360
361 return((int) (fq1->doc_id - fq2->doc_id));
362 }
363
364 #if 0
365 /*******************************************************************//**
366 Print the table used for calculating LCS. */
367 static
368 void
369 fts_print_lcs_table(
370 /*================*/
371 const ulint* table, /*!< in: array to print */
372 ulint n_rows, /*!< in: total no. of rows */
373 ulint n_cols) /*!< in: total no. of cols */
374 {
375 ulint i;
376
377 for (i = 0; i < n_rows; ++i) {
378 ulint j;
379
380 printf("\n");
381
382 for (j = 0; j < n_cols; ++j) {
383
384 printf("%2lu ", FTS_ELEM(table, n_cols, i, j));
385 }
386 }
387 }
388
389 /********************************************************************
390 Find the longest common subsequence between the query string and
391 the document. */
392 static
393 ulint
394 fts_query_lcs(
395 /*==========*/
396 /*!< out: LCS (length) between
397 two ilists */
398 const ulint* p1, /*!< in: word positions of query */
399 ulint len_p1, /*!< in: no. of elements in p1 */
400 const ulint* p2, /*!< in: word positions within document */
401 ulint len_p2) /*!< in: no. of elements in p2 */
402 {
403 int i;
404 ulint len = 0;
405 ulint r = len_p1;
406 ulint c = len_p2;
407 ulint size = (r + 1) * (c + 1) * sizeof(ulint);
408 ulint* table = (ulint*) ut_malloc(size);
409
410 /* Traverse the table backwards, from the last row to the first and
411 also from the last column to the first. We compute the smaller
412 common subsequeces first, then use the caluclated values to determine
413 the longest common subsequence. The result will be in TABLE[0][0]. */
414 for (i = r; i >= 0; --i) {
415 int j;
416
417 for (j = c; j >= 0; --j) {
418
419 if (p1[i] == (ulint) -1 || p2[j] == (ulint) -1) {
420
421 FTS_ELEM(table, c, i, j) = 0;
422
423 } else if (p1[i] == p2[j]) {
424
425 FTS_ELEM(table, c, i, j) = FTS_ELEM(
426 table, c, i + 1, j + 1) + 1;
427
428 } else {
429
430 ulint value;
431
432 value = ut_max(
433 FTS_ELEM(table, c, i + 1, j),
434 FTS_ELEM(table, c, i, j + 1));
435
436 FTS_ELEM(table, c, i, j) = value;
437 }
438 }
439 }
440
441 len = FTS_ELEM(table, c, 0, 0);
442
443 fts_print_lcs_table(table, r, c);
444 printf("\nLen=%lu\n", len);
445
446 ut_free(table);
447
448 return(len);
449 }
450 #endif
451
452 /*******************************************************************//**
453 Compare two fts_ranking_t instance on their rank value and doc ids in
454 descending order on the rank and ascending order on doc id.
455 @return 0 if p1 == p2, < 0 if p1 < p2, > 0 if p1 > p2 */
456 static
457 int
fts_query_compare_rank(const void * p1,const void * p2)458 fts_query_compare_rank(
459 /*===================*/
460 const void* p1, /*!< in: pointer to elem */
461 const void* p2) /*!< in: pointer to elem */
462 {
463 const fts_ranking_t* r1 = (const fts_ranking_t*) p1;
464 const fts_ranking_t* r2 = (const fts_ranking_t*) p2;
465
466 if (r2->rank < r1->rank) {
467 return(-1);
468 } else if (r2->rank == r1->rank) {
469
470 if (r1->doc_id < r2->doc_id) {
471 return(1);
472 } else if (r1->doc_id > r2->doc_id) {
473 return(1);
474 }
475
476 return(0);
477 }
478
479 return(1);
480 }
481
482 #ifdef FTS_UTF8_DEBUG
483 /*******************************************************************//**
484 Convert string to lowercase.
485 @return lower case string, callers responsibility to delete using
486 ut_free() */
487 static
488 byte*
fts_tolower(const byte * src,ulint len)489 fts_tolower(
490 /*========*/
491 const byte* src, /*!< in: src string */
492 ulint len) /*!< in: src string length */
493 {
494 fts_string_t str;
495 byte* lc_str = ut_malloc(len + 1);
496
497 str.f_len = len;
498 str.f_str = lc_str;
499
500 memcpy(str.f_str, src, len);
501
502 /* Make sure the last byte is NUL terminated */
503 str.f_str[len] = '\0';
504
505 fts_utf8_tolower(&str);
506
507 return(lc_str);
508 }
509
510 /*******************************************************************//**
511 Do a case insensitive search. Doesn't check for NUL byte end marker
512 only relies on len. Convert str2 to lower case before comparing.
513 @return 0 if p1 == p2, < 0 if p1 < p2, > 0 if p1 > p2 */
514 static
515 int
fts_utf8_strcmp(const fts_string_t * str1,fts_string_t * str2)516 fts_utf8_strcmp(
517 /*============*/
518 const fts_string_t*
519 str1, /*!< in: should be lower case*/
520
521 fts_string_t* str2) /*!< in: any case. We will use the length
522 of this string during compare as it
523 should be the min of the two strings */
524 {
525 byte b = str2->f_str[str2->f_len];
526
527 ut_a(str2->f_len <= str1->f_len);
528
529 /* We need to write a NUL byte at the end of the string because the
530 string is converted to lowercase by a MySQL function which doesn't
531 care about the length. */
532 str2->f_str[str2->f_len] = 0;
533
534 fts_utf8_tolower(str2);
535
536 /* Restore the value we replaced above. */
537 str2->f_str[str2->f_len] = b;
538
539 return(memcmp(str1->f_str, str2->f_str, str2->f_len));
540 }
541 #endif
542
543 /*******************************************************************//**
544 Create words in ranking */
545 static
546 void
fts_ranking_words_create(fts_query_t * query,fts_ranking_t * ranking)547 fts_ranking_words_create(
548 /*=====================*/
549 fts_query_t* query, /*!< in: query instance */
550 fts_ranking_t* ranking) /*!< in: ranking instance */
551 {
552 ranking->words = static_cast<byte*>(
553 mem_heap_zalloc(query->heap, RANKING_WORDS_INIT_LEN));
554 ranking->words_len = RANKING_WORDS_INIT_LEN;
555 }
556
/*
The optimization here is to use a char array (a bitmap) instead of the
words rb tree in fts_ranking_t.

This saves a lot of memory, except in some cases of QUERY EXPANSION.

'word_map' is used as a word dictionary: the key is a word and the value
is a number. In 'fts_ranking_words_add' we first check whether the word is
in 'word_map'. If not, we add it to 'word_map' and assign it a position
(actually a number). Then we set the corresponding bit to '1' at that
position in the char array 'words'.

'word_vector' is a useful backup of 'word_map': with it we can look a word
up by its position more quickly than by searching 'word_map' by value. We
use 'word_vector' in 'fts_query_calculate_ranking' and 'fts_expand_query'.
In those two functions we scan the bitmap 'words'; whenever a bit is '1'
we fetch the word at that position and then look up its word_freq by the
word.
*/
574
/*******************************************************************//**
Add a word into a ranking: assign the word a stable position via
query->word_map / query->word_vector, then set the corresponding bit in
the ranking's words bitmap, growing the bitmap if needed. */
static
void
fts_ranking_words_add(
/*==================*/
	fts_query_t*		query,		/*!< in: query instance */
	fts_ranking_t*		ranking,	/*!< in: ranking instance */
	const fts_string_t*	word)		/*!< in: term/word to add */
{
	ulint		pos;
	ulint		byte_offset;
	ulint		bit_offset;
	ib_rbt_bound_t	parent;

	/* Note: we suppose the word map and vector are append-only,
	so positions handed out earlier stay valid. */
	ut_ad(query->word_vector->size() == rbt_size(query->word_map));

	/* We use ib_rbt to simulate a map; f_n_char stores the word's
	position (index into word_vector). */
	if (rbt_search(query->word_map, &parent, word) == 0) {
		fts_string_t*	result_word;

		result_word = rbt_value(fts_string_t, parent.last);
		pos = result_word->f_n_char;
		ut_ad(pos < rbt_size(query->word_map));
	} else {
		/* Add the word to the map: copy it onto the query heap,
		NUL-terminated, and record its new position. */
		fts_string_t	new_word;

		pos = rbt_size(query->word_map);

		new_word.f_str = static_cast<byte*>(mem_heap_alloc(query->heap,
			word->f_len + 1));
		memcpy(new_word.f_str, word->f_str, word->f_len);
		new_word.f_str[word->f_len] = 0;
		new_word.f_len = word->f_len;
		new_word.f_n_char = pos;

		/* rbt_search() above primed 'parent' for this insert. */
		rbt_add_node(query->word_map, &parent, &new_word);
		ut_ad(rbt_validate(query->word_map));
		query->word_vector->push_back(new_word);
	}

	/* Grow the bitmap if the bit for 'pos' does not fit yet. */
	byte_offset = pos / CHAR_BIT;
	if (byte_offset >= ranking->words_len) {
		byte*	words = ranking->words;
		ulint	words_len = ranking->words_len;

		/* Double until large enough. The old bitmap stays on the
		query heap and is reclaimed when the heap is freed. */
		while (byte_offset >= words_len) {
			words_len *= 2;
		}

		ranking->words = static_cast<byte*>(
			mem_heap_zalloc(query->heap, words_len));
		ut_memcpy(ranking->words, words, ranking->words_len);
		ranking->words_len = words_len;
	}

	/* Set the bit that marks this word as present in the ranking. */
	ut_ad(byte_offset < ranking->words_len);
	bit_offset = pos % CHAR_BIT;
	ranking->words[byte_offset] |= 1 << bit_offset;
}
639
640 /*******************************************************************//**
641 Get a word from a ranking
642 @return true if it's successful */
643 static
644 bool
fts_ranking_words_get_next(const fts_query_t * query,fts_ranking_t * ranking,ulint * pos,fts_string_t * word)645 fts_ranking_words_get_next(
646 /*=======================*/
647 const fts_query_t* query, /*!< in: query instance */
648 fts_ranking_t* ranking,/*!< in: ranking instance */
649 ulint* pos, /*!< in/out: word start pos */
650 fts_string_t* word) /*!< in/out: term/word to add */
651 {
652 bool ret = false;
653 ulint max_pos = ranking->words_len * CHAR_BIT;
654
655 /* Search for next word */
656 while (*pos < max_pos) {
657 ulint byte_offset = *pos / CHAR_BIT;
658 ulint bit_offset = *pos % CHAR_BIT;
659
660 if (ranking->words[byte_offset] & (1 << bit_offset)) {
661 ret = true;
662 break;
663 }
664
665 *pos += 1;
666 };
667
668 /* Get next word from word vector */
669 if (ret) {
670 ut_ad(*pos < query->word_vector->size());
671 *word = query->word_vector->at((size_t)*pos);
672 *pos += 1;
673 }
674
675 return ret;
676 }
677
678 /*******************************************************************//**
679 Add a word if it doesn't exist, to the term freq RB tree. We store
680 a pointer to the word that is passed in as the argument.
681 @return pointer to word */
682 static
683 fts_word_freq_t*
fts_query_add_word_freq(fts_query_t * query,const fts_string_t * word)684 fts_query_add_word_freq(
685 /*====================*/
686 fts_query_t* query, /*!< in: query instance */
687 const fts_string_t* word) /*!< in: term/word to add */
688 {
689 ib_rbt_bound_t parent;
690
691 /* Lookup the word in our rb tree and add if it doesn't exist. */
692 if (rbt_search(query->word_freqs, &parent, word) != 0) {
693 fts_word_freq_t word_freq;
694
695 memset(&word_freq, 0, sizeof(word_freq));
696
697 word_freq.word.f_str = static_cast<byte*>(
698 mem_heap_alloc(query->heap, word->f_len + 1));
699 memcpy(word_freq.word.f_str, word->f_str, word->f_len);
700 word_freq.word.f_str[word->f_len] = 0;
701 word_freq.word.f_len = word->f_len;
702
703 word_freq.doc_count = 0;
704
705 word_freq.doc_freqs = rbt_create(
706 sizeof(fts_doc_freq_t), fts_freq_doc_id_cmp);
707
708 parent.last = rbt_add_node(
709 query->word_freqs, &parent, &word_freq);
710
711 query->total_size += word->f_len
712 + SIZEOF_RBT_CREATE
713 + SIZEOF_RBT_NODE_ADD
714 + sizeof(fts_word_freq_t);
715 }
716
717 return(rbt_value(fts_word_freq_t, parent.last));
718 }
719
720 /*******************************************************************//**
721 Add a doc id if it doesn't exist, to the doc freq RB tree.
722 @return pointer to word */
723 static
724 fts_doc_freq_t*
fts_query_add_doc_freq(fts_query_t * query,ib_rbt_t * doc_freqs,doc_id_t doc_id)725 fts_query_add_doc_freq(
726 /*===================*/
727 fts_query_t* query, /*!< in: query instance */
728 ib_rbt_t* doc_freqs, /*!< in: rb tree of fts_doc_freq_t */
729 doc_id_t doc_id) /*!< in: doc id to add */
730 {
731 ib_rbt_bound_t parent;
732
733 /* Lookup the doc id in our rb tree and add if it doesn't exist. */
734 if (rbt_search(doc_freqs, &parent, &doc_id) != 0) {
735 fts_doc_freq_t doc_freq;
736
737 memset(&doc_freq, 0, sizeof(doc_freq));
738
739 doc_freq.freq = 0;
740 doc_freq.doc_id = doc_id;
741
742 parent.last = rbt_add_node(doc_freqs, &parent, &doc_freq);
743
744 query->total_size += SIZEOF_RBT_NODE_ADD
745 + sizeof(fts_doc_freq_t);
746 }
747
748 return(rbt_value(fts_doc_freq_t, parent.last));
749 }
750
751 /*******************************************************************//**
752 Add the doc id to the query set only if it's not in the
753 deleted array. */
754 static
755 void
fts_query_union_doc_id(fts_query_t * query,doc_id_t doc_id,fts_rank_t rank)756 fts_query_union_doc_id(
757 /*===================*/
758 fts_query_t* query, /*!< in: query instance */
759 doc_id_t doc_id, /*!< in: the doc id to add */
760 fts_rank_t rank) /*!< in: if non-zero, it is the
761 rank associated with the doc_id */
762 {
763 ib_rbt_bound_t parent;
764 ulint size = ib_vector_size(query->deleted->doc_ids);
765 fts_update_t* array = (fts_update_t*) query->deleted->doc_ids->data;
766
767 /* Check if the doc id is deleted and it's not already in our set. */
768 if (fts_bsearch(array, 0, static_cast<int>(size), doc_id) < 0
769 && rbt_search(query->doc_ids, &parent, &doc_id) != 0) {
770
771 fts_ranking_t ranking;
772
773 ranking.rank = rank;
774 ranking.doc_id = doc_id;
775 fts_ranking_words_create(query, &ranking);
776
777 rbt_add_node(query->doc_ids, &parent, &ranking);
778
779 query->total_size += SIZEOF_RBT_NODE_ADD
780 + sizeof(fts_ranking_t) + RANKING_WORDS_INIT_LEN;
781 }
782 }
783
784 /*******************************************************************//**
785 Remove the doc id from the query set only if it's not in the
786 deleted set. */
787 static
788 void
fts_query_remove_doc_id(fts_query_t * query,doc_id_t doc_id)789 fts_query_remove_doc_id(
790 /*====================*/
791 fts_query_t* query, /*!< in: query instance */
792 doc_id_t doc_id) /*!< in: the doc id to add */
793 {
794 ib_rbt_bound_t parent;
795 ulint size = ib_vector_size(query->deleted->doc_ids);
796 fts_update_t* array = (fts_update_t*) query->deleted->doc_ids->data;
797
798 /* Check if the doc id is deleted and it's in our set. */
799 if (fts_bsearch(array, 0, static_cast<int>(size), doc_id) < 0
800 && rbt_search(query->doc_ids, &parent, &doc_id) == 0) {
801 ut_free(rbt_remove_node(query->doc_ids, parent.last));
802
803 ut_ad(query->total_size >=
804 SIZEOF_RBT_NODE_ADD + sizeof(fts_ranking_t));
805 query->total_size -= SIZEOF_RBT_NODE_ADD
806 + sizeof(fts_ranking_t);
807 }
808 }
809
810 /*******************************************************************//**
811 Find the doc id in the query set but not in the deleted set, artificialy
812 downgrade or upgrade its ranking by a value and make/initialize its ranking
813 under or above its normal range 0 to 1. This is used for Boolean Search
814 operator such as Negation operator, which makes word's contribution to the
815 row's relevance to be negative */
816 static
817 void
fts_query_change_ranking(fts_query_t * query,doc_id_t doc_id,ibool downgrade)818 fts_query_change_ranking(
819 /*====================*/
820 fts_query_t* query, /*!< in: query instance */
821 doc_id_t doc_id, /*!< in: the doc id to add */
822 ibool downgrade) /*!< in: Whether to downgrade ranking */
823 {
824 ib_rbt_bound_t parent;
825 ulint size = ib_vector_size(query->deleted->doc_ids);
826 fts_update_t* array = (fts_update_t*) query->deleted->doc_ids->data;
827
828 /* Check if the doc id is deleted and it's in our set. */
829 if (fts_bsearch(array, 0, static_cast<int>(size), doc_id) < 0
830 && rbt_search(query->doc_ids, &parent, &doc_id) == 0) {
831
832 fts_ranking_t* ranking;
833
834 ranking = rbt_value(fts_ranking_t, parent.last);
835
836 ranking->rank += downgrade ? RANK_DOWNGRADE : RANK_UPGRADE;
837
838 /* Allow at most 2 adjustment by RANK_DOWNGRADE (-0.5)
839 and RANK_UPGRADE (0.5) */
840 if (ranking->rank >= 1.0F) {
841 ranking->rank = 1.0F;
842 } else if (ranking->rank <= -1.0F) {
843 ranking->rank = -1.0F;
844 }
845 }
846 }
847
/*******************************************************************//**
Check the doc id in the query set only if it's not in the
deleted array. The doc ids that were found are stored in
another rb tree (fts_query_t::intersection). */
static
void
fts_query_intersect_doc_id(
/*=======================*/
	fts_query_t*	query,	/*!< in: query instance */
	doc_id_t	doc_id,	/*!< in: the doc id to add */
	fts_rank_t	rank)	/*!< in: if non-zero, it is the
				rank associated with the doc_id */
{
	ib_rbt_bound_t	parent;
	ulint		size = ib_vector_size(query->deleted->doc_ids);
	fts_update_t*	array = (fts_update_t*) query->deleted->doc_ids->data;
	fts_ranking_t*	ranking= NULL;

	/* There are three types of intersect:
	   1. '+a': doc_ids is empty, add doc into intersect if it matches 'a'.
	   2. 'a +b': docs match 'a' is in doc_ids, add doc into intersect
	      if it matches 'b'. if the doc is also in  doc_ids, then change the
	      doc's rank, and add 'a' in doc's words.
	   3. '+a +b': docs matching '+a' is in doc_ids, add doc into intsersect
	      if it matches 'b' and it's in doc_ids.(multi_exist = true). */

	/* Proceed only when the doc id is not in the deleted array
	(fts_bsearch() < 0 means not found there). */
	if (fts_bsearch(array, 0, static_cast<int>(size), doc_id) < 0) {
		fts_ranking_t	new_ranking;

		if (rbt_search(query->doc_ids, &parent, &doc_id) != 0) {
			/* Not seen before: only acceptable when this is
			the first FTS_EXIST term (case 1/2 above). */
			if (query->multi_exist) {
				return;
			} else {
				new_ranking.words = NULL;
			}
		} else {
			ranking = rbt_value(fts_ranking_t, parent.last);

			/* We've just checked the doc id before: a NULL
			words pointer means the intersection tree already
			owns this doc's words bitmap. */
			if (ranking->words == NULL) {
				ut_ad(rbt_search(query->intersection, &parent,
					ranking) == 0);
				return;
			}

			/* Merge rank and clamp to [-1, 1]. */
			rank += ranking->rank;
			if (rank >= 1.0F) {
				rank = 1.0F;
			} else if (rank <= -1.0F) {
				rank = -1.0F;
			}

			/* Take over the words bitmap from doc_ids. */
			new_ranking.words = ranking->words;
			new_ranking.words_len = ranking->words_len;
		}

		new_ranking.rank = rank;
		new_ranking.doc_id = doc_id;

		if (rbt_search(query->intersection, &parent,
			       &new_ranking) != 0) {
			if (new_ranking.words == NULL) {
				fts_ranking_words_create(query, &new_ranking);

				query->total_size += RANKING_WORDS_INIT_LEN;
			} else {
				/* Note that the intersection has taken
				ownership of the ranking data: clear the
				pointer in the doc_ids entry so it is not
				used (or freed) twice. */
				ranking->words = NULL;
			}

			rbt_add_node(query->intersection,
				     &parent, &new_ranking);

			query->total_size += SIZEOF_RBT_NODE_ADD
				+ sizeof(fts_ranking_t);
		}
	}
}
930
931 /*******************************************************************//**
932 Free the document ranking rb tree. */
933 static
934 void
fts_query_free_doc_ids(fts_query_t * query,ib_rbt_t * doc_ids)935 fts_query_free_doc_ids(
936 /*===================*/
937 fts_query_t* query, /*!< in: query instance */
938 ib_rbt_t* doc_ids) /*!< in: rb tree to free */
939 {
940 const ib_rbt_node_t* node;
941
942 for (node = rbt_first(doc_ids); node; node = rbt_first(doc_ids)) {
943
944 fts_ranking_t* ranking;
945
946 ranking = rbt_value(fts_ranking_t, node);
947
948 if (ranking->words) {
949 ranking->words = NULL;
950 }
951
952 ut_free(rbt_remove_node(doc_ids, node));
953
954 ut_ad(query->total_size >=
955 SIZEOF_RBT_NODE_ADD + sizeof(fts_ranking_t));
956 query->total_size -= SIZEOF_RBT_NODE_ADD
957 + sizeof(fts_ranking_t);
958 }
959
960 rbt_free(doc_ids);
961
962 ut_ad(query->total_size >= SIZEOF_RBT_CREATE);
963 query->total_size -= SIZEOF_RBT_CREATE;
964 }
965
966 /**
967 Free the query intersection
968 @param[in] query query instance */
969 static
970 void
fts_query_free_intersection(fts_query_t * query)971 fts_query_free_intersection(
972 fts_query_t* query)
973 {
974 fts_query_free_doc_ids(query, query->intersection);
975 query->intersection = NULL;
976 }
977
978 /*******************************************************************//**
979 Add the word to the documents "list" of matching words from
980 the query. We make a copy of the word from the query heap. */
981 static
982 void
fts_query_add_word_to_document(fts_query_t * query,doc_id_t doc_id,const fts_string_t * word)983 fts_query_add_word_to_document(
984 /*===========================*/
985 fts_query_t* query, /*!< in: query to update */
986 doc_id_t doc_id, /*!< in: the document to update */
987 const fts_string_t* word) /*!< in: the token to add */
988 {
989 ib_rbt_bound_t parent;
990 fts_ranking_t* ranking = NULL;
991
992 if (query->flags == FTS_OPT_RANKING) {
993 return;
994 }
995
996 /* First we search the intersection RB tree as it could have
997 taken ownership of the words rb tree instance. */
998 if (query->intersection
999 && rbt_search(query->intersection, &parent, &doc_id) == 0) {
1000
1001 ranking = rbt_value(fts_ranking_t, parent.last);
1002 }
1003
1004 if (ranking == NULL
1005 && rbt_search(query->doc_ids, &parent, &doc_id) == 0) {
1006
1007 ranking = rbt_value(fts_ranking_t, parent.last);
1008 }
1009
1010 if (ranking != NULL) {
1011 fts_ranking_words_add(query, ranking, word);
1012 }
1013 }
1014
1015 /*******************************************************************//**
1016 Check the node ilist. */
1017 static
1018 void
fts_query_check_node(fts_query_t * query,const fts_string_t * token,const fts_node_t * node)1019 fts_query_check_node(
1020 /*=================*/
1021 fts_query_t* query, /*!< in: query to update */
1022 const fts_string_t* token, /*!< in: the token to search */
1023 const fts_node_t* node) /*!< in: node to check */
1024 {
1025 /* Skip nodes whose doc ids are out range. */
1026 if (query->oper == FTS_EXIST
1027 && ((query->upper_doc_id > 0
1028 && node->first_doc_id > query->upper_doc_id)
1029 || (query->lower_doc_id > 0
1030 && node->last_doc_id < query->lower_doc_id))) {
1031
1032 /* Ignore */
1033
1034 } else {
1035 int ret;
1036 ib_rbt_bound_t parent;
1037 ulint ilist_size = node->ilist_size;
1038 fts_word_freq_t*word_freqs;
1039
1040 /* The word must exist. */
1041 ret = rbt_search(query->word_freqs, &parent, token);
1042 ut_a(ret == 0);
1043
1044 word_freqs = rbt_value(fts_word_freq_t, parent.last);
1045
1046 query->error = fts_query_filter_doc_ids(
1047 query, token, word_freqs, node,
1048 node->ilist, ilist_size, TRUE);
1049 }
1050 }
1051
1052 /*****************************************************************//**
1053 Search index cache for word with wildcard match.
1054 @return number of words matched */
1055 static
1056 ulint
fts_cache_find_wildcard(fts_query_t * query,const fts_index_cache_t * index_cache,const fts_string_t * token)1057 fts_cache_find_wildcard(
1058 /*====================*/
1059 fts_query_t* query, /*!< in: query instance */
1060 const fts_index_cache_t*index_cache, /*!< in: cache to search */
1061 const fts_string_t* token) /*!< in: token to search */
1062 {
1063 ib_rbt_bound_t parent;
1064 const ib_vector_t* nodes = NULL;
1065 fts_string_t srch_text;
1066 byte term[FTS_MAX_WORD_LEN + 1];
1067 ulint num_word = 0;
1068
1069 srch_text.f_len = (token->f_str[token->f_len - 1] == '%')
1070 ? token->f_len - 1
1071 : token->f_len;
1072
1073 strncpy((char*) term, (char*) token->f_str, srch_text.f_len);
1074 term[srch_text.f_len] = '\0';
1075 srch_text.f_str = term;
1076
1077 /* Lookup the word in the rb tree */
1078 if (rbt_search_cmp(index_cache->words, &parent, &srch_text, NULL,
1079 innobase_fts_text_cmp_prefix) == 0) {
1080 const fts_tokenizer_word_t* word;
1081 ulint i;
1082 const ib_rbt_node_t* cur_node;
1083 ibool forward = FALSE;
1084
1085 word = rbt_value(fts_tokenizer_word_t, parent.last);
1086 cur_node = parent.last;
1087
1088 while (innobase_fts_text_cmp_prefix(
1089 index_cache->charset, &srch_text, &word->text) == 0) {
1090
1091 nodes = word->nodes;
1092
1093 for (i = 0; nodes && i < ib_vector_size(nodes); ++i) {
1094 int ret;
1095 const fts_node_t* node;
1096 ib_rbt_bound_t freq_parent;
1097 fts_word_freq_t* word_freqs;
1098
1099 node = static_cast<const fts_node_t*>(
1100 ib_vector_get_const(nodes, i));
1101
1102 ret = rbt_search(query->word_freqs,
1103 &freq_parent,
1104 &srch_text);
1105
1106 ut_a(ret == 0);
1107
1108 word_freqs = rbt_value(
1109 fts_word_freq_t,
1110 freq_parent.last);
1111
1112 query->error = fts_query_filter_doc_ids(
1113 query, &srch_text,
1114 word_freqs, node,
1115 node->ilist, node->ilist_size, TRUE);
1116
1117 if (query->error != DB_SUCCESS) {
1118 return(0);
1119 }
1120 }
1121
1122 num_word++;
1123
1124 if (!forward) {
1125 cur_node = rbt_prev(
1126 index_cache->words, cur_node);
1127 } else {
1128 cont_search:
1129 cur_node = rbt_next(
1130 index_cache->words, cur_node);
1131 }
1132
1133 if (!cur_node) {
1134 break;
1135 }
1136
1137 word = rbt_value(fts_tokenizer_word_t, cur_node);
1138 }
1139
1140 if (!forward) {
1141 forward = TRUE;
1142 cur_node = parent.last;
1143 goto cont_search;
1144 }
1145 }
1146
1147 return(num_word);
1148 }
1149
1150 /*****************************************************************//**
1151 Set difference.
1152 @return DB_SUCCESS if all go well */
1153 static MY_ATTRIBUTE((nonnull, warn_unused_result))
1154 dberr_t
fts_query_difference(fts_query_t * query,const fts_string_t * token)1155 fts_query_difference(
1156 /*=================*/
1157 fts_query_t* query, /*!< in: query instance */
1158 const fts_string_t* token) /*!< in: token to search */
1159 {
1160 ulint n_doc_ids= 0;
1161 trx_t* trx = query->trx;
1162 dict_table_t* table = query->index->table;
1163
1164 ut_a(query->oper == FTS_IGNORE);
1165
1166 #ifdef FTS_INTERNAL_DIAG_PRINT
1167 fprintf(stderr, "DIFFERENCE: Searching: '%.*s'\n",
1168 (int) token->f_len, token->f_str);
1169 #endif
1170
1171 if (query->doc_ids) {
1172 n_doc_ids = rbt_size(query->doc_ids);
1173 }
1174
1175 /* There is nothing we can substract from an empty set. */
1176 if (query->doc_ids && !rbt_empty(query->doc_ids)) {
1177 ulint i;
1178 fts_fetch_t fetch;
1179 const ib_vector_t* nodes;
1180 const fts_index_cache_t*index_cache;
1181 que_t* graph = NULL;
1182 fts_cache_t* cache = table->fts->cache;
1183 dberr_t error;
1184
1185 rw_lock_x_lock(&cache->lock);
1186
1187 index_cache = fts_find_index_cache(cache, query->index);
1188
1189 /* Must find the index cache */
1190 ut_a(index_cache != NULL);
1191
1192 /* Search the cache for a matching word first. */
1193 if (query->cur_node->term.wildcard
1194 && query->flags != FTS_PROXIMITY
1195 && query->flags != FTS_PHRASE) {
1196 fts_cache_find_wildcard(query, index_cache, token);
1197 } else {
1198 nodes = fts_cache_find_word(index_cache, token);
1199
1200 for (i = 0; nodes && i < ib_vector_size(nodes)
1201 && query->error == DB_SUCCESS; ++i) {
1202 const fts_node_t* node;
1203
1204 node = static_cast<const fts_node_t*>(
1205 ib_vector_get_const(nodes, i));
1206
1207 fts_query_check_node(query, token, node);
1208 }
1209 }
1210
1211 rw_lock_x_unlock(&cache->lock);
1212
1213 /* error is passed by 'query->error' */
1214 if (query->error != DB_SUCCESS) {
1215 ut_ad(query->error == DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
1216 return(query->error);
1217 }
1218
1219 /* Setup the callback args for filtering and
1220 consolidating the ilist. */
1221 fetch.read_arg = query;
1222 fetch.read_record = fts_query_index_fetch_nodes;
1223
1224 error = fts_index_fetch_nodes(
1225 trx, &graph, &query->fts_index_table, token, &fetch);
1226
1227 /* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */
1228 ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS));
1229 if (error != DB_SUCCESS) {
1230 query->error = error;
1231 }
1232
1233 fts_que_graph_free(graph);
1234 }
1235
1236 /* The size can't increase. */
1237 ut_a(rbt_size(query->doc_ids) <= n_doc_ids);
1238
1239 return(query->error);
1240 }
1241
1242 /*****************************************************************//**
1243 Intersect the token doc ids with the current set.
1244 @return DB_SUCCESS if all go well */
1245 static MY_ATTRIBUTE((nonnull, warn_unused_result))
1246 dberr_t
fts_query_intersect(fts_query_t * query,const fts_string_t * token)1247 fts_query_intersect(
1248 /*================*/
1249 fts_query_t* query, /*!< in: query instance */
1250 const fts_string_t* token) /*!< in: the token to search */
1251 {
1252 trx_t* trx = query->trx;
1253 dict_table_t* table = query->index->table;
1254
1255 ut_a(query->oper == FTS_EXIST);
1256
1257 #ifdef FTS_INTERNAL_DIAG_PRINT
1258 fprintf(stderr, "INTERSECT: Searching: '%.*s'\n",
1259 (int) token->f_len, token->f_str);
1260 #endif
1261
1262 /* If the words set is not empty and multi exist is true,
1263 we know the intersection set is empty in advance. */
1264 if (!(rbt_empty(query->doc_ids) && query->multi_exist)) {
1265 ulint n_doc_ids = 0;
1266 ulint i;
1267 fts_fetch_t fetch;
1268 const ib_vector_t* nodes;
1269 const fts_index_cache_t*index_cache;
1270 que_t* graph = NULL;
1271 fts_cache_t* cache = table->fts->cache;
1272 dberr_t error;
1273
1274 ut_a(!query->intersection);
1275
1276 n_doc_ids = rbt_size(query->doc_ids);
1277
1278 /* Create the rb tree that will hold the doc ids of
1279 the intersection. */
1280 query->intersection = rbt_create(
1281 sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
1282
1283 query->total_size += SIZEOF_RBT_CREATE;
1284
1285 /* This is to avoid decompressing the ilist if the
1286 node's ilist doc ids are out of range. */
1287 if (!rbt_empty(query->doc_ids) && query->multi_exist) {
1288 const ib_rbt_node_t* node;
1289 doc_id_t* doc_id;
1290
1291 node = rbt_first(query->doc_ids);
1292 doc_id = rbt_value(doc_id_t, node);
1293 query->lower_doc_id = *doc_id;
1294
1295 node = rbt_last(query->doc_ids);
1296 doc_id = rbt_value(doc_id_t, node);
1297 query->upper_doc_id = *doc_id;
1298
1299 } else {
1300 query->lower_doc_id = 0;
1301 query->upper_doc_id = 0;
1302 }
1303
1304 /* Search the cache for a matching word first. */
1305
1306 rw_lock_x_lock(&cache->lock);
1307
1308 /* Search for the index specific cache. */
1309 index_cache = fts_find_index_cache(cache, query->index);
1310
1311 /* Must find the index cache. */
1312 ut_a(index_cache != NULL);
1313
1314 if (query->cur_node->term.wildcard) {
1315 /* Wildcard search the index cache */
1316 fts_cache_find_wildcard(query, index_cache, token);
1317 } else {
1318 nodes = fts_cache_find_word(index_cache, token);
1319
1320 for (i = 0; nodes && i < ib_vector_size(nodes)
1321 && query->error == DB_SUCCESS; ++i) {
1322 const fts_node_t* node;
1323
1324 node = static_cast<const fts_node_t*>(
1325 ib_vector_get_const(nodes, i));
1326
1327 fts_query_check_node(query, token, node);
1328 }
1329 }
1330
1331 rw_lock_x_unlock(&cache->lock);
1332
1333 /* error is passed by 'query->error' */
1334 if (query->error != DB_SUCCESS) {
1335 ut_ad(query->error == DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
1336 fts_query_free_intersection(query);
1337 return(query->error);
1338 }
1339
1340 /* Setup the callback args for filtering and
1341 consolidating the ilist. */
1342 fetch.read_arg = query;
1343 fetch.read_record = fts_query_index_fetch_nodes;
1344
1345 error = fts_index_fetch_nodes(
1346 trx, &graph, &query->fts_index_table, token, &fetch);
1347
1348 /* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */
1349 ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS));
1350 if (error != DB_SUCCESS) {
1351 query->error = error;
1352 }
1353
1354 fts_que_graph_free(graph);
1355
1356 if (query->error == DB_SUCCESS) {
1357 /* Make the intesection (rb tree) the current doc id
1358 set and free the old set. */
1359 fts_query_free_doc_ids(query, query->doc_ids);
1360 query->doc_ids = query->intersection;
1361 query->intersection = NULL;
1362
1363 ut_a(!query->multi_exist || (query->multi_exist
1364 && rbt_size(query->doc_ids) <= n_doc_ids));
1365 } else if (query->intersection != NULL) {
1366 fts_query_free_intersection(query);
1367 }
1368 }
1369
1370 return(query->error);
1371 }
1372
1373 /*****************************************************************//**
1374 Query index cache.
1375 @return DB_SUCCESS if all go well */
1376 static
1377 dberr_t
fts_query_cache(fts_query_t * query,const fts_string_t * token)1378 fts_query_cache(
1379 /*============*/
1380 fts_query_t* query, /*!< in/out: query instance */
1381 const fts_string_t* token) /*!< in: token to search */
1382 {
1383 const fts_index_cache_t*index_cache;
1384 dict_table_t* table = query->index->table;
1385 fts_cache_t* cache = table->fts->cache;
1386
1387 /* Search the cache for a matching word first. */
1388 rw_lock_x_lock(&cache->lock);
1389
1390 /* Search for the index specific cache. */
1391 index_cache = fts_find_index_cache(cache, query->index);
1392
1393 /* Must find the index cache. */
1394 ut_a(index_cache != NULL);
1395
1396 if (query->cur_node->term.wildcard
1397 && query->flags != FTS_PROXIMITY
1398 && query->flags != FTS_PHRASE) {
1399 /* Wildcard search the index cache */
1400 fts_cache_find_wildcard(query, index_cache, token);
1401 } else {
1402 const ib_vector_t* nodes;
1403 ulint i;
1404
1405 nodes = fts_cache_find_word(index_cache, token);
1406
1407 for (i = 0; nodes && i < ib_vector_size(nodes)
1408 && query->error == DB_SUCCESS; ++i) {
1409 const fts_node_t* node;
1410
1411 node = static_cast<const fts_node_t*>(
1412 ib_vector_get_const(nodes, i));
1413
1414 fts_query_check_node(query, token, node);
1415 }
1416 }
1417
1418 rw_lock_x_unlock(&cache->lock);
1419
1420 return(query->error);
1421 }
1422
1423 /*****************************************************************//**
1424 Set union.
1425 @return DB_SUCCESS if all go well */
1426 static MY_ATTRIBUTE((nonnull, warn_unused_result))
1427 dberr_t
fts_query_union(fts_query_t * query,fts_string_t * token)1428 fts_query_union(
1429 /*============*/
1430 fts_query_t* query, /*!< in: query instance */
1431 fts_string_t* token) /*!< in: token to search */
1432 {
1433 fts_fetch_t fetch;
1434 ulint n_doc_ids = 0;
1435 trx_t* trx = query->trx;
1436 que_t* graph = NULL;
1437 dberr_t error;
1438
1439 ut_a(query->oper == FTS_NONE || query->oper == FTS_DECR_RATING ||
1440 query->oper == FTS_NEGATE || query->oper == FTS_INCR_RATING);
1441
1442 #ifdef FTS_INTERNAL_DIAG_PRINT
1443 fprintf(stderr, "UNION: Searching: '%.*s'\n",
1444 (int) token->f_len, token->f_str);
1445 #endif
1446
1447 if (query->doc_ids) {
1448 n_doc_ids = rbt_size(query->doc_ids);
1449 }
1450
1451 if (token->f_len == 0) {
1452 return(query->error);
1453 }
1454
1455 /* Single '%' would confuse parser in pars_like_rebind(). In addition,
1456 our wildcard search only supports prefix search */
1457 ut_ad(*token->f_str != '%');
1458
1459 fts_query_cache(query, token);
1460
1461 /* Setup the callback args for filtering and
1462 consolidating the ilist. */
1463 fetch.read_arg = query;
1464 fetch.read_record = fts_query_index_fetch_nodes;
1465
1466 /* Read the nodes from disk. */
1467 error = fts_index_fetch_nodes(
1468 trx, &graph, &query->fts_index_table, token, &fetch);
1469
1470 /* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */
1471 ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS));
1472 if (error != DB_SUCCESS) {
1473 query->error = error;
1474 }
1475
1476 fts_que_graph_free(graph);
1477
1478 if (query->error == DB_SUCCESS) {
1479
1480 /* The size can't decrease. */
1481 ut_a(rbt_size(query->doc_ids) >= n_doc_ids);
1482
1483 /* Calulate the number of doc ids that were added to
1484 the current doc id set. */
1485 if (query->doc_ids) {
1486 n_doc_ids = rbt_size(query->doc_ids) - n_doc_ids;
1487 }
1488 }
1489
1490 return(query->error);
1491 }
1492
1493 /*****************************************************************//**
1494 Depending upon the current query operator process the doc id.
1495 return DB_SUCCESS if all go well
1496 or return DB_FTS_EXCEED_RESULT_CACHE_LIMIT */
1497 static
1498 dberr_t
fts_query_process_doc_id(fts_query_t * query,doc_id_t doc_id,fts_rank_t rank)1499 fts_query_process_doc_id(
1500 /*=====================*/
1501 fts_query_t* query, /*!< in: query instance */
1502 doc_id_t doc_id, /*!< in: doc id to process */
1503 fts_rank_t rank) /*!< in: if non-zero, it is the
1504 rank associated with the doc_id */
1505 {
1506 if (query->flags == FTS_OPT_RANKING) {
1507 return(DB_SUCCESS);
1508 }
1509
1510 switch (query->oper) {
1511 case FTS_NONE:
1512 fts_query_union_doc_id(query, doc_id, rank);
1513 break;
1514
1515 case FTS_EXIST:
1516 fts_query_intersect_doc_id(query, doc_id, rank);
1517 break;
1518
1519 case FTS_IGNORE:
1520 fts_query_remove_doc_id(query, doc_id);
1521 break;
1522
1523 case FTS_NEGATE:
1524 fts_query_change_ranking(query, doc_id, TRUE);
1525 break;
1526
1527 case FTS_DECR_RATING:
1528 fts_query_union_doc_id(query, doc_id, rank);
1529 fts_query_change_ranking(query, doc_id, TRUE);
1530 break;
1531
1532 case FTS_INCR_RATING:
1533 fts_query_union_doc_id(query, doc_id, rank);
1534 fts_query_change_ranking(query, doc_id, FALSE);
1535 break;
1536
1537 default:
1538 ut_error;
1539 }
1540
1541 if (query->total_size > fts_result_cache_limit) {
1542 return(DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
1543 } else {
1544 return(DB_SUCCESS);
1545 }
1546 }
1547
1548 /*****************************************************************//**
1549 Merge two result sets. */
1550 static
1551 dberr_t
fts_merge_doc_ids(fts_query_t * query,const ib_rbt_t * doc_ids)1552 fts_merge_doc_ids(
1553 /*==============*/
1554 fts_query_t* query, /*!< in,out: query instance */
1555 const ib_rbt_t* doc_ids) /*!< in: result set to merge */
1556 {
1557 const ib_rbt_node_t* node;
1558
1559 DBUG_ENTER("fts_merge_doc_ids");
1560
1561 ut_a(!query->intersection);
1562
1563 /* To process FTS_EXIST operation (intersection), we need
1564 to create a new result set for fts_query_intersect(). */
1565 if (query->oper == FTS_EXIST) {
1566
1567 query->intersection = rbt_create(
1568 sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
1569
1570 query->total_size += SIZEOF_RBT_CREATE;
1571 }
1572
1573 /* Merge the elements to the result set. */
1574 for (node = rbt_first(doc_ids); node; node = rbt_next(doc_ids, node)) {
1575 fts_ranking_t* ranking;
1576 ulint pos = 0;
1577 fts_string_t word;
1578
1579 ranking = rbt_value(fts_ranking_t, node);
1580
1581 query->error = fts_query_process_doc_id(
1582 query, ranking->doc_id, ranking->rank);
1583
1584 if (query->error != DB_SUCCESS) {
1585 if (query->intersection != NULL)
1586 {
1587 ut_a(query->oper == FTS_EXIST);
1588 fts_query_free_intersection(query);
1589 }
1590 DBUG_RETURN(query->error);
1591 }
1592
1593 /* Merge words. Don't need to take operator into account. */
1594 ut_a(ranking->words);
1595 while (fts_ranking_words_get_next(query, ranking, &pos, &word)) {
1596 fts_query_add_word_to_document(query, ranking->doc_id,
1597 &word);
1598 }
1599 }
1600
1601 /* If it is an intersection operation, reset query->doc_ids
1602 to query->intersection and free the old result list. */
1603 if (query->oper == FTS_EXIST && query->intersection != NULL) {
1604 fts_query_free_doc_ids(query, query->doc_ids);
1605 query->doc_ids = query->intersection;
1606 query->intersection = NULL;
1607 }
1608
1609 DBUG_RETURN(DB_SUCCESS);
1610 }
1611
1612 /*****************************************************************//**
1613 Skip non-whitespace in a string. Move ptr to the next word boundary.
1614 @return pointer to first whitespace character or end */
1615 UNIV_INLINE
1616 byte*
fts_query_skip_word(byte * ptr,const byte * end)1617 fts_query_skip_word(
1618 /*================*/
1619 byte* ptr, /*!< in: start of scan */
1620 const byte* end) /*!< in: pointer to end of string */
1621 {
1622 /* TODO: Does this have to be UTF-8 too ? */
1623 while (ptr < end && !(ispunct(*ptr) || isspace(*ptr))) {
1624 ++ptr;
1625 }
1626
1627 return(ptr);
1628 }
1629
1630 /*****************************************************************//**
1631 Check whether the remaining terms in the phrase match the text.
1632 @return TRUE if matched else FALSE */
1633 static
1634 ibool
fts_query_match_phrase_terms(fts_phrase_t * phrase,byte ** start,const byte * end,mem_heap_t * heap)1635 fts_query_match_phrase_terms(
1636 /*=========================*/
1637 fts_phrase_t* phrase, /*!< in: phrase to match */
1638 byte** start, /*!< in/out: text to search, we can't
1639 make this const becase we need to
1640 first convert the string to
1641 lowercase */
1642 const byte* end, /*!< in: pointer to the end of
1643 the string to search */
1644 mem_heap_t* heap) /*!< in: heap */
1645 {
1646 ulint i;
1647 byte* ptr = *start;
1648 const ib_vector_t* tokens = phrase->tokens;
1649 ulint distance = phrase->distance;
1650
1651 /* We check only from the second term onwards, since the first
1652 must have matched otherwise we wouldn't be here. */
1653 for (i = 1; ptr < end && i < ib_vector_size(tokens); /* No op */) {
1654 fts_string_t match;
1655 fts_string_t cmp_str;
1656 const fts_string_t* token;
1657 int result;
1658 ulint ret;
1659 ulint offset;
1660
1661 ret = innobase_mysql_fts_get_token(
1662 phrase->charset, ptr, (byte*) end,
1663 &match, &offset);
1664
1665 if (match.f_len > 0) {
1666 /* Get next token to match. */
1667 token = static_cast<const fts_string_t*>(
1668 ib_vector_get_const(tokens, i));
1669
1670 fts_utf8_string_dup(&cmp_str, &match, heap);
1671
1672 result = innobase_fts_text_case_cmp(
1673 phrase->charset, token, &cmp_str);
1674
1675 /* Skip the rest of the tokens if this one doesn't
1676 match and the proximity distance is exceeded. */
1677 if (result
1678 && (distance == ULINT_UNDEFINED
1679 || distance == 0)) {
1680
1681 break;
1682 }
1683
1684 /* This token matched move to the next token. */
1685 if (result == 0) {
1686 /* Advance the text to search by the length
1687 of the last token. */
1688 ptr += ret;
1689
1690 /* Advance to the next token. */
1691 ++i;
1692 } else {
1693
1694 ut_a(distance != ULINT_UNDEFINED);
1695
1696 ptr = fts_query_skip_word(ptr, end);
1697 }
1698
1699 /* Distance can be 0 for exact matches. */
1700 if (distance != ULINT_UNDEFINED && distance > 0) {
1701 --distance;
1702 }
1703 } else {
1704 ptr += ret;
1705 }
1706 }
1707
1708 *start = ptr;
1709
1710 /* Can't be greater than the number of elements. */
1711 ut_a(i <= ib_vector_size(tokens));
1712
1713 /* This is the case for multiple words. */
1714 if (i == ib_vector_size(tokens)) {
1715 phrase->found = TRUE;
1716 }
1717
1718 return(phrase->found);
1719 }
1720
1721 /*****************************************************************//**
1722 Callback function to count the number of words in position ranges,
1723 and see whether the word count is in specified "phrase->distance"
1724 @return true if the number of characters is less than the "distance" */
1725 static
1726 bool
fts_proximity_is_word_in_range(const fts_phrase_t * phrase,byte * start,ulint total_len)1727 fts_proximity_is_word_in_range(
1728 /*===========================*/
1729 const fts_phrase_t*
1730 phrase, /*!< in: phrase with the search info */
1731 byte* start, /*!< in: text to search */
1732 ulint total_len) /*!< in: length of text */
1733 {
1734 fts_proximity_t* proximity_pos = phrase->proximity_pos;
1735
1736 ut_ad(proximity_pos->n_pos == proximity_pos->min_pos.size());
1737 ut_ad(proximity_pos->n_pos == proximity_pos->max_pos.size());
1738
1739 /* Search each matched position pair (with min and max positions)
1740 and count the number of words in the range */
1741 for (ulint i = 0; i < proximity_pos->n_pos; i++) {
1742 ulint cur_pos = proximity_pos->min_pos[i];
1743 ulint n_word = 0;
1744
1745 ut_ad(proximity_pos->max_pos[i] <= total_len);
1746
1747 /* Walk through words in the range and count them */
1748 while (cur_pos <= proximity_pos->max_pos[i]) {
1749 ulint len;
1750 fts_string_t str;
1751 ulint offset = 0;
1752
1753 len = innobase_mysql_fts_get_token(
1754 phrase->charset,
1755 start + cur_pos,
1756 start + total_len, &str, &offset);
1757
1758 if (len == 0) {
1759 break;
1760 }
1761
1762 /* Advances position with "len" bytes */
1763 cur_pos += len;
1764
1765 /* Record the number of words */
1766 if (str.f_n_char > 0) {
1767 n_word++;
1768 }
1769
1770 if (n_word > phrase->distance) {
1771 break;
1772 }
1773 }
1774
1775 /* Check if the number of words is less than specified
1776 "distance" */
1777 if (n_word && n_word <= phrase->distance) {
1778 return(true);
1779 }
1780 }
1781
1782 return(false);
1783 }
1784
1785 /*****************************************************************//**
1786 Callback function to fetch and search the document.
1787 @return TRUE if matched else FALSE */
1788 static
1789 ibool
fts_query_match_phrase(fts_phrase_t * phrase,byte * start,ulint cur_len,ulint prev_len,mem_heap_t * heap)1790 fts_query_match_phrase(
1791 /*===================*/
1792 fts_phrase_t* phrase, /*!< in: phrase to match */
1793 byte* start, /*!< in: text to search, we can't make
1794 this const becase we need to first
1795 convert the string to lowercase */
1796 ulint cur_len, /*!< in: length of text */
1797 ulint prev_len, /*!< in: total length for searched
1798 doc fields*/
1799 mem_heap_t* heap) /* heap */
1800 {
1801 ulint i;
1802 const fts_string_t* first;
1803 const byte* end = start + cur_len;
1804 const ib_vector_t* tokens = phrase->tokens;
1805 const ib_vector_t* positions = phrase->match->positions;
1806
1807 ut_a(!phrase->found);
1808 ut_a(phrase->match->doc_id > 0);
1809 ut_a(ib_vector_size(tokens) > 0);
1810 ut_a(ib_vector_size(positions) > 0);
1811
1812 first = static_cast<const fts_string_t*>(
1813 ib_vector_get_const(tokens, 0));
1814
1815 ut_a(phrase->match->start < ib_vector_size(positions));
1816
1817 for (i = phrase->match->start; i < ib_vector_size(positions); ++i) {
1818 ulint pos;
1819 fts_string_t match;
1820 fts_string_t cmp_str;
1821 byte* ptr = start;
1822 ulint ret;
1823 ulint offset;
1824
1825 pos = *(ulint*) ib_vector_get_const(positions, i);
1826
1827 if (pos == ULINT_UNDEFINED) {
1828 break;
1829 }
1830
1831 if (pos < prev_len) {
1832 continue;
1833 }
1834
1835 /* Document positions are calculated from the beginning
1836 of the first field, need to save the length for each
1837 searched field to adjust the doc position when search
1838 phrases. */
1839 pos -= prev_len;
1840 ptr = match.f_str = start + pos;
1841
1842 /* Within limits ? */
1843 if (ptr >= end) {
1844 break;
1845 }
1846
1847 ret = innobase_mysql_fts_get_token(
1848 phrase->charset, start + pos, (byte*) end,
1849 &match, &offset);
1850
1851 if (match.f_len == 0) {
1852 break;
1853 }
1854
1855 fts_utf8_string_dup(&cmp_str, &match, heap);
1856
1857 if (innobase_fts_text_case_cmp(
1858 phrase->charset, first, &cmp_str) == 0) {
1859
1860 /* This is the case for the single word
1861 in the phrase. */
1862 if (ib_vector_size(phrase->tokens) == 1) {
1863 phrase->found = TRUE;
1864 break;
1865 }
1866
1867 ptr += ret;
1868
1869 /* Match the remaining terms in the phrase. */
1870 if (fts_query_match_phrase_terms(phrase, &ptr,
1871 end, heap)) {
1872 break;
1873 }
1874 }
1875 }
1876
1877 return(phrase->found);
1878 }
1879
1880 /*****************************************************************//**
1881 Callback function to fetch and search the document.
1882 @return whether the phrase is found */
1883 static
1884 ibool
fts_query_fetch_document(void * row,void * user_arg)1885 fts_query_fetch_document(
1886 /*=====================*/
1887 void* row, /*!< in: sel_node_t* */
1888 void* user_arg) /*!< in: fts_doc_t* */
1889 {
1890
1891 que_node_t* exp;
1892 sel_node_t* node = static_cast<sel_node_t*>(row);
1893 fts_phrase_t* phrase = static_cast<fts_phrase_t*>(user_arg);
1894 ulint prev_len = 0;
1895 ulint total_len = 0;
1896 byte* document_text = NULL;
1897
1898 exp = node->select_list;
1899
1900 phrase->found = FALSE;
1901
1902 /* For proximity search, we will need to get the whole document
1903 from all fields, so first count the total length of the document
1904 from all the fields */
1905 if (phrase->proximity_pos) {
1906 while (exp) {
1907 ulint field_len;
1908 dfield_t* dfield = que_node_get_val(exp);
1909 byte* data = static_cast<byte*>(
1910 dfield_get_data(dfield));
1911
1912 if (dfield_is_ext(dfield)) {
1913 ulint local_len = dfield_get_len(dfield);
1914
1915 local_len -= BTR_EXTERN_FIELD_REF_SIZE;
1916
1917 field_len = mach_read_from_4(
1918 data + local_len + BTR_EXTERN_LEN + 4);
1919 } else {
1920 field_len = dfield_get_len(dfield);
1921 }
1922
1923 if (field_len != UNIV_SQL_NULL) {
1924 total_len += field_len + 1;
1925 }
1926
1927 exp = que_node_get_next(exp);
1928 }
1929
1930 document_text = static_cast<byte*>(mem_heap_zalloc(
1931 phrase->heap, total_len));
1932
1933 if (!document_text) {
1934 return(FALSE);
1935 }
1936 }
1937
1938 exp = node->select_list;
1939
1940 while (exp) {
1941 dfield_t* dfield = que_node_get_val(exp);
1942 byte* data = static_cast<byte*>(
1943 dfield_get_data(dfield));
1944 ulint cur_len;
1945
1946 if (dfield_is_ext(dfield)) {
1947 data = btr_copy_externally_stored_field(
1948 &cur_len, data, phrase->zip_size,
1949 dfield_get_len(dfield), phrase->heap);
1950 } else {
1951 cur_len = dfield_get_len(dfield);
1952 }
1953
1954 if (cur_len != UNIV_SQL_NULL && cur_len != 0) {
1955 if (phrase->proximity_pos) {
1956 ut_ad(prev_len + cur_len <= total_len);
1957 memcpy(document_text + prev_len, data, cur_len);
1958 } else {
1959 /* For phrase search */
1960 phrase->found =
1961 fts_query_match_phrase(
1962 phrase,
1963 static_cast<byte*>(data),
1964 cur_len, prev_len,
1965 phrase->heap);
1966 }
1967
1968 /* Document positions are calculated from the beginning
1969 of the first field, need to save the length for each
1970 searched field to adjust the doc position when search
1971 phrases. */
1972 prev_len += cur_len + 1;
1973 }
1974
1975 if (phrase->found) {
1976 break;
1977 }
1978
1979 exp = que_node_get_next(exp);
1980 }
1981
1982 if (phrase->proximity_pos) {
1983 ut_ad(prev_len <= total_len);
1984
1985 phrase->found = fts_proximity_is_word_in_range(
1986 phrase, document_text, total_len);
1987 }
1988
1989 return(phrase->found);
1990 }
1991
1992 #if 0
1993 /********************************************************************
1994 Callback function to check whether a record was found or not. */
1995 static
1996 ibool
1997 fts_query_select(
1998 /*=============*/
1999 void* row, /*!< in: sel_node_t* */
2000 void* user_arg) /*!< in: fts_doc_t* */
2001 {
2002 int i;
2003 que_node_t* exp;
2004 sel_node_t* node = row;
2005 fts_select_t* select = user_arg;
2006
2007 ut_a(select->word_freq);
2008 ut_a(select->word_freq->doc_freqs);
2009
2010 exp = node->select_list;
2011
2012 for (i = 0; exp && !select->found; ++i) {
2013 dfield_t* dfield = que_node_get_val(exp);
2014 void* data = dfield_get_data(dfield);
2015 ulint len = dfield_get_len(dfield);
2016
2017 switch (i) {
2018 case 0: /* DOC_COUNT */
2019 if (len != UNIV_SQL_NULL && len != 0) {
2020
2021 select->word_freq->doc_count +=
2022 mach_read_from_4(data);
2023 }
2024 break;
2025
2026 case 1: /* ILIST */
2027 if (len != UNIV_SQL_NULL && len != 0) {
2028
2029 fts_query_find_doc_id(select, data, len);
2030 }
2031 break;
2032
2033 default:
2034 ut_error;
2035 }
2036
2037 exp = que_node_get_next(exp);
2038 }
2039
2040 return(FALSE);
2041 }
2042
/********************************************************************
Read the rows from the FTS index, that match word and where the
doc id is between first and last doc id.
NOTE: dead code, compiled out by the enclosing #if 0.
@return DB_SUCCESS if all go well else error code */
static MY_ATTRIBUTE((nonnull, warn_unused_result))
dberr_t
fts_query_find_term(
/*================*/
	fts_query_t*		query,	/*!< in: FTS query state */
	que_t**			graph,	/*!< in: prepared statement */
	const fts_string_t*	word,	/*!< in: the word to fetch */
	doc_id_t		doc_id,	/*!< in: doc id to match */
	ulint*			min_pos,/*!< in/out: pos found must be
					greater than this minimum value. */
	ibool*			found)	/*!< out: TRUE if found else FALSE */
{
	pars_info_t*		info;
	dberr_t			error;
	fts_select_t		select;
	doc_id_t		match_doc_id;
	trx_t*			trx = query->trx;

	trx->op_info = "fetching FTS index matching nodes";

	/* Reuse the bind info of a cached statement if we have one. */
	if (*graph) {
		info = (*graph)->info;
	} else {
		info = pars_info_create();
	}

	select.found = FALSE;
	select.doc_id = doc_id;
	select.min_pos = *min_pos;
	select.word_freq = fts_query_add_word_freq(query, word->f_str);

	pars_info_bind_function(info, "my_func", fts_query_select, &select);
	pars_info_bind_varchar_literal(info, "word", word->f_str, word->f_len);

	/* Convert to "storage" byte order. */
	fts_write_doc_id((byte*) &match_doc_id, doc_id);

	/* The same doc id is bound to both range ends, so only index
	nodes whose [first_doc_id, last_doc_id] interval covers doc_id
	will match. */
	fts_bind_doc_id(info, "min_doc_id", &match_doc_id);

	fts_bind_doc_id(info, "max_doc_id", &match_doc_id);

	if (!*graph) {
		ulint	selected;

		/* The auxiliary index table is selected by the word's
		first character. */
		selected = fts_select_index(*word->f_str);

		query->fts_index_table.suffix = fts_get_suffix(selected);

		*graph = fts_parse_sql(
			&query->fts_index_table,
			info,
			"DECLARE FUNCTION my_func;\n"
			"DECLARE CURSOR c IS"
			" SELECT doc_count, ilist\n"
			" FROM \"%s\"\n"
			" WHERE word LIKE :word AND "
			"	first_doc_id <= :min_doc_id AND "
			"	last_doc_id >= :max_doc_id\n"
			" ORDER BY first_doc_id;\n"
			"BEGIN\n"
			"\n"
			"OPEN c;\n"
			"WHILE 1 = 1 LOOP\n"
			"  FETCH c INTO my_func();\n"
			"  IF c % NOTFOUND THEN\n"
			"    EXIT;\n"
			"  END IF;\n"
			"END LOOP;\n"
			"CLOSE c;");
	}

	/* Retry forever on lock wait timeout; bail out on any other
	error. */
	for(;;) {
		error = fts_eval_sql(trx, *graph);

		if (error == DB_SUCCESS) {

			break;				/* Exit the loop. */
		} else {
			ut_print_timestamp(stderr);

			if (error == DB_LOCK_WAIT_TIMEOUT) {
				fprintf(stderr, " InnoDB: Warning: lock wait "
					"timeout reading FTS index. "
					"Retrying!\n");

				trx->error_state = DB_SUCCESS;
			} else {
				/* NOTE(review): "%lu" does not match the
				dberr_t (enum) argument; harmless only while
				this code stays disabled. */
				fprintf(stderr, " InnoDB: Error: %lu "
					"while reading FTS index.\n", error);

				break;			/* Exit the loop. */
			}
		}
	}

	/* Value to return */
	*found = select.found;

	if (*found) {
		*min_pos = select.min_pos;
	}

	return(error);
}
2151
/********************************************************************
Callback aggregator for int columns.
NOTE: dead code, compiled out by the enclosing #if 0.
@return always TRUE so the fetch loop continues */
static
ibool
fts_query_sum(
/*==========*/
					/*!< out: always returns TRUE */
	void*		row,		/*!< in: sel_node_t* */
	void*		user_arg)	/*!< in: ulint* running total */
{

	que_node_t*	exp;
	sel_node_t*	node = row;
	ulint*		total = user_arg;

	exp = node->select_list;

	/* Accumulate every non-NULL 4-byte integer column of the
	fetched row into *total. */
	while (exp) {
		dfield_t*	dfield = que_node_get_val(exp);
		void*		data = dfield_get_data(dfield);
		ulint		len = dfield_get_len(dfield);

		if (len != UNIV_SQL_NULL && len != 0) {
			*total += mach_read_from_4(data);
		}

		exp = que_node_get_next(exp);
	}

	return(TRUE);
}
2183
/********************************************************************
Calculate the total documents that contain a particular word (term).
NOTE: dead code, compiled out by the enclosing #if 0.
@return DB_SUCCESS if all go well else error code */
static MY_ATTRIBUTE((nonnull, warn_unused_result))
dberr_t
fts_query_total_docs_containing_term(
/*=================================*/
	fts_query_t*		query,	/*!< in: FTS query state */
	const fts_string_t*	word,	/*!< in: the word to check */
	ulint*			total)	/*!< out: documents containing word */
{
	pars_info_t*	info;
	dberr_t		error;
	que_t*		graph;
	ulint		selected;
	trx_t*		trx = query->trx;

	trx->op_info = "fetching FTS index document count";

	*total = 0;

	info = pars_info_create();

	/* fts_query_sum() adds each fetched doc_count into *total. */
	pars_info_bind_function(info, "my_func", fts_query_sum, total);
	pars_info_bind_varchar_literal(info, "word", word->f_str, word->f_len);

	/* The auxiliary index table is selected by the word's first
	character. */
	selected = fts_select_index(*word->f_str);

	query->fts_index_table.suffix = fts_get_suffix(selected);

	/* NOTE(review): unlike the sibling statements in this file the
	table name here is not double-quoted ("\"%s\"") -- confirm before
	re-enabling this code. */
	graph = fts_parse_sql(
		&query->fts_index_table,
		info,
		"DECLARE FUNCTION my_func;\n"
		"DECLARE CURSOR c IS"
		" SELECT doc_count\n"
		" FROM %s\n"
		" WHERE word = :word "
		" ORDER BY first_doc_id;\n"
		"BEGIN\n"
		"\n"
		"OPEN c;\n"
		"WHILE 1 = 1 LOOP\n"
		"  FETCH c INTO my_func();\n"
		"  IF c % NOTFOUND THEN\n"
		"    EXIT;\n"
		"  END IF;\n"
		"END LOOP;\n"
		"CLOSE c;");

	/* Retry forever on lock wait timeout; bail out on any other
	error. */
	for(;;) {
		error = fts_eval_sql(trx, graph);

		if (error == DB_SUCCESS) {

			break;				/* Exit the loop. */
		} else {
			ut_print_timestamp(stderr);

			if (error == DB_LOCK_WAIT_TIMEOUT) {
				fprintf(stderr, " InnoDB: Warning: lock wait "
					"timeout reading FTS index. "
					"Retrying!\n");

				trx->error_state = DB_SUCCESS;
			} else {
				/* NOTE(review): "%lu" does not match the
				dberr_t (enum) argument. */
				fprintf(stderr, " InnoDB: Error: %lu "
					"while reading FTS index.\n", error);

				break;			/* Exit the loop. */
			}
		}
	}

	fts_que_graph_free(graph);

	return(error);
}
2262
/********************************************************************
Get the total number of words in a document.
NOTE: dead code, compiled out by the enclosing #if 0.
@return DB_SUCCESS if all go well else error code */
static MY_ATTRIBUTE((nonnull, warn_unused_result))
dberr_t
fts_query_terms_in_document(
/*========================*/
	fts_query_t*	query,	/*!< in: FTS query state */
	doc_id_t	doc_id,	/*!< in: the doc id to check */
	ulint*		total)	/*!< out: total words in document */
{
	pars_info_t*	info;
	dberr_t		error;
	que_t*		graph;
	doc_id_t	read_doc_id;
	trx_t*		trx = query->trx;

	trx->op_info = "fetching FTS document term count";

	*total = 0;

	info = pars_info_create();

	/* fts_query_sum() adds each fetched count into *total. */
	pars_info_bind_function(info, "my_func", fts_query_sum, total);

	/* Convert to "storage" byte order. */
	fts_write_doc_id((byte*) &read_doc_id, doc_id);
	fts_bind_doc_id(info, "doc_id", &read_doc_id);

	query->fts_index_table.suffix = "DOC_ID";

	/* NOTE(review): the cursor declaration below lacks the
	terminating ";\n" after the WHERE clause before "BEGIN"; this
	SQL would not parse if the code were re-enabled. */
	graph = fts_parse_sql(
		&query->fts_index_table,
		info,
		"DECLARE FUNCTION my_func;\n"
		"DECLARE CURSOR c IS"
		" SELECT count\n"
		" FROM \"%s\"\n"
		" WHERE doc_id = :doc_id "
		"BEGIN\n"
		"\n"
		"OPEN c;\n"
		"WHILE 1 = 1 LOOP\n"
		"  FETCH c INTO my_func();\n"
		"  IF c % NOTFOUND THEN\n"
		"    EXIT;\n"
		"  END IF;\n"
		"END LOOP;\n"
		"CLOSE c;");

	/* Retry forever on lock wait timeout; bail out on any other
	error. */
	for(;;) {
		error = fts_eval_sql(trx, graph);

		if (error == DB_SUCCESS) {

			break;				/* Exit the loop. */
		} else {
			ut_print_timestamp(stderr);

			if (error == DB_LOCK_WAIT_TIMEOUT) {
				fprintf(stderr, " InnoDB: Warning: lock wait "
					"timeout reading FTS doc id table. "
					"Retrying!\n");

				trx->error_state = DB_SUCCESS;
			} else {
				/* NOTE(review): "%lu" does not match the
				dberr_t (enum) argument. */
				fprintf(stderr, " InnoDB: Error: %lu "
					"while reading FTS doc id table.\n",
					error);

				break;			/* Exit the loop. */
			}
		}
	}

	fts_que_graph_free(graph);

	return(error);
}
2342 #endif
2343
2344 /*****************************************************************//**
2345 Retrieve the document and match the phrase tokens.
2346 @return DB_SUCCESS or error code */
2347 static MY_ATTRIBUTE((nonnull, warn_unused_result))
2348 dberr_t
fts_query_match_document(ib_vector_t * tokens,fts_get_doc_t * get_doc,fts_match_t * match,ulint distance,ibool * found)2349 fts_query_match_document(
2350 /*=====================*/
2351 ib_vector_t* tokens, /*!< in: phrase tokens */
2352 fts_get_doc_t* get_doc, /*!< in: table and prepared statements */
2353 fts_match_t* match, /*!< in: doc id and positions */
2354 ulint distance, /*!< in: proximity distance */
2355 ibool* found) /*!< out: TRUE if phrase found */
2356 {
2357 dberr_t error;
2358 fts_phrase_t phrase;
2359
2360 memset(&phrase, 0x0, sizeof(phrase));
2361
2362 phrase.match = match; /* Positions to match */
2363 phrase.tokens = tokens; /* Tokens to match */
2364 phrase.distance = distance;
2365 phrase.charset = get_doc->index_cache->charset;
2366 phrase.zip_size = dict_table_zip_size(
2367 get_doc->index_cache->index->table);
2368 phrase.heap = mem_heap_create(512);
2369
2370 *found = phrase.found = FALSE;
2371
2372 error = fts_doc_fetch_by_doc_id(
2373 get_doc, match->doc_id, NULL, FTS_FETCH_DOC_BY_ID_EQUAL,
2374 fts_query_fetch_document, &phrase);
2375
2376 if (error != DB_SUCCESS) {
2377 ut_print_timestamp(stderr);
2378 fprintf(stderr, "InnoDB: Error: (%s) matching document.\n",
2379 ut_strerr(error));
2380 } else {
2381 *found = phrase.found;
2382 }
2383
2384 mem_heap_free(phrase.heap);
2385
2386 return(error);
2387 }
2388
2389 /*****************************************************************//**
2390 This function fetches the original documents and count the
2391 words in between matching words to see that is in specified distance
2392 @return DB_SUCCESS if all OK */
2393 static MY_ATTRIBUTE((nonnull, warn_unused_result))
2394 bool
fts_query_is_in_proximity_range(const fts_query_t * query,fts_match_t ** match,fts_proximity_t * qualified_pos)2395 fts_query_is_in_proximity_range(
2396 /*============================*/
2397 const fts_query_t* query, /*!< in: query instance */
2398 fts_match_t** match, /*!< in: query instance */
2399 fts_proximity_t* qualified_pos) /*!< in: position info for
2400 qualified ranges */
2401 {
2402 fts_get_doc_t get_doc;
2403 fts_cache_t* cache = query->index->table->fts->cache;
2404 dberr_t err;
2405 fts_phrase_t phrase;
2406
2407 memset(&get_doc, 0x0, sizeof(get_doc));
2408 memset(&phrase, 0x0, sizeof(phrase));
2409
2410 rw_lock_x_lock(&cache->lock);
2411 get_doc.index_cache = fts_find_index_cache(cache, query->index);
2412 rw_lock_x_unlock(&cache->lock);
2413 ut_a(get_doc.index_cache != NULL);
2414
2415 phrase.distance = query->distance;
2416 phrase.charset = get_doc.index_cache->charset;
2417 phrase.zip_size = dict_table_zip_size(
2418 get_doc.index_cache->index->table);
2419 phrase.heap = mem_heap_create(512);
2420 phrase.proximity_pos = qualified_pos;
2421 phrase.found = FALSE;
2422
2423 err = fts_doc_fetch_by_doc_id(
2424 &get_doc, match[0]->doc_id, NULL, FTS_FETCH_DOC_BY_ID_EQUAL,
2425 fts_query_fetch_document, &phrase);
2426
2427 if (err != DB_SUCCESS) {
2428 ib_logf(IB_LOG_LEVEL_ERROR,
2429 "Error: (%s) in verification phase of proximity "
2430 "search", ut_strerr(err));
2431 }
2432
2433 /* Free the prepared statement. */
2434 if (get_doc.get_document_graph) {
2435 fts_que_graph_free(get_doc.get_document_graph);
2436 get_doc.get_document_graph = NULL;
2437 }
2438
2439 mem_heap_free(phrase.heap);
2440
2441 return(err == DB_SUCCESS && phrase.found);
2442 }
2443
2444 /*****************************************************************//**
2445 Iterate over the matched document ids and search the for the
2446 actual phrase in the text.
2447 @return DB_SUCCESS if all OK */
2448 static MY_ATTRIBUTE((nonnull, warn_unused_result))
2449 dberr_t
fts_query_search_phrase(fts_query_t * query,ib_vector_t * orig_tokens,ib_vector_t * tokens)2450 fts_query_search_phrase(
2451 /*====================*/
2452 fts_query_t* query, /*!< in: query instance */
2453 ib_vector_t* orig_tokens, /*!< in: tokens to search,
2454 with any stopwords in the
2455 original phrase */
2456 ib_vector_t* tokens) /*!< in: tokens that does
2457 not include stopwords and
2458 can be used to calculate
2459 ranking */
2460 {
2461 ulint i;
2462 fts_get_doc_t get_doc;
2463 ulint n_matched;
2464 fts_cache_t* cache = query->index->table->fts->cache;
2465
2466 n_matched = ib_vector_size(query->matched);
2467
2468 /* Setup the doc retrieval infrastructure. */
2469 memset(&get_doc, 0x0, sizeof(get_doc));
2470
2471 rw_lock_x_lock(&cache->lock);
2472
2473 get_doc.index_cache = fts_find_index_cache(cache, query->index);
2474
2475 /* Must find the index cache */
2476 ut_a(get_doc.index_cache != NULL);
2477
2478 rw_lock_x_unlock(&cache->lock);
2479
2480 #ifdef FTS_INTERNAL_DIAG_PRINT
2481 ut_print_timestamp(stderr);
2482 fprintf(stderr, " Start phrase search\n");
2483 #endif
2484
2485 /* Read the document from disk and do the actual
2486 match, matching documents will be added to the current
2487 doc id set. */
2488 for (i = 0; i < n_matched && query->error == DB_SUCCESS; ++i) {
2489 fts_match_t* match;
2490 ibool found = FALSE;
2491
2492 match = static_cast<fts_match_t*>(
2493 ib_vector_get(query->matched, i));
2494
2495 /* Skip the document ids that were filtered out by
2496 an earlier pass. */
2497 if (match->doc_id != 0) {
2498
2499 query->error = fts_query_match_document(
2500 orig_tokens, &get_doc,
2501 match, query->distance, &found);
2502
2503 if (query->error == DB_SUCCESS && found) {
2504 ulint z;
2505
2506 query->error = fts_query_process_doc_id(query,
2507 match->doc_id, 0);
2508 if (query->error != DB_SUCCESS) {
2509 goto func_exit;
2510 }
2511
2512 for (z = 0; z < ib_vector_size(tokens); z++) {
2513 fts_string_t* token;
2514 token = static_cast<fts_string_t*>(
2515 ib_vector_get(tokens, z));
2516 fts_query_add_word_to_document(
2517 query, match->doc_id, token);
2518 }
2519 }
2520 }
2521 }
2522
2523 func_exit:
2524 /* Free the prepared statement. */
2525 if (get_doc.get_document_graph) {
2526 fts_que_graph_free(get_doc.get_document_graph);
2527 get_doc.get_document_graph = NULL;
2528 }
2529
2530 return(query->error);
2531 }
2532
/*****************************************************************//**
Text/Phrase search. Tokenizes the quoted phrase, fetches the index
nodes for each token, and then either verifies proximity distance or
re-reads the documents to confirm the exact phrase.
@return DB_SUCCESS or error code */
static MY_ATTRIBUTE((nonnull, warn_unused_result))
dberr_t
fts_query_phrase_search(
/*====================*/
	fts_query_t*		query,	/*!< in: query instance */
	const fts_string_t*	phrase)	/*!< in: token to search */
{
	ib_vector_t*		tokens;
	ib_vector_t*		orig_tokens;
	mem_heap_t*		heap = mem_heap_create(sizeof(fts_string_t));
	ulint			len = phrase->f_len;
	ulint			cur_pos = 0;
	ib_alloc_t*		heap_alloc;
	ulint			num_token;
	CHARSET_INFO*		charset;

	charset = query->fts_index_table.charset;

	heap_alloc = ib_heap_allocator_create(heap);

	/* "tokens" holds only the searchable (non-stopword, size-valid)
	tokens; "orig_tokens" also keeps stopwords so that the later
	phrase match checks the original word sequence. */
	tokens = ib_vector_create(heap_alloc, sizeof(fts_string_t), 4);
	orig_tokens = ib_vector_create(heap_alloc, sizeof(fts_string_t), 4);

	/* A finite positive distance means a proximity search,
	otherwise an exact phrase search. */
	if (query->distance != ULINT_UNDEFINED && query->distance > 0) {
		query->flags = FTS_PROXIMITY;
	} else {
		query->flags = FTS_PHRASE;
	}

	/* Split the phrase into tokens. */
	while (cur_pos < len) {
		fts_cache_t*	cache = query->index->table->fts->cache;
		ib_rbt_bound_t	parent;
		ulint		offset;
		ulint		cur_len;
		fts_string_t	result_str;

		cur_len = innobase_mysql_fts_get_token(
			charset,
			reinterpret_cast<const byte*>(phrase->f_str) + cur_pos,
			reinterpret_cast<const byte*>(phrase->f_str) + len,
			&result_str, &offset);

		if (cur_len == 0) {
			break;
		}

		cur_pos += cur_len;

		if (result_str.f_n_char == 0) {
			continue;
		}

		fts_string_t*	token = static_cast<fts_string_t*>(
			ib_vector_push(tokens, NULL));

		/* Copy the token out of the phrase buffer and
		NUL-terminate it; it lives on the local heap. */
		token->f_str = static_cast<byte*>(
			mem_heap_alloc(heap, result_str.f_len + 1));
		ut_memcpy(token->f_str, result_str.f_str, result_str.f_len);

		token->f_len = result_str.f_len;
		token->f_str[token->f_len] = 0;

		/* Keep the token only if it is not a stopword and its
		character count is within the configured limits. */
		if (cache->stopword_info.cached_stopword
		    && rbt_search(cache->stopword_info.cached_stopword,
			      &parent, token) != 0
		    && result_str.f_n_char >= fts_min_token_size
		    && result_str.f_n_char <= fts_max_token_size) {
			/* Add the word to the RB tree so that we can
			calculate its frequency within a document. */
			fts_query_add_word_freq(query, token);
		} else {
			ib_vector_pop(tokens);
		}

		/* we will start to store all words including stopwords
		in the "orig_tokens" vector, but skip any leading words
		that are stopwords */
		if (!ib_vector_is_empty(tokens)) {
			fts_string_t*	orig_token = static_cast<fts_string_t*>(
				ib_vector_push(orig_tokens, NULL));

			orig_token->f_str = token->f_str;
			orig_token->f_len = token->f_len;
		}
	}

	num_token = ib_vector_size(tokens);
	if (num_token > MAX_PROXIMITY_ITEM) {
		query->error = DB_FTS_TOO_MANY_WORDS_IN_PHRASE;
		goto func_exit;
	}

	ut_ad(ib_vector_size(orig_tokens) >= num_token);

	/* Ignore empty strings. */
	if (num_token > 0) {
		fts_string_t*	token;
		fts_fetch_t	fetch;
		trx_t*		trx = query->trx;
		fts_ast_oper_t	oper = query->oper;
		que_t*		graph = NULL;
		ulint		i;
		dberr_t		error;

		/* Create the vector for storing matching document ids
		and the positions of the first token of the phrase. */
		if (!query->matched) {
			ib_alloc_t*	heap_alloc;

			heap_alloc = ib_heap_allocator_create(heap);

			if (!(query->flags & FTS_PROXIMITY)
			    && !(query->flags & FTS_PHRASE)) {
				query->matched = ib_vector_create(
					heap_alloc, sizeof(fts_match_t),
					64);
			} else {
				/* Phrase/proximity matching needs a
				separate match vector per token, so that
				each token's positions are kept apart. */
				ut_a(num_token <= MAX_PROXIMITY_ITEM);
				query->match_array =
					(ib_vector_t**) mem_heap_alloc(
						heap,
						num_token *
						sizeof(query->matched));

				for (i = 0; i < num_token; i++) {
					query->match_array[i] =
					ib_vector_create(
						heap_alloc, sizeof(fts_match_t),
						64);
				}

				query->matched = query->match_array[0];
			}
		}

		/* Setup the callback args for filtering and consolidating
		the ilist. */
		fetch.read_arg = query;
		fetch.read_record = fts_query_index_fetch_nodes;

		for (i = 0; i < num_token; i++) {
			/* Search for the first word from the phrase. */
			token = static_cast<fts_string_t*>(
				ib_vector_get(tokens, i));

			/* Direct matches for this token go into its own
			per-token match vector. */
			if (query->flags & FTS_PROXIMITY
			    || query->flags & FTS_PHRASE) {
				query->matched = query->match_array[i];
			}

			error = fts_index_fetch_nodes(
				trx, &graph, &query->fts_index_table,
				token, &fetch);

			/* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */
			ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS));
			if (error != DB_SUCCESS) {
				query->error = error;
			}

			fts_que_graph_free(graph);
			graph = NULL;

			fts_query_cache(query, token);

			/* A plain (non-phrase, non-proximity) search only
			needs the first token. */
			if (!(query->flags & FTS_PHRASE)
			    && !(query->flags & FTS_PROXIMITY)) {
				break;
			}

			/* If any of the token can't be found,
			no need to continue match */
			if (ib_vector_is_empty(query->match_array[i])
			    || query->error != DB_SUCCESS) {
				goto func_exit;
			}
		}

		/* Just a single word, no need to fetch the original
		documents to do phrase matching */
		if (ib_vector_size(orig_tokens) == 1
		    && !ib_vector_is_empty(query->match_array[0])) {
			fts_match_t*	match;
			ulint		n_matched;

			n_matched = ib_vector_size(query->match_array[0]);

			for (i = 0; i < n_matched; i++) {
				match = static_cast<fts_match_t*>(
					ib_vector_get(
						query->match_array[0], i));

				query->error = fts_query_process_doc_id(
						query, match->doc_id, 0);
				if (query->error != DB_SUCCESS) {
					goto func_exit;
				}

				fts_query_add_word_to_document(
					query, match->doc_id, token);
			}
			/* Restore original operation. */
			query->oper = oper;
			goto func_exit;
		}

		/* If we are doing proximity search, verify the distance
		between all words, and check they are in specified distance. */
		if (query->flags & FTS_PROXIMITY) {
			fts_phrase_or_proximity_search(query, tokens);
		} else {
			ibool	matched;

			/* Phrase Search case:
			We filter out the doc ids that don't contain
			all the tokens in the phrase. It's cheaper to
			search the ilist than bringing the documents in
			and then doing a search through the text. Isolated
			testing shows this also helps in mitigating disruption
			of the buffer cache. */
			matched = fts_phrase_or_proximity_search(query, tokens);
			query->matched = query->match_array[0];

			/* Read the actual text in and search for the phrase. */
			if (matched) {
				ut_ad(query->error == DB_SUCCESS);
				query->error = fts_query_search_phrase(
					query, orig_tokens, tokens);
			}
		}

		/* Restore original operation. */
		query->oper = oper;

		if (query->error != DB_SUCCESS) {
			goto func_exit;
		}
	}

func_exit:
	mem_heap_free(heap);

	/* Don't need it anymore. */
	query->matched = NULL;

	return(query->error);
}
2783
2784 /*****************************************************************//**
2785 Find the word and evaluate.
2786 @return DB_SUCCESS if all go well */
2787 static MY_ATTRIBUTE((nonnull, warn_unused_result))
2788 dberr_t
fts_query_execute(fts_query_t * query,fts_string_t * token)2789 fts_query_execute(
2790 /*==============*/
2791 fts_query_t* query, /*!< in: query instance */
2792 fts_string_t* token) /*!< in: token to search */
2793 {
2794 switch (query->oper) {
2795 case FTS_NONE:
2796 case FTS_NEGATE:
2797 case FTS_INCR_RATING:
2798 case FTS_DECR_RATING:
2799 query->error = fts_query_union(query, token);
2800 break;
2801
2802 case FTS_EXIST:
2803 query->error = fts_query_intersect(query, token);
2804 break;
2805
2806 case FTS_IGNORE:
2807 query->error = fts_query_difference(query, token);
2808 break;
2809
2810 default:
2811 ut_error;
2812 }
2813
2814 return(query->error);
2815 }
2816
2817 /*****************************************************************//**
2818 Create a wildcard string. It's the responsibility of the caller to
2819 free the byte* pointer. It's allocated using ut_malloc().
2820 @return ptr to allocated memory */
2821 static
2822 byte*
fts_query_get_token(fts_ast_node_t * node,fts_string_t * token)2823 fts_query_get_token(
2824 /*================*/
2825 fts_ast_node_t* node, /*!< in: the current sub tree */
2826 fts_string_t* token) /*!< in: token to create */
2827 {
2828 ulint str_len;
2829 byte* new_ptr = NULL;
2830
2831 str_len = node->term.ptr->len;
2832
2833 ut_a(node->type == FTS_AST_TERM);
2834
2835 token->f_len = str_len;
2836 token->f_str = node->term.ptr->str;
2837
2838 if (node->term.wildcard) {
2839
2840 token->f_str = static_cast<byte*>(ut_malloc(str_len + 2));
2841 token->f_len = str_len + 1;
2842
2843 memcpy(token->f_str, node->term.ptr->str, str_len);
2844
2845 token->f_str[str_len] = '%';
2846 token->f_str[token->f_len] = 0;
2847
2848 new_ptr = token->f_str;
2849 }
2850
2851 return(new_ptr);
2852 }
2853
/*****************************************************************//**
Visit every node of the AST and evaluate it against the index,
accumulating results in the query state.
@return DB_SUCCESS or error from the evaluated node */
static
dberr_t
fts_query_visitor(
/*==============*/
	fts_ast_oper_t	oper,	/*!< in: current operator */
	fts_ast_node_t*	node,	/*!< in: The root of the current subtree*/
	void*		arg)	/*!< in: callback arg*/
{
	byte*		ptr;
	fts_string_t	token;
	fts_query_t*	query = static_cast<fts_query_t*>(arg);

	ut_a(node);
	DBUG_ENTER("fts_query_visitor");
	DBUG_PRINT("fts", ("nodetype: %s", fts_ast_node_type_get(node->type)));

	token.f_n_char = 0;
	query->oper = oper;
	query->cur_node = node;

	switch (node->type) {
	case FTS_AST_TEXT:
		/* Quoted phrase (possibly with a proximity distance). */
		token.f_str = node->text.ptr->str;
		token.f_len = node->text.ptr->len;

		if (query->oper == FTS_EXIST) {
			/* Collect the phrase result in a fresh tree;
			it replaces the running doc id set below. */
			ut_ad(query->intersection == NULL);
			query->intersection = rbt_create(
				sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);

			query->total_size += SIZEOF_RBT_CREATE;
		}

		/* Set the current proximity distance. */
		query->distance = node->text.distance;

		/* Force collection of doc ids and the positions. */
		query->collect_positions = TRUE;

		query->error = fts_query_phrase_search(query, &token);

		query->collect_positions = FALSE;

		if (query->oper == FTS_EXIST) {
			/* Swap in the intersection as the new result
			set; the previous set is discarded. */
			fts_query_free_doc_ids(query, query->doc_ids);
			query->doc_ids = query->intersection;
			query->intersection = NULL;
		}

		break;

	case FTS_AST_TERM:
		/* Single search term. */
		token.f_str = node->term.ptr->str;
		token.f_len = node->term.ptr->len;

		/* Add the word to our RB tree that will be used to
		calculate this terms per document frequency. */
		fts_query_add_word_freq(query, &token);

		/* ptr is non-NULL only for wildcard terms; it then owns
		a ut_malloc'ed copy of the term with a trailing '%'. */
		ptr = fts_query_get_token(node, &token);
		query->error = fts_query_execute(query, &token);

		if (ptr) {
			ut_free(ptr);
		}
		break;

	case FTS_AST_SUBEXP_LIST:
		/* Parenthesised sub-expression: recurse. */
		query->error = fts_ast_visit_sub_exp(node, fts_query_visitor, arg);
		break;

	default:
		ut_error;
	}

	if (query->oper == FTS_EXIST) {
		query->multi_exist = true;
	}

	DBUG_RETURN(query->error);
}
2937
/*****************************************************************//**
Process (nested) sub-expression, create a new result set to store the
sub-expression result by processing nodes under current sub-expression
list. Merge the sub-expression result with that of parent expression list.
@return DB_SUCCESS if all well */
UNIV_INTERN
dberr_t
fts_ast_visit_sub_exp(
/*==================*/
	fts_ast_node_t*		node,		/*!< in,out: current root node */
	fts_ast_callback	visitor,	/*!< in: callback function */
	void*			arg)		/*!< in,out: arg for callback */
{
	fts_ast_oper_t		cur_oper;
	fts_query_t*		query = static_cast<fts_query_t*>(arg);
	ib_rbt_t*		parent_doc_ids;
	ib_rbt_t*		subexpr_doc_ids;
	dberr_t			error = DB_SUCCESS;
	bool			will_be_ignored = false;
	bool			multi_exist;

	DBUG_ENTER("fts_ast_visit_sub_exp");

	/* A sub-expression list may contain sub-expressions, so we
	increase the sub-expression depth counter. If this counter
	reaches the threshold then we abort the search operation and
	report an error.
	NOTE(review): the counter is never decremented on the way out,
	so it effectively bounds the total number of sub-expressions in
	a query rather than the nesting depth -- confirm intended. */
	if (query->nested_exp_count > FTS_MAX_NESTED_EXP) {
		query->error = DB_FTS_TOO_MANY_NESTED_EXP;
		DBUG_RETURN(query->error);
	}
	query->nested_exp_count++;


	ut_a(node->type == FTS_AST_SUBEXP_LIST);

	/* Save the operator in effect so it can be reinstated after the
	sub-expression has been evaluated. */
	cur_oper = query->oper;

	/* Save current result set */
	parent_doc_ids = query->doc_ids;

	/* Create new result set to store the sub-expression result. We
	will merge this result set with the parent after processing. */
	query->doc_ids = rbt_create(sizeof(fts_ranking_t),
				    fts_ranking_doc_id_cmp);

	query->total_size += SIZEOF_RBT_CREATE;

	/* The sub-expression starts its own '+' (multi_exist) context. */
	multi_exist = query->multi_exist;
	query->multi_exist = false;
	/* Process nodes in current sub-expression and store its
	result set in query->doc_ids we created above. */
	error = fts_ast_visit(FTS_NONE, node, visitor,
			      arg, &will_be_ignored);

	/* Reinstate parent node state */
	query->multi_exist = multi_exist;
	query->oper = cur_oper;

	/* Merge the sub-expression result with the parent result set. */
	subexpr_doc_ids = query->doc_ids;
	query->doc_ids = parent_doc_ids;
	if (error == DB_SUCCESS) {
		error = fts_merge_doc_ids(query, subexpr_doc_ids);
	}

	/* Free current result set. Result already merged into parent. */
	fts_query_free_doc_ids(query, subexpr_doc_ids);

	DBUG_RETURN(error);
}
3009
3010 #if 0
/*****************************************************************//***
Check if the doc id exists in the ilist.
NOTE: dead code, compiled out by the enclosing #if 0.
@return TRUE if doc id found */
static
ulint
fts_query_find_doc_id(
/*==================*/
	fts_select_t*	select,	/*!< in/out: contains the doc id to
				find, we update the word freq if
				document found */
	void*		data,	/*!< in: doc id ilist */
	ulint		len)	/*!< in: doc id ilist size */
{
	byte*		ptr = data;
	doc_id_t	doc_id = 0;
	ulint		decoded = 0;

	/* Decode the ilist and search for selected doc_id. We also
	calculate the frequency of the word in the document if found.
	The ilist layout per document is a VLC-encoded doc id delta
	followed by VLC-encoded positions terminated by a 0 byte. */
	while (decoded < len && !select->found) {
		ulint		freq = 0;
		ulint		min_pos = 0;
		ulint		last_pos = 0;
		ulint		pos = fts_decode_vlc(&ptr);

		/* Add the delta. */
		doc_id += pos;

		/* Walk this document's position list, counting the
		occurrences of the word. */
		while (*ptr) {
			++freq;
			last_pos += fts_decode_vlc(&ptr);

			/* Only if min_pos is not set and the current
			term exists in a position greater than the
			min_pos of the previous term. */
			if (min_pos == 0 && last_pos > select->min_pos) {
				min_pos = last_pos;
			}
		}

		/* Skip the end of word position marker. */
		++ptr;

		/* Bytes decoded so far. */
		decoded = ptr - (byte*) data;

		/* A word may exist in the document but we only consider a
		match if it exists in a position that is greater than the
		position of the previous term. */
		if (doc_id == select->doc_id && min_pos > 0) {
			fts_doc_freq_t*	doc_freq;

			/* Add the doc id to the doc freq rb tree, if
			the doc id doesn't exist it will be created. */
			doc_freq = fts_query_add_doc_freq(
				select->word_freq->doc_freqs, doc_id);

			/* Avoid duplicating the frequency tally */
			if (doc_freq->freq == 0) {
				doc_freq->freq = freq;
			}

			select->found = TRUE;
			select->min_pos = min_pos;
		}
	}

	return(select->found);
}
3080 #endif
3081
/*****************************************************************//**
Read and filter nodes: decode an index node's doc id ilist, adding
each doc id (and optionally its word positions) to the query state.
@return DB_SUCCESS if all go well,
or return DB_FTS_EXCEED_RESULT_CACHE_LIMIT */
static
dberr_t
fts_query_filter_doc_ids(
/*=====================*/
	fts_query_t*		query,		/*!< in: query instance */
	const fts_string_t*	word,		/*!< in: the current word */
	fts_word_freq_t*	word_freq,	/*!< in/out: word frequency */
	const fts_node_t*	node,		/*!< in: current FTS node */
	void*			data,		/*!< in: doc id ilist */
	ulint			len,		/*!< in: doc id ilist size */
	ibool			calc_doc_count)	/*!< in: whether to remember doc count */
{
	byte*		ptr = static_cast<byte*>(data);
	doc_id_t	doc_id = 0;
	ulint		decoded = 0;
	ib_rbt_t*	doc_freqs = word_freq->doc_freqs;

	/* Decode the ilist and add the doc ids to the query doc_id set.
	Per document the ilist holds a VLC-encoded doc id delta followed
	by VLC-encoded position deltas terminated by a 0 byte. */
	while (decoded < len) {
		ulint		freq = 0;
		fts_doc_freq_t*	doc_freq;
		fts_match_t*	match = NULL;
		ulint		last_pos = 0;
		ulint		pos = fts_decode_vlc(&ptr);

		/* Some sanity checks. */
		if (doc_id == 0) {
			ut_a(pos == node->first_doc_id);
		}

		/* Add the delta. */
		doc_id += pos;

		if (calc_doc_count) {
			word_freq->doc_count++;
		}

		/* We simply collect the matching instances here. */
		if (query->collect_positions) {
			ib_alloc_t*	heap_alloc;

			/* Create a new fts_match_t instance. */
			match = static_cast<fts_match_t*>(
				ib_vector_push(query->matched, NULL));

			match->start = 0;
			match->doc_id = doc_id;
			heap_alloc = ib_vector_allocator(query->matched);

			/* Allocate from the same heap as the
			parent container. */
			match->positions = ib_vector_create(
				heap_alloc, sizeof(ulint), 64);

			/* Account the new vector against the result
			cache limit checked at the end. */
			query->total_size += sizeof(fts_match_t)
				+ sizeof(ib_vector_t)
				+ sizeof(ulint) * 64;
		}

		/* Unpack the positions within the document. */
		while (*ptr) {
			last_pos += fts_decode_vlc(&ptr);

			/* Collect the matching word positions, for phrase
			matching later. */
			if (query->collect_positions) {
				ib_vector_push(match->positions, &last_pos);
			}

			++freq;
		}

		/* End of list marker. */
		last_pos = (ulint) -1;

		if (query->collect_positions) {
			ut_a(match != NULL);
			ib_vector_push(match->positions, &last_pos);
		}

		/* Add the doc id to the doc freq rb tree, if the doc id
		doesn't exist it will be created. */
		doc_freq = fts_query_add_doc_freq(query, doc_freqs, doc_id);

		/* Avoid duplicating frequency tally. */
		if (doc_freq->freq == 0) {
			doc_freq->freq = freq;
		}

		/* Skip the end of word position marker. */
		++ptr;

		/* Bytes decoded so far */
		decoded = ptr - (byte*) data;

		/* We simply collect the matching documents and the
		positions here and match later. */
		if (!query->collect_positions) {
			/* We ignore error here and will check it later */
			fts_query_process_doc_id(query, doc_id, 0);

			/* Add the word to the document's matched RB tree. */
			fts_query_add_word_to_document(query, doc_id, word);
		}
	}

	/* Some sanity checks. */
	ut_a(doc_id == node->last_doc_id);

	/* Bail out if the accumulated intermediate results have grown
	past the configured result cache limit. */
	if (query->total_size > fts_result_cache_limit) {
		return(DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
	} else {
		return(DB_SUCCESS);
	}
}
3201
3202 /*****************************************************************//**
3203 Read the FTS INDEX row.
3204 @return DB_SUCCESS if all go well. */
3205 static
3206 dberr_t
fts_query_read_node(fts_query_t * query,const fts_string_t * word,que_node_t * exp)3207 fts_query_read_node(
3208 /*================*/
3209 fts_query_t* query, /*!< in: query instance */
3210 const fts_string_t* word, /*!< in: current word */
3211 que_node_t* exp) /*!< in: query graph node */
3212 {
3213 int i;
3214 int ret;
3215 fts_node_t node;
3216 ib_rbt_bound_t parent;
3217 fts_word_freq_t* word_freq;
3218 ibool skip = FALSE;
3219 fts_string_t term;
3220 byte buf[FTS_MAX_WORD_LEN + 1];
3221 dberr_t error = DB_SUCCESS;
3222
3223 ut_a(query->cur_node->type == FTS_AST_TERM ||
3224 query->cur_node->type == FTS_AST_TEXT);
3225
3226 memset(&node, 0, sizeof(node));
3227 term.f_str = buf;
3228
3229 /* Need to consider the wildcard search case, the word frequency
3230 is created on the search string not the actual word. So we need
3231 to assign the frequency on search string behalf. */
3232 if (query->cur_node->type == FTS_AST_TERM
3233 && query->cur_node->term.wildcard) {
3234 term.f_len = query->cur_node->term.ptr->len;
3235 ut_ad(FTS_MAX_WORD_LEN >= term.f_len);
3236 memcpy(term.f_str, query->cur_node->term.ptr->str, term.f_len);
3237 } else {
3238 term.f_len = word->f_len;
3239 ut_ad(FTS_MAX_WORD_LEN >= word->f_len);
3240 memcpy(term.f_str, word->f_str, word->f_len);
3241 }
3242
3243 /* Lookup the word in our rb tree, it must exist. */
3244 ret = rbt_search(query->word_freqs, &parent, &term);
3245
3246 ut_a(ret == 0);
3247
3248 word_freq = rbt_value(fts_word_freq_t, parent.last);
3249
3250 /* Start from 1 since the first column has been read by the caller.
3251 Also, we rely on the order of the columns projected, to filter
3252 out ilists that are out of range and we always want to read
3253 the doc_count irrespective of the suitablility of the row. */
3254
3255 for (i = 1; exp && !skip; exp = que_node_get_next(exp), ++i) {
3256
3257 dfield_t* dfield = que_node_get_val(exp);
3258 byte* data = static_cast<byte*>(
3259 dfield_get_data(dfield));
3260 ulint len = dfield_get_len(dfield);
3261
3262 ut_a(len != UNIV_SQL_NULL);
3263
3264 /* Note: The column numbers below must match the SELECT. */
3265
3266 switch (i) {
3267 case 1: /* DOC_COUNT */
3268 word_freq->doc_count += mach_read_from_4(data);
3269 break;
3270
3271 case 2: /* FIRST_DOC_ID */
3272 node.first_doc_id = fts_read_doc_id(data);
3273
3274 /* Skip nodes whose doc ids are out range. */
3275 if (query->oper == FTS_EXIST
3276 && query->upper_doc_id > 0
3277 && node.first_doc_id > query->upper_doc_id) {
3278 skip = TRUE;
3279 }
3280 break;
3281
3282 case 3: /* LAST_DOC_ID */
3283 node.last_doc_id = fts_read_doc_id(data);
3284
3285 /* Skip nodes whose doc ids are out range. */
3286 if (query->oper == FTS_EXIST
3287 && query->lower_doc_id > 0
3288 && node.last_doc_id < query->lower_doc_id) {
3289 skip = TRUE;
3290 }
3291 break;
3292
3293 case 4: /* ILIST */
3294
3295 error = fts_query_filter_doc_ids(
3296 query, &word_freq->word, word_freq,
3297 &node, data, len, FALSE);
3298
3299 break;
3300
3301 default:
3302 ut_error;
3303 }
3304 }
3305
3306 if (!skip) {
3307 /* Make sure all columns were read. */
3308
3309 ut_a(i == 5);
3310 }
3311
3312 return error;
3313 }
3314
3315 /*****************************************************************//**
3316 Callback function to fetch the rows in an FTS INDEX record.
3317 @return always returns TRUE */
3318 static
3319 ibool
fts_query_index_fetch_nodes(void * row,void * user_arg)3320 fts_query_index_fetch_nodes(
3321 /*========================*/
3322 void* row, /*!< in: sel_node_t* */
3323 void* user_arg) /*!< in: pointer to fts_fetch_t */
3324 {
3325 fts_string_t key;
3326 sel_node_t* sel_node = static_cast<sel_node_t*>(row);
3327 fts_fetch_t* fetch = static_cast<fts_fetch_t*>(user_arg);
3328 fts_query_t* query = static_cast<fts_query_t*>(fetch->read_arg);
3329 que_node_t* exp = sel_node->select_list;
3330 dfield_t* dfield = que_node_get_val(exp);
3331 void* data = dfield_get_data(dfield);
3332 ulint dfield_len = dfield_get_len(dfield);
3333
3334 key.f_str = static_cast<byte*>(data);
3335 key.f_len = dfield_len;
3336
3337 ut_a(dfield_len <= FTS_MAX_WORD_LEN);
3338
3339 /* Note: we pass error out by 'query->error' */
3340 query->error = fts_query_read_node(query, &key, que_node_get_next(exp));
3341
3342 if (query->error != DB_SUCCESS) {
3343 ut_ad(query->error == DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
3344 return(FALSE);
3345 } else {
3346 return(TRUE);
3347 }
3348 }
3349
3350 /*****************************************************************//**
3351 Calculate the inverse document frequency (IDF) for all the terms. */
3352 static
3353 void
fts_query_calculate_idf(fts_query_t * query)3354 fts_query_calculate_idf(
3355 /*====================*/
3356 fts_query_t* query) /*!< in: Query state */
3357 {
3358 const ib_rbt_node_t* node;
3359 ib_uint64_t total_docs = query->total_docs;
3360
3361 /* We need to free any instances of fts_doc_freq_t that we
3362 may have allocated. */
3363 for (node = rbt_first(query->word_freqs);
3364 node;
3365 node = rbt_next(query->word_freqs, node)) {
3366
3367 fts_word_freq_t* word_freq;
3368
3369 word_freq = rbt_value(fts_word_freq_t, node);
3370
3371 if (word_freq->doc_count > 0) {
3372 if (total_docs == word_freq->doc_count) {
3373 /* QP assume ranking > 0 if we find
3374 a match. Since Log10(1) = 0, we cannot
3375 make IDF a zero value if do find a
3376 word in all documents. So let's make
3377 it an arbitrary very small number */
3378 word_freq->idf = log10(1.0001);
3379 } else {
3380 word_freq->idf = log10(
3381 total_docs
3382 / (double) word_freq->doc_count);
3383 }
3384 }
3385
3386 if (fts_enable_diag_print) {
3387 fprintf(stderr,"'%s' -> " UINT64PF "/" UINT64PF
3388 " %6.5lf\n",
3389 word_freq->word.f_str,
3390 query->total_docs, word_freq->doc_count,
3391 word_freq->idf);
3392 }
3393 }
3394 }
3395
3396 /*****************************************************************//**
3397 Calculate the ranking of the document. */
3398 static
3399 void
fts_query_calculate_ranking(const fts_query_t * query,fts_ranking_t * ranking)3400 fts_query_calculate_ranking(
3401 /*========================*/
3402 const fts_query_t* query, /*!< in: query state */
3403 fts_ranking_t* ranking) /*!< in: Document to rank */
3404 {
3405 ulint pos = 0;
3406 fts_string_t word;
3407
3408 /* At this stage, ranking->rank should not exceed the 1.0
3409 bound */
3410 ut_ad(ranking->rank <= 1.0 && ranking->rank >= -1.0);
3411 ut_ad(rbt_size(query->word_map) == query->word_vector->size());
3412
3413 while (fts_ranking_words_get_next(query, ranking, &pos, &word)) {
3414 int ret;
3415 ib_rbt_bound_t parent;
3416 double weight;
3417 fts_doc_freq_t* doc_freq;
3418 fts_word_freq_t* word_freq;
3419
3420 ret = rbt_search(query->word_freqs, &parent, &word);
3421
3422 /* It must exist. */
3423 ut_a(ret == 0);
3424
3425 word_freq = rbt_value(fts_word_freq_t, parent.last);
3426
3427 ret = rbt_search(
3428 word_freq->doc_freqs, &parent, &ranking->doc_id);
3429
3430 /* It must exist. */
3431 ut_a(ret == 0);
3432
3433 doc_freq = rbt_value(fts_doc_freq_t, parent.last);
3434
3435 weight = (double) doc_freq->freq * word_freq->idf;
3436
3437 ranking->rank += (fts_rank_t) (weight * word_freq->idf);
3438 }
3439 }
3440
3441 /*****************************************************************//**
3442 Add ranking to the result set. */
3443 static
3444 void
fts_query_add_ranking(fts_query_t * query,ib_rbt_t * ranking_tree,const fts_ranking_t * new_ranking)3445 fts_query_add_ranking(
3446 /*==================*/
3447 fts_query_t* query, /*!< in: query state */
3448 ib_rbt_t* ranking_tree, /*!< in: ranking tree */
3449 const fts_ranking_t* new_ranking) /*!< in: ranking of a document */
3450 {
3451 ib_rbt_bound_t parent;
3452
3453 /* Lookup the ranking in our rb tree and add if it doesn't exist. */
3454 if (rbt_search(ranking_tree, &parent, new_ranking) == 0) {
3455 fts_ranking_t* ranking;
3456
3457 ranking = rbt_value(fts_ranking_t, parent.last);
3458
3459 ranking->rank += new_ranking->rank;
3460
3461 ut_a(ranking->words == NULL);
3462 } else {
3463 rbt_add_node(ranking_tree, &parent, new_ranking);
3464
3465 query->total_size += SIZEOF_RBT_NODE_ADD
3466 + sizeof(fts_ranking_t);
3467 }
3468 }
3469
3470 /*****************************************************************//**
3471 Retrieve the FTS Relevance Ranking result for doc with doc_id
3472 @return the relevance ranking value, 0 if no ranking value
3473 present. */
3474 float
fts_retrieve_ranking(fts_result_t * result,doc_id_t doc_id)3475 fts_retrieve_ranking(
3476 /*=================*/
3477 fts_result_t* result, /*!< in: FTS result structure */
3478 doc_id_t doc_id) /*!< in: doc_id of the item to retrieve */
3479 {
3480 ib_rbt_bound_t parent;
3481 fts_ranking_t new_ranking;
3482
3483 DBUG_ENTER("fts_retrieve_ranking");
3484
3485 if (!result || !result->rankings_by_id) {
3486 DBUG_RETURN(0);
3487 }
3488
3489 new_ranking.doc_id = doc_id;
3490
3491 /* Lookup the ranking in our rb tree */
3492 if (rbt_search(result->rankings_by_id, &parent, &new_ranking) == 0) {
3493 fts_ranking_t* ranking;
3494
3495 ranking = rbt_value(fts_ranking_t, parent.last);
3496
3497 DBUG_RETURN(ranking->rank);
3498 }
3499
3500 DBUG_RETURN(0);
3501 }
3502
3503 /*****************************************************************//**
3504 Create the result and copy the data to it. */
3505 static
3506 fts_result_t*
fts_query_prepare_result(fts_query_t * query,fts_result_t * result)3507 fts_query_prepare_result(
3508 /*=====================*/
3509 fts_query_t* query, /*!< in: Query state */
3510 fts_result_t* result) /*!< in: result this can contain
3511 data from a previous search on
3512 another FTS index */
3513 {
3514 const ib_rbt_node_t* node;
3515 bool result_is_null = false;
3516
3517 DBUG_ENTER("fts_query_prepare_result");
3518
3519 if (result == NULL) {
3520 result = static_cast<fts_result_t*>(ut_malloc(sizeof(*result)));
3521
3522 memset(result, 0x0, sizeof(*result));
3523
3524 result->rankings_by_id = rbt_create(
3525 sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
3526
3527 query->total_size += sizeof(fts_result_t) + SIZEOF_RBT_CREATE;
3528 result_is_null = true;
3529 }
3530
3531 if (query->flags == FTS_OPT_RANKING) {
3532 fts_word_freq_t* word_freq;
3533 ulint size = ib_vector_size(query->deleted->doc_ids);
3534 fts_update_t* array =
3535 (fts_update_t*) query->deleted->doc_ids->data;
3536
3537 node = rbt_first(query->word_freqs);
3538 ut_ad(node);
3539 word_freq = rbt_value(fts_word_freq_t, node);
3540
3541 for (node = rbt_first(word_freq->doc_freqs);
3542 node;
3543 node = rbt_next(word_freq->doc_freqs, node)) {
3544 fts_doc_freq_t* doc_freq;
3545 fts_ranking_t ranking;
3546
3547 doc_freq = rbt_value(fts_doc_freq_t, node);
3548
3549 /* Don't put deleted docs into result */
3550 if (fts_bsearch(array, 0, static_cast<int>(size),
3551 doc_freq->doc_id) >= 0) {
3552 /* one less matching doc count */
3553 --word_freq->doc_count;
3554 continue;
3555 }
3556
3557 ranking.doc_id = doc_freq->doc_id;
3558 ranking.rank = static_cast<fts_rank_t>(doc_freq->freq);
3559 ranking.words = NULL;
3560
3561 fts_query_add_ranking(query, result->rankings_by_id,
3562 &ranking);
3563
3564 if (query->total_size > fts_result_cache_limit) {
3565 query->error = DB_FTS_EXCEED_RESULT_CACHE_LIMIT;
3566 fts_query_free_result(result);
3567 DBUG_RETURN(NULL);
3568 }
3569 }
3570
3571 /* Calculate IDF only after we exclude the deleted items */
3572 fts_query_calculate_idf(query);
3573
3574 node = rbt_first(query->word_freqs);
3575 word_freq = rbt_value(fts_word_freq_t, node);
3576
3577 /* Calculate the ranking for each doc */
3578 for (node = rbt_first(result->rankings_by_id);
3579 node != NULL;
3580 node = rbt_next(result->rankings_by_id, node)) {
3581
3582 fts_ranking_t* ranking;
3583
3584 ranking = rbt_value(fts_ranking_t, node);
3585
3586 ranking->rank = static_cast<fts_rank_t>(
3587 ranking->rank * word_freq->idf * word_freq->idf);
3588 }
3589
3590 DBUG_RETURN(result);
3591 }
3592
3593 ut_a(rbt_size(query->doc_ids) > 0);
3594
3595 for (node = rbt_first(query->doc_ids);
3596 node;
3597 node = rbt_next(query->doc_ids, node)) {
3598
3599 fts_ranking_t* ranking;
3600
3601 ranking = rbt_value(fts_ranking_t, node);
3602 fts_query_calculate_ranking(query, ranking);
3603
3604 // FIXME: I think we may requre this information to improve the
3605 // ranking of doc ids which have more word matches from
3606 // different FTS indexes.
3607
3608 /* We don't need these anymore free the resources. */
3609 ranking->words = NULL;
3610
3611 if (!result_is_null) {
3612 fts_query_add_ranking(query, result->rankings_by_id, ranking);
3613
3614 if (query->total_size > fts_result_cache_limit) {
3615 query->error = DB_FTS_EXCEED_RESULT_CACHE_LIMIT;
3616 fts_query_free_result(result);
3617 DBUG_RETURN(NULL);
3618 }
3619 }
3620 }
3621
3622 if (result_is_null) {
3623 /* Use doc_ids directly */
3624 rbt_free(result->rankings_by_id);
3625 result->rankings_by_id = query->doc_ids;
3626 query->doc_ids = NULL;
3627 }
3628
3629 DBUG_RETURN(result);
3630 }
3631
3632 /*****************************************************************//**
3633 Get the result of the query. Calculate the similarity coefficient. */
3634 static
3635 fts_result_t*
fts_query_get_result(fts_query_t * query,fts_result_t * result)3636 fts_query_get_result(
3637 /*=================*/
3638 fts_query_t* query, /*!< in: query instance */
3639 fts_result_t* result) /*!< in: result */
3640 {
3641 DBUG_ENTER("fts_query_get_result");
3642
3643 if (rbt_size(query->doc_ids) > 0 || query->flags == FTS_OPT_RANKING) {
3644 /* Copy the doc ids to the result. */
3645 result = fts_query_prepare_result(query, result);
3646 } else {
3647 /* Create an empty result instance. */
3648 result = static_cast<fts_result_t*>(ut_malloc(sizeof(*result)));
3649 memset(result, 0, sizeof(*result));
3650 }
3651
3652 DBUG_RETURN(result);
3653 }
3654
3655 /*****************************************************************//**
3656 FTS Query free resources and reset. */
3657 static
3658 void
fts_query_free(fts_query_t * query)3659 fts_query_free(
3660 /*===========*/
3661 fts_query_t* query) /*!< in: query instance to free*/
3662 {
3663
3664 if (query->read_nodes_graph) {
3665 fts_que_graph_free(query->read_nodes_graph);
3666 }
3667
3668 if (query->root) {
3669 fts_ast_free_node(query->root);
3670 }
3671
3672 if (query->deleted) {
3673 fts_doc_ids_free(query->deleted);
3674 }
3675
3676 if (query->intersection) {
3677 fts_query_free_doc_ids(query, query->intersection);
3678 query->intersection = NULL;
3679 }
3680
3681 if (query->doc_ids) {
3682 fts_query_free_doc_ids(query, query->doc_ids);
3683 }
3684
3685 if (query->word_freqs) {
3686 const ib_rbt_node_t* node;
3687
3688 /* We need to free any instances of fts_doc_freq_t that we
3689 may have allocated. */
3690 for (node = rbt_first(query->word_freqs);
3691 node;
3692 node = rbt_next(query->word_freqs, node)) {
3693
3694 fts_word_freq_t* word_freq;
3695
3696 word_freq = rbt_value(fts_word_freq_t, node);
3697
3698 /* We need to cast away the const. */
3699 rbt_free(word_freq->doc_freqs);
3700 }
3701
3702 rbt_free(query->word_freqs);
3703 }
3704
3705 ut_a(!query->intersection);
3706
3707 if (query->word_map) {
3708 rbt_free(query->word_map);
3709 }
3710
3711 if (query->word_vector) {
3712 delete query->word_vector;
3713 }
3714
3715 if (query->heap) {
3716 mem_heap_free(query->heap);
3717 }
3718
3719 memset(query, 0, sizeof(*query));
3720 }
3721
3722 /*****************************************************************//**
3723 Parse the query using flex/bison. */
3724 static
3725 fts_ast_node_t*
fts_query_parse(fts_query_t * query,byte * query_str,ulint query_len)3726 fts_query_parse(
3727 /*============*/
3728 fts_query_t* query, /*!< in: query instance */
3729 byte* query_str, /*!< in: query string */
3730 ulint query_len) /*!< in: query string length */
3731 {
3732 int error;
3733 fts_ast_state_t state;
3734 bool mode = query->boolean_mode;
3735 DBUG_ENTER("fts_query_parse");
3736
3737 memset(&state, 0x0, sizeof(state));
3738
3739 /* Setup the scanner to use, this depends on the mode flag. */
3740 state.lexer = fts_lexer_create(mode, query_str, query_len);
3741 state.charset = query->fts_index_table.charset;
3742 error = fts_parse(&state);
3743 fts_lexer_free(state.lexer);
3744 state.lexer = NULL;
3745
3746 /* Error during parsing ? */
3747 if (error) {
3748 /* Free the nodes that were allocated during parsing. */
3749 fts_ast_state_free(&state);
3750 } else {
3751 query->root = state.root;
3752 }
3753
3754 DBUG_RETURN(state.root);
3755 }
3756
3757 /*******************************************************************//**
3758 FTS Query optimization
3759 Set FTS_OPT_RANKING if it is a simple term query */
3760 static
3761 void
fts_query_can_optimize(fts_query_t * query,uint flags)3762 fts_query_can_optimize(
3763 /*===================*/
3764 fts_query_t* query, /*!< in/out: query instance */
3765 uint flags) /*!< In: FTS search mode */
3766 {
3767 fts_ast_node_t* node = query->root;
3768
3769 if (flags & FTS_EXPAND) {
3770 return;
3771 }
3772
3773 /* Check if it has only a term without oper */
3774 ut_ad(node->type == FTS_AST_LIST);
3775 node = node->list.head;
3776 if (node != NULL && node->type == FTS_AST_TERM && node->next == NULL) {
3777 query->flags = FTS_OPT_RANKING;
3778 }
3779 }
3780
3781 /*******************************************************************//**
3782 Pre-process the query string
3783 1) make it lower case
3784 2) in boolean mode, if there is '-' or '+' that is immediately proceeded
3785 and followed by valid word, make it a space
3786 @return the processed string */
3787 static
3788 byte*
fts_query_str_preprocess(const byte * query_str,ulint query_len,ulint * result_len,CHARSET_INFO * charset,bool boolean_mode)3789 fts_query_str_preprocess(
3790 /*=====================*/
3791 const byte* query_str, /*!< in: FTS query */
3792 ulint query_len, /*!< in: FTS query string len */
3793 ulint *result_len, /*!< out: result string length */
3794 CHARSET_INFO* charset, /*!< in: string charset */
3795 bool boolean_mode) /*!< in: is boolean mode */
3796 {
3797 ulint cur_pos = 0;
3798 ulint str_len;
3799 byte* str_ptr;
3800 bool in_phrase = false;
3801
3802 /* Convert the query string to lower case before parsing. We own
3803 the ut_malloc'ed result and so remember to free it before return. */
3804
3805 str_len = query_len * charset->casedn_multiply + 1;
3806 str_ptr = static_cast<byte*>(ut_malloc(str_len));
3807
3808 /* For binary collations, a case sensitive search is
3809 performed. Hence don't convert to lower case. */
3810 if (my_binary_compare(charset)) {
3811 memcpy(str_ptr, query_str, query_len);
3812 str_ptr[query_len]= 0;
3813 *result_len= query_len;
3814 } else {
3815 *result_len = innobase_fts_casedn_str(
3816 charset, const_cast<char*>
3817 (reinterpret_cast<const char*>( query_str)),
3818 query_len,
3819 reinterpret_cast<char*>(str_ptr), str_len);
3820 }
3821
3822 ut_ad(*result_len < str_len);
3823
3824 str_ptr[*result_len] = 0;
3825
3826 /* If it is boolean mode, no need to check for '-/+' */
3827 if (!boolean_mode) {
3828 return(str_ptr);
3829 }
3830
3831 /* Otherwise, we travese the string to find any '-/+' that are
3832 immediately proceeded and followed by valid search word.
3833 NOTE: we should not do so for CJK languages, this should
3834 be taken care of in our CJK implementation */
3835 while (cur_pos < *result_len) {
3836 fts_string_t str;
3837 ulint offset;
3838 ulint cur_len;
3839
3840 cur_len = innobase_mysql_fts_get_token(
3841 charset, str_ptr + cur_pos, str_ptr + *result_len,
3842 &str, &offset);
3843
3844 if (cur_len == 0 || str.f_str == NULL) {
3845 /* No valid word found */
3846 break;
3847 }
3848
3849 /* Check if we are in a phrase, if so, no need to do
3850 replacement of '-/+'. */
3851 for (byte* ptr = str_ptr + cur_pos; ptr < str.f_str; ptr++) {
3852 if ((char) (*ptr) == '"' ) {
3853 in_phrase = !in_phrase;
3854 }
3855 }
3856
3857 /* Find those are not leading '-/+' and also not in a phrase */
3858 if (cur_pos > 0 && str.f_str - str_ptr - cur_pos == 1
3859 && !in_phrase) {
3860 char* last_op = reinterpret_cast<char*>(
3861 str_ptr + cur_pos);
3862
3863 if (*last_op == '-' || *last_op == '+') {
3864 *last_op = ' ';
3865 }
3866 }
3867
3868 cur_pos += cur_len;
3869 }
3870
3871 return(str_ptr);
3872 }
3873
3874 /*******************************************************************//**
3875 FTS Query entry point.
3876 @return DB_SUCCESS if successful otherwise error code */
3877 UNIV_INTERN
3878 dberr_t
fts_query(trx_t * trx,dict_index_t * index,uint flags,const byte * query_str,ulint query_len,fts_result_t ** result)3879 fts_query(
3880 /*======*/
3881 trx_t* trx, /*!< in: transaction */
3882 dict_index_t* index, /*!< in: The FTS index to search */
3883 uint flags, /*!< in: FTS search mode */
3884 const byte* query_str, /*!< in: FTS query */
3885 ulint query_len, /*!< in: FTS query string len
3886 in bytes */
3887 fts_result_t** result) /*!< in/out: result doc ids */
3888 {
3889 fts_query_t query;
3890 dberr_t error = DB_SUCCESS;
3891 byte* lc_query_str;
3892 ulint result_len;
3893 bool boolean_mode;
3894 trx_t* query_trx;
3895 CHARSET_INFO* charset;
3896 ulint start_time_ms;
3897 bool will_be_ignored = false;
3898
3899 boolean_mode = flags & FTS_BOOL;
3900
3901 *result = NULL;
3902 memset(&query, 0x0, sizeof(query));
3903 query_trx = trx_allocate_for_background();
3904 query_trx->op_info = "FTS query";
3905
3906 start_time_ms = ut_time_ms();
3907
3908 query.trx = query_trx;
3909 query.index = index;
3910 query.boolean_mode = boolean_mode;
3911 query.deleted = fts_doc_ids_create();
3912 query.cur_node = NULL;
3913
3914 query.fts_common_table.type = FTS_COMMON_TABLE;
3915 query.fts_common_table.table_id = index->table->id;
3916 query.fts_common_table.parent = index->table->name;
3917 query.fts_common_table.table = index->table;
3918
3919 charset = fts_index_get_charset(index);
3920
3921 query.fts_index_table.type = FTS_INDEX_TABLE;
3922 query.fts_index_table.index_id = index->id;
3923 query.fts_index_table.table_id = index->table->id;
3924 query.fts_index_table.parent = index->table->name;
3925 query.fts_index_table.charset = charset;
3926 query.fts_index_table.table = index->table;
3927
3928 query.word_map = rbt_create_arg_cmp(
3929 sizeof(fts_string_t), innobase_fts_text_cmp, charset);
3930 query.word_vector = new word_vector_t;
3931 query.error = DB_SUCCESS;
3932
3933 /* Setup the RB tree that will be used to collect per term
3934 statistics. */
3935 query.word_freqs = rbt_create_arg_cmp(
3936 sizeof(fts_word_freq_t), innobase_fts_text_cmp, charset);
3937
3938 query.total_size += SIZEOF_RBT_CREATE;
3939
3940 query.total_docs = dict_table_get_n_rows(index->table);
3941
3942 #ifdef FTS_DOC_STATS_DEBUG
3943 if (ft_enable_diag_print) {
3944 error = fts_get_total_word_count(
3945 trx, query.index, &query.total_words);
3946
3947 if (error != DB_SUCCESS) {
3948 goto func_exit;
3949 }
3950
3951 fprintf(stderr, "Total docs: " UINT64PF " Total words: %lu\n",
3952 query.total_docs, query.total_words);
3953 }
3954 #endif /* FTS_DOC_STATS_DEBUG */
3955
3956 query.fts_common_table.suffix = "DELETED";
3957
3958 /* Read the deleted doc_ids, we need these for filtering. */
3959 error = fts_table_fetch_doc_ids(
3960 NULL, &query.fts_common_table, query.deleted);
3961
3962 if (error != DB_SUCCESS) {
3963 goto func_exit;
3964 }
3965
3966 query.fts_common_table.suffix = "DELETED_CACHE";
3967
3968 error = fts_table_fetch_doc_ids(
3969 NULL, &query.fts_common_table, query.deleted);
3970
3971 if (error != DB_SUCCESS) {
3972 goto func_exit;
3973 }
3974
3975 /* Get the deleted doc ids that are in the cache. */
3976 fts_cache_append_deleted_doc_ids(
3977 index->table->fts->cache, query.deleted->doc_ids);
3978 DEBUG_SYNC_C("fts_deleted_doc_ids_append");
3979
3980 /* Sort the vector so that we can do a binary search over the ids. */
3981 ib_vector_sort(query.deleted->doc_ids, fts_update_doc_id_cmp);
3982
3983 #if 0
3984 /* Convert the query string to lower case before parsing. We own
3985 the ut_malloc'ed result and so remember to free it before return. */
3986
3987 lc_query_str_len = query_len * charset->casedn_multiply + 1;
3988 lc_query_str = static_cast<byte*>(ut_malloc(lc_query_str_len));
3989
3990 result_len = innobase_fts_casedn_str(
3991 charset, (char*) query_str, query_len,
3992 (char*) lc_query_str, lc_query_str_len);
3993
3994 ut_ad(result_len < lc_query_str_len);
3995
3996 lc_query_str[result_len] = 0;
3997
3998 #endif
3999
4000 lc_query_str = fts_query_str_preprocess(
4001 query_str, query_len, &result_len, charset, boolean_mode);
4002
4003 query.heap = mem_heap_create(128);
4004
4005 /* Create the rb tree for the doc id (current) set. */
4006 query.doc_ids = rbt_create(
4007 sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
4008
4009 query.total_size += SIZEOF_RBT_CREATE;
4010
4011 /* Parse the input query string. */
4012 if (fts_query_parse(&query, lc_query_str, result_len)) {
4013 fts_ast_node_t* ast = query.root;
4014
4015 /* Optimize query to check if it's a single term */
4016 fts_query_can_optimize(&query, flags);
4017
4018 DBUG_EXECUTE_IF("fts_instrument_result_cache_limit",
4019 fts_result_cache_limit = 2048;
4020 );
4021
4022 /* Traverse the Abstract Syntax Tree (AST) and execute
4023 the query. */
4024 query.error = fts_ast_visit(
4025 FTS_NONE, ast, fts_query_visitor,
4026 &query, &will_be_ignored);
4027
4028 /* If query expansion is requested, extend the search
4029 with first search pass result */
4030 if (query.error == DB_SUCCESS && (flags & FTS_EXPAND)) {
4031 query.error = fts_expand_query(index, &query);
4032 }
4033
4034 /* Calculate the inverse document frequency of the terms. */
4035 if (query.error == DB_SUCCESS
4036 && query.flags != FTS_OPT_RANKING) {
4037 fts_query_calculate_idf(&query);
4038 }
4039
4040 /* Copy the result from the query state, so that we can
4041 return it to the caller. */
4042 if (query.error == DB_SUCCESS) {
4043 *result = fts_query_get_result(&query, *result);
4044 }
4045
4046 error = query.error;
4047 } else {
4048 /* still return an empty result set */
4049 *result = static_cast<fts_result_t*>(
4050 ut_malloc(sizeof(**result)));
4051 memset(*result, 0, sizeof(**result));
4052 }
4053
4054 ut_free(lc_query_str);
4055
4056 if (fts_enable_diag_print && (*result)) {
4057 ulint diff_time = ut_time_ms() - start_time_ms;
4058 fprintf(stderr, "FTS Search Processing time: %ld secs:"
4059 " %ld millisec: row(s) %d \n",
4060 diff_time / 1000, diff_time % 1000,
4061 (*result)->rankings_by_id
4062 ? (int) rbt_size((*result)->rankings_by_id)
4063 : -1);
4064
4065 /* Log memory consumption & result size */
4066 ib_logf(IB_LOG_LEVEL_INFO,
4067 "Full Search Memory: "
4068 "%lu (bytes), Row: %lu .",
4069 query.total_size,
4070 (*result)->rankings_by_id
4071 ? rbt_size((*result)->rankings_by_id)
4072 : 0);
4073 }
4074
4075 func_exit:
4076 fts_query_free(&query);
4077
4078 trx_free_for_background(query_trx);
4079
4080 return(error);
4081 }
4082
4083 /*****************************************************************//**
4084 FTS Query free result, returned by fts_query(). */
4085
4086 void
fts_query_free_result(fts_result_t * result)4087 fts_query_free_result(
4088 /*==================*/
4089 fts_result_t* result) /*!< in: result instance to free.*/
4090 {
4091 if (result) {
4092 if (result->rankings_by_id != NULL) {
4093 rbt_free(result->rankings_by_id);
4094 result->rankings_by_id = NULL;
4095 }
4096 if (result->rankings_by_rank != NULL) {
4097 rbt_free(result->rankings_by_rank);
4098 result->rankings_by_rank = NULL;
4099 }
4100
4101 ut_free(result);
4102 result = NULL;
4103 }
4104 }
4105
4106 /*****************************************************************//**
4107 FTS Query sort result, returned by fts_query() on fts_ranking_t::rank. */
4108
4109 void
fts_query_sort_result_on_rank(fts_result_t * result)4110 fts_query_sort_result_on_rank(
4111 /*==========================*/
4112 fts_result_t* result) /*!< out: result instance to sort.*/
4113 {
4114 const ib_rbt_node_t* node;
4115 ib_rbt_t* ranked;
4116
4117 ut_a(result->rankings_by_id != NULL);
4118 if (result->rankings_by_rank) {
4119 rbt_free(result->rankings_by_rank);
4120 }
4121
4122 ranked = rbt_create(sizeof(fts_ranking_t), fts_query_compare_rank);
4123
4124 /* We need to free any instances of fts_doc_freq_t that we
4125 may have allocated. */
4126 for (node = rbt_first(result->rankings_by_id);
4127 node;
4128 node = rbt_next(result->rankings_by_id, node)) {
4129
4130 fts_ranking_t* ranking;
4131
4132 ranking = rbt_value(fts_ranking_t, node);
4133
4134 ut_a(ranking->words == NULL);
4135
4136 rbt_insert(ranked, ranking, ranking);
4137 }
4138
4139 /* Reset the current node too. */
4140 result->current = NULL;
4141 result->rankings_by_rank = ranked;
4142 }
4143
#ifdef UNIV_DEBUG
/*******************************************************************//**
A debug function to print result doc_id set: logs each doc id in
query->doc_ids along with the words that matched it. */
static
void
fts_print_doc_id(
/*=============*/
	fts_query_t*	query)	/*!< in : tree that stores doc_ids.*/
{
	const ib_rbt_node_t*	node;

	/* Iterate each member of the doc_id set */
	for (node = rbt_first(query->doc_ids);
	     node;
	     node = rbt_next(query->doc_ids, node)) {
		fts_ranking_t*	ranking;
		ranking = rbt_value(fts_ranking_t, node);

		/* ulint is unsigned long: use %lu, not %ld, so that
		large doc ids are not printed as negative numbers. */
		ib_logf(IB_LOG_LEVEL_INFO, "doc_ids info, doc_id: %lu \n",
			(ulint) ranking->doc_id);

		ulint		pos = 0;
		fts_string_t	word;

		while (fts_ranking_words_get_next(query, ranking, &pos, &word)) {
			ib_logf(IB_LOG_LEVEL_INFO, "doc_ids info, value: %s \n", word.f_str);
		}
	}
}
#endif
4174
4175 /*************************************************************//**
4176 This function implements a simple "blind" query expansion search:
4177 words in documents found in the first search pass will be used as
4178 search arguments to search the document again, thus "expand"
4179 the search result set.
4180 @return DB_SUCCESS if success, otherwise the error code */
4181 static MY_ATTRIBUTE((nonnull, warn_unused_result))
4182 dberr_t
fts_expand_query(dict_index_t * index,fts_query_t * query)4183 fts_expand_query(
4184 /*=============*/
4185 dict_index_t* index, /*!< in: FTS index to search */
4186 fts_query_t* query) /*!< in: FTS query instance */
4187 {
4188 const ib_rbt_node_t* node;
4189 const ib_rbt_node_t* token_node;
4190 fts_doc_t result_doc;
4191 dberr_t error = DB_SUCCESS;
4192 const fts_index_cache_t*index_cache;
4193
4194 /* If no doc is found in first search pass, return */
4195 if (!rbt_size(query->doc_ids)) {
4196 return(error);
4197 }
4198
4199 /* Init "result_doc", to hold words from the first search pass */
4200 fts_doc_init(&result_doc);
4201
4202 rw_lock_x_lock(&index->table->fts->cache->lock);
4203 index_cache = fts_find_index_cache(index->table->fts->cache, index);
4204 rw_lock_x_unlock(&index->table->fts->cache->lock);
4205
4206 ut_a(index_cache);
4207
4208 result_doc.tokens = rbt_create_arg_cmp(
4209 sizeof(fts_token_t), innobase_fts_text_cmp,
4210 index_cache->charset);
4211
4212 result_doc.charset = index_cache->charset;
4213
4214 query->total_size += SIZEOF_RBT_CREATE;
4215 #ifdef UNIV_DEBUG
4216 fts_print_doc_id(query);
4217 #endif
4218
4219 for (node = rbt_first(query->doc_ids);
4220 node;
4221 node = rbt_next(query->doc_ids, node)) {
4222
4223 fts_ranking_t* ranking;
4224 ulint pos;
4225 fts_string_t word;
4226 ulint prev_token_size;
4227 ulint estimate_size;
4228
4229 prev_token_size = rbt_size(result_doc.tokens);
4230
4231 ranking = rbt_value(fts_ranking_t, node);
4232
4233 /* Fetch the documents with the doc_id from the
4234 result of first seach pass. Since we do not
4235 store document-to-word mapping, we need to
4236 fetch the original document and parse them.
4237 Future optimization could be done here if we
4238 support some forms of document-to-word mapping */
4239 fts_doc_fetch_by_doc_id(NULL, ranking->doc_id, index,
4240 FTS_FETCH_DOC_BY_ID_EQUAL,
4241 fts_query_expansion_fetch_doc,
4242 &result_doc);
4243
4244 /* Remove words that have already been searched in the
4245 first pass */
4246 pos = 0;
4247 while (fts_ranking_words_get_next(query, ranking, &pos,
4248 &word)) {
4249 ibool ret;
4250
4251 ret = rbt_delete(result_doc.tokens, &word);
4252
4253 /* The word must exist in the doc we found */
4254 if (!ret) {
4255 ib_logf(IB_LOG_LEVEL_ERROR, "Did not "
4256 "find word %s in doc %ld for query "
4257 "expansion search.\n", word.f_str,
4258 (ulint) ranking->doc_id);
4259 }
4260 }
4261
4262 /* Estimate memory used, see fts_process_token and fts_token_t.
4263 We ignore token size here. */
4264 estimate_size = (rbt_size(result_doc.tokens) - prev_token_size)
4265 * (SIZEOF_RBT_NODE_ADD + sizeof(fts_token_t)
4266 + sizeof(ib_vector_t) + sizeof(ulint) * 32);
4267 query->total_size += estimate_size;
4268
4269 if (query->total_size > fts_result_cache_limit) {
4270 error = DB_FTS_EXCEED_RESULT_CACHE_LIMIT;
4271 goto func_exit;
4272 }
4273 }
4274
4275 /* Search the table the second time with expanded search list */
4276 for (token_node = rbt_first(result_doc.tokens);
4277 token_node;
4278 token_node = rbt_next(result_doc.tokens, token_node)) {
4279 fts_token_t* mytoken;
4280 mytoken = rbt_value(fts_token_t, token_node);
4281
4282 ut_ad(mytoken->text.f_str[mytoken->text.f_len] == 0);
4283 fts_query_add_word_freq(query, &mytoken->text);
4284 error = fts_query_union(query, &mytoken->text);
4285
4286 if (error != DB_SUCCESS) {
4287 break;
4288 }
4289 }
4290
4291 func_exit:
4292 fts_doc_free(&result_doc);
4293
4294 return(error);
4295 }
/*************************************************************//**
This function finds documents that contain all words in a
phrase or proximity search. And if proximity search, verify
the words are close enough to each other, as in specified distance.
This function is called for phrase and proximity search.
@return TRUE if documents are found, FALSE if otherwise */
static
ibool
fts_phrase_or_proximity_search(
/*===========================*/
	fts_query_t*	query,	/*!< in/out: query instance.
				query->doc_ids might be instantiated
				with qualified doc IDs */
	ib_vector_t*	tokens)	/*!< in: Tokens contain words */
{
	ulint		n_matched;
	ulint		i;
	ibool		matched = FALSE;
	ulint		num_token = ib_vector_size(tokens);
	fts_match_t*	match[MAX_PROXIMITY_ITEM];
	ibool		end_list = FALSE;

	/* Number of matched documents for the first token */
	n_matched = ib_vector_size(query->match_array[0]);

	/* We have a set of match list for each word, we shall
	walk through the list and find common documents that
	contain all the matching words. */
	for (i = 0; i < n_matched; i++) {
		ulint		j;
		ulint		k = 0;
		fts_proximity_t	qualified_pos;

		/* Anchor on the i-th document that matched the first
		token; the inner loop checks whether every other token
		also matched this doc_id. */
		match[0] = static_cast<fts_match_t*>(
			ib_vector_get(query->match_array[0], i));

		/* For remaining match list for the token(word), we
		try to see if there is a document with the same
		doc id */
		for (j = 1; j < num_token; j++) {
			match[j] = static_cast<fts_match_t*>(
				ib_vector_get(query->match_array[j], k));

			/* Advance through token j's (doc_id-ordered)
			match list until we reach or pass the anchor
			doc_id, or exhaust the list. */
			while (match[j]->doc_id < match[0]->doc_id
			       && k < ib_vector_size(query->match_array[j])) {
				match[j] = static_cast<fts_match_t*>(
					ib_vector_get(
						query->match_array[j], k));
				k++;
			}

			if (match[j]->doc_id > match[0]->doc_id) {
				/* no match */
				if (query->flags & FTS_PHRASE) {
					/* For phrase search, zeroing the
					doc_id marks this anchor doc as
					disqualified. */
					match[0]->doc_id = 0;
				}
				break;
			}

			if (k == ib_vector_size(query->match_array[j])) {
				/* Token j's list is exhausted; no later
				anchor doc can be matched by it, so the
				outer loop can stop after this round. */
				end_list = TRUE;

				if (match[j]->doc_id != match[0]->doc_id) {
					/* no match */
					if (query->flags & FTS_PHRASE) {
						ulint	s;

						match[0]->doc_id = 0;

						/* Disqualify all remaining
						anchor docs too: they can
						never contain token j. */
						for (s = i + 1; s < n_matched;
						     s++) {
							match[0] = static_cast<
							fts_match_t*>(
							ib_vector_get(
							query->match_array[0],
							s));
							match[0]->doc_id = 0;
						}
					}

					goto func_exit;
				}
			}

			/* FIXME: A better solution will be a counter array
			remember each run's last position. So we don't
			reset it here very time */
			k = 0;
		}

		/* Inner loop broke early => some token missed this
		doc; try the next anchor document. */
		if (j != num_token) {
			continue;
		}

		/* For this matching doc, we need to further
		verify whether the words in the doc are close
		to each other, and within the distance specified
		in the proximity search */
		if (query->flags & FTS_PHRASE) {
			matched = TRUE;
		} else if (fts_proximity_get_positions(
			match, num_token, ULINT_MAX, &qualified_pos)) {

			/* Fetch the original documents and count the
			words in between matching words to see that is in
			specified distance */
			if (fts_query_is_in_proximity_range(
				query, match, &qualified_pos)) {
				/* If so, mark we find a matching doc */
				query->error = fts_query_process_doc_id(
					query, match[0]->doc_id, 0);
				if (query->error != DB_SUCCESS) {
					matched = FALSE;
					goto func_exit;
				}

				matched = TRUE;
				/* Record every query token against the
				qualified document for ranking. */
				for (ulint z = 0; z < num_token; z++) {
					fts_string_t*	token;
					token = static_cast<fts_string_t*>(
						ib_vector_get(tokens, z));
					fts_query_add_word_to_document(
						query, match[0]->doc_id, token);
				}
			}
		}

		if (end_list) {
			break;
		}
	}

func_exit:
	return(matched);
}
4431
/*************************************************************//**
This function checks whether words in result documents are close to
each other (within proximity range as specified by "distance").
If "distance" is MAX_ULINT, then it will find all combinations of
positions of matching words and store min and max positions
in the "qualified_pos" for later verification.
@return true if words are close to each other, false if otherwise */
static
bool
fts_proximity_get_positions(
/*========================*/
	fts_match_t**		match,		/*!< in: query instance */
	ulint			num_match,	/*!< in: number of matching
						items */
	ulint			distance,	/*!< in: distance value
						for proximity search */
	fts_proximity_t*	qualified_pos)	/*!< out: the position info
						records ranges containing
						all matching words. */
{
	ulint	i;
	ulint	idx[MAX_PROXIMITY_ITEM];
	ulint	num_pos[MAX_PROXIMITY_ITEM];
	ulint	min_idx;

	qualified_pos->n_pos = 0;

	ut_a(num_match <= MAX_PROXIMITY_ITEM);

	/* Each word could appear multiple times in a doc. So
	we need to walk through each word's position list, and find
	closest distance between different words to see if
	they are in the proximity distance. */

	/* Assume each word's position list is sorted, we
	will just do a walk through to all words' lists
	similar to a the merge phase of a merge sort */
	for (i = 0; i < num_match; i++) {
		/* idx is the current position we are checking
		for a particular word */
		idx[i] = 0;

		/* Number of positions for this word */
		num_pos[i] = ib_vector_size(match[i]->positions);
	}

	/* Start with the first word */
	min_idx = 0;

	/* Loop until the list whose cursor currently holds the
	smallest position is exhausted. */
	while (idx[min_idx] < num_pos[min_idx]) {
		ulint	position[MAX_PROXIMITY_ITEM];
		ulint	min_pos = ULINT_MAX;
		ulint	max_pos = 0;

		/* Check positions in each word position list, and
		record the max/min position */
		for (i = 0; i < num_match; i++) {
			position[i] = *(ulint*) ib_vector_get_const(
				match[i]->positions, idx[i]);

			/* ULINT_UNDEFINED is presumably a sentinel
			terminating a position list — stop scanning
			this combination. */
			if (position[i] == ULINT_UNDEFINED) {
				break;
			}

			if (position[i] < min_pos) {
				min_pos = position[i];
				min_idx = i;
			}

			if (position[i] > max_pos) {
				max_pos = position[i];
			}
		}

		/* If max and min position are within range, we
		find a good match. Note the second clause: if the
		loop above broke early, i < num_match and position[i]
		is ULINT_UNDEFINED, so the combination is rejected;
		the short-circuit also prevents reading position[i]
		out of bounds when i == num_match. */
		if (max_pos - min_pos <= distance
		    && (i >= num_match || position[i] != ULINT_UNDEFINED)) {
			/* The charset has variable character
			length encoding, record the min_pos and
			max_pos, we will need to verify the actual
			number of characters */
			qualified_pos->min_pos.push_back(min_pos);
			qualified_pos->max_pos.push_back(max_pos);
			qualified_pos->n_pos++;
		}

		/* Otherwise, move to the next position is the
		list for the word with the smallest position */
		idx[min_idx]++;
	}

	return(qualified_pos->n_pos != 0);
}
4526