1 /*****************************************************************************
2 
3 Copyright (c) 2007, 2020, Oracle and/or its affiliates. All Rights Reserved.
4 Copyright (c) 2017, 2021, MariaDB Corporation.
5 
6 This program is free software; you can redistribute it and/or modify it under
7 the terms of the GNU General Public License as published by the Free Software
8 Foundation; version 2 of the License.
9 
10 This program is distributed in the hope that it will be useful, but WITHOUT
11 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13 
14 You should have received a copy of the GNU General Public License along with
15 this program; if not, write to the Free Software Foundation, Inc.,
16 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
17 
18 *****************************************************************************/
19 
20 /**************************************************//**
21 @file fts/fts0que.cc
22 Full Text Search functionality.
23 
24 Created 2007/03/27 Sunny Bains
25 Completed 2011/7/10 Sunny and Jimmy Yang
26 *******************************************************/
27 
28 #include "dict0dict.h"
29 #include "ut0rbt.h"
30 #include "row0sel.h"
31 #include "fts0fts.h"
32 #include "fts0priv.h"
33 #include "fts0ast.h"
34 #include "fts0pars.h"
35 #include "fts0types.h"
36 #include "fts0plugin.h"
37 #include "fts0vlc.h"
38 
39 #include <iomanip>
40 #include <vector>
41 
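/* Accessor for a two-dimensional table stored as a flat row-major array:
FTS_ELEM(t, n, i, j) yields element (i, j) of table t that has n columns
per row. */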
42 #define FTS_ELEM(t, n, i, j) (t[(i) * (n) + (j)])
43 
44 #define RANK_DOWNGRADE		(-1.0F)
45 #define RANK_UPGRADE		(1.0F)
46 
47 /* Maximum number of words supported in a phrase or proximity search. */
48 #define MAX_PROXIMITY_ITEM	128
49 
50 /* Memory used by rbt itself for create and node add */
51 #define SIZEOF_RBT_CREATE	sizeof(ib_rbt_t) + sizeof(ib_rbt_node_t) * 2
52 #define SIZEOF_RBT_NODE_ADD	sizeof(ib_rbt_node_t)
53 
54 /* Initial byte length for 'words' in fts_ranking_t */
55 #define RANKING_WORDS_INIT_LEN	4
56 
57 // FIXME: Need to have a generic iterator that traverses the ilist.
58 
59 typedef std::vector<fts_string_t, ut_allocator<fts_string_t> >	word_vector_t;
60 
61 struct fts_word_freq_t;
62 
63 /** State of an FTS query. */
64 struct fts_query_t {
65 	mem_heap_t*	heap;		/*!< Heap to use for allocations */
66 
67 	trx_t*		trx;		/*!< The query transaction */
68 
69 	dict_index_t*	index;		/*!< The FTS index to search */
70 					/*!< FTS auxiliary common table def */
71 
72 	fts_table_t	fts_common_table;
73 
74 	fts_table_t	fts_index_table;/*!< FTS auxiliary index table def */
75 
76 	size_t		total_size;	/*!< total memory size used by query */
77 
78 	fts_doc_ids_t*	deleted;	/*!< Deleted doc ids that need to be
79 					filtered from the output */
80 
81 	fts_ast_node_t*	root;		/*!< Abstract syntax tree */
82 
83 	fts_ast_node_t* cur_node;	/*!< Current tree node */
84 
85 	ib_rbt_t*	word_map;	/*!< Matched word map for
86 					searching by word */
87 
88 	word_vector_t*	word_vector;	/*!< Matched word vector for
89 					searching by index */
90 
91 	ib_rbt_t*       doc_ids;	/*!< The current set of matching
92 					doc ids, elements are of
93 					type fts_ranking_t */
94 
95 	ib_rbt_t*	intersection;	/*!< The doc ids that were found in
96 					doc_ids, this tree will become
97 					the new doc_ids, elements are of type
98 					fts_ranking_t */
99 
100 					/*!< Prepared statement to read the
101 					nodes from the FTS INDEX */
102 	que_t*		read_nodes_graph;
103 
104 	fts_ast_oper_t	oper;		/*!< Current boolean mode operator */
105 
106 					/*!< TRUE if we want to collect the
107 					word positions within the document */
108 	ibool		collect_positions;
109 
110 	ulint		flags;		/*!< Specify the full text search type,
111 					such as  boolean search, phrase
112 					search, proximity search etc. */
113 
114 	ulint		distance;	/*!< The proximity distance of a
115 					phrase search. */
116 
117 					/*!< These doc ids are used as a
118 					boundary condition when searching the
119 					FTS index rows */
120 
121 	doc_id_t	lower_doc_id;	/*!< Lowest doc id in doc_ids */
122 
123 	doc_id_t	upper_doc_id;	/*!< Highest doc id in doc_ids */
124 
125 	bool		boolean_mode;	/*!< TRUE if boolean mode query */
126 
127 	ib_vector_t*	matched;	/*!< Array of matching documents
128 					(fts_match_t) to search for a phrase */
129 
130 	ib_vector_t**	match_array;	/*!< Used for proximity search, contains
131 					position info for each matched word
132 					in the word list */
133 
134 	ib_uint64_t	total_docs;	/*!< The total number of documents */
135 
136 	ulint		total_words;	/*!< The total number of words */
137 
138 	dberr_t		error;		/*!< Error code if any, that is
139 					encountered during query processing */
140 
141 	ib_rbt_t*	word_freqs;	/*!< RB tree of word frequencies per
142 					document, its elements are of type
143 					fts_word_freq_t */
144 
145 	ib_rbt_t*	wildcard_words;	/*!< words with wildcard */
146 
147 	bool		multi_exist;	/*!< multiple FTS_EXIST oper */
148 	byte		visiting_sub_exp; /*!< count of nested
149 					fts_ast_visit_sub_exp() */
150 
151 	st_mysql_ftparser*	parser;	/*!< fts plugin parser */
152 };
153 
154 /** For phrase matching, first we collect the documents and the positions,
155 then we match. */
156 struct fts_match_t {
157 	doc_id_t	doc_id;		/*!< Document id */
158 
159 	ulint		start;		/*!< Start the phrase match from
160 					this offset within the positions
161 					vector. */
162 
163 	ib_vector_t*	positions;	/*!< Offsets of a word in a
164 					document */
165 };
166 
167 /** For matching tokens in a phrase search. We use this data structure in
168 the callback that determines whether a document should be accepted or
169 rejected for a phrase search. */
170 struct fts_select_t {
171 	doc_id_t	doc_id;		/*!< The document id to match */
172 
173 	ulint		min_pos;	/*!< For found to be TRUE at least
174 					one position must be greater than
175 					min_pos. */
176 
177 	ibool		found;		/*!< TRUE if found */
178 
179 	fts_word_freq_t*
180 			word_freq;	/*!< Word frequency instance of the
181 					current word being looked up in
182 					the FTS index */
183 };
184 
185 typedef std::vector<ulint, ut_allocator<ulint> >       pos_vector_t;
186 
187 /** This structure defines a set of ranges in the original documents, each of
188 which has a minimum position and a maximum position. Text in such a range
189 should contain all the words in the proximity search. We will need to count
190 the words in such a range to make sure the count is less than the specified
191 distance of the proximity search. */
192 struct fts_proximity_t {
193 	ulint		n_pos;		/*!< number of position ranges; each
194 					range (min to max) contains all
195 					matching words */
196 	pos_vector_t	min_pos;	/*!< the minimum position (in bytes)
197 					of the range */
198 	pos_vector_t	max_pos;	/*!< the maximum position (in bytes)
199 					of the range */
200 };
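/* For example, if n_pos == 2, then (min_pos[0], max_pos[0]) and
(min_pos[1], max_pos[1]) bound two candidate byte ranges of the document
text, each containing all the search words; each range still has to pass
the word count check against the proximity distance. */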
201 
202 /** The match positions and tokens to match */
203 struct fts_phrase_t {
204 	fts_phrase_t(const dict_table_t* table)
205 		:
206 		found(false),
207 		match(NULL),
208 		tokens(NULL),
209 		distance(0),
210 		charset(NULL),
211 		heap(NULL),
212 		zip_size(table->space->zip_size()),
213 		proximity_pos(NULL),
214 		parser(NULL)
215 	{
216 	}
217 
218 	/** Match result */
219 	ibool			found;
220 
221 	/** Positions within text */
222 	const fts_match_t*	match;
223 
224 	/** Tokens to match */
225 	const ib_vector_t*	tokens;
226 
227 	/** For matching on proximity distance. Can be 0 for exact match */
228 	ulint			distance;
229 
230 	/** Phrase match charset */
231 	CHARSET_INFO*		charset;
232 
233 	/** Heap for word processing */
234 	mem_heap_t*		heap;
235 
236 	/** ROW_FORMAT=COMPRESSED page size, or 0 */
237 	const ulint		zip_size;
238 
239 	/** Position info for proximity search verification. Records the
240 	min and max position of words matched */
241 	fts_proximity_t*	proximity_pos;
242 
243 	/** FTS plugin parser */
244 	st_mysql_ftparser*	parser;
245 };
246 
247 /** Parameter passed to fts phrase match by parser */
248 struct fts_phrase_param_t {
249 	fts_phrase_t*	phrase;		/*!< Match phrase instance */
250 	ulint		token_index;	/*!< Index of token to match next */
251 	mem_heap_t*	heap;		/*!< Heap for word processing */
252 };
253 
254 /** For storing the frequency of a word/term in a document */
255 struct fts_doc_freq_t {
256 	doc_id_t	doc_id;		/*!< Document id */
257 	ulint		freq;		/*!< Frequency of a word in a document */
258 };
259 
260 /** To determine the word frequency per document. */
261 struct fts_word_freq_t {
262 	fts_string_t	word;		/*!< Word for which we need the freq,
263 					it's allocated on the query heap */
264 
265 	ib_rbt_t*	doc_freqs;	/*!< RB Tree for storing per document
266 					word frequencies. The elements are
267 					of type fts_doc_freq_t */
268 	ib_uint64_t	doc_count;	/*!< Total number of documents that
269 					contain this word */
270 	double		idf;		/*!< Inverse document frequency */
271 };
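/* Note: doc_count and idf are what the ranking calculation combines with
the per-document frequencies stored in doc_freqs. The idf value itself is
filled in by the ranking code outside this excerpt; it is assumed here to
be an inverse-document-frequency style measure derived from the ratio of
total_docs to doc_count. */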
272 
273 /********************************************************************
274 Callback function to fetch the rows in an FTS INDEX record.
275 @return always TRUE */
276 static
277 ibool
278 fts_query_index_fetch_nodes(
279 /*========================*/
280 	void*		row,		/*!< in: sel_node_t* */
281 	void*		user_arg);	/*!< in: pointer to ib_vector_t */
282 
283 /********************************************************************
284 Read and filter nodes.
285 @return DB_SUCCESS or error code */
286 static
287 dberr_t
288 fts_query_filter_doc_ids(
289 /*=====================*/
290 	fts_query_t*		query,		/*!< in: query instance */
291 	const fts_string_t*	word,		/*!< in: the current word */
292 	fts_word_freq_t*	word_freq,	/*!< in/out: word frequency */
293 	const fts_node_t*	node,		/*!< in: current FTS node */
294 	void*			data,		/*!< in: doc id ilist */
295 	ulint			len,		/*!< in: doc id ilist size */
296 	ibool			calc_doc_count);/*!< in: whether to remember doc
297 						count */
298 
299 /** Process a (nested) sub-expression: create a new result set to store the
300 sub-expression result by processing the nodes under the current sub-expression
301 list. Merge the sub-expression result with that of the parent expression list.
302 @param[in,out]	node	current root node
303 @param[in,out]	visitor	callback function
304 @param[in,out]	arg	argument for callback
305 @return DB_SUCCESS if all goes well */
306 static
307 dberr_t
308 fts_ast_visit_sub_exp(
309 	fts_ast_node_t*		node,
310 	fts_ast_callback	visitor,
311 	void*			arg);
312 
313 #if 0
314 /*****************************************************************//***
315 Find a doc_id in a word's ilist.
316 @return TRUE if found. */
317 static
318 ibool
319 fts_query_find_doc_id(
320 /*==================*/
321 	fts_select_t*	select,		/*!< in/out: search the doc id selected,
322 					update the frequency if found. */
323 	void*		data,		/*!< in: doc id ilist */
324 	ulint		len);		/*!< in: doc id ilist size */
325 #endif
326 
327 /*************************************************************//**
328 This function implements a simple "blind" query expansion search:
329 words in documents found in the first search pass will be used as
330 search arguments to search the documents again, thus "expanding"
331 the search result set.
332 @return DB_SUCCESS on success, otherwise the error code */
333 static
334 dberr_t
335 fts_expand_query(
336 /*=============*/
337 	dict_index_t*	index,		/*!< in: FTS index to search */
338 	fts_query_t*	query)		/*!< in: query result, to be freed
339 					by the client */
340 	MY_ATTRIBUTE((nonnull, warn_unused_result));
341 /*************************************************************//**
342 This function finds documents that contain all the words in a
343 phrase or proximity search. For a proximity search, it also verifies
344 that the words are close enough to each other, within the specified
345 distance. This function is called for both phrase and proximity search.
346 @return TRUE if documents are found, FALSE otherwise */
347 static
348 ibool
349 fts_phrase_or_proximity_search(
350 /*===========================*/
351 	fts_query_t*	query,		/*!< in/out:  query instance
352 					query->doc_ids might be instantiated
353 					with qualified doc IDs */
354 	ib_vector_t*	tokens);	/*!< in: Tokens contain words */
355 /*************************************************************//**
356 This function checks whether the words in the result documents are close to
357 each other (within the proximity range specified by "distance").
358 If "distance" is MAX_ULINT, then it will find all combinations of
359 positions of matching words and store the min and max positions
360 in "qualified_pos" for later verification.
361 @return true if words are close to each other, false otherwise */
362 static
363 bool
364 fts_proximity_get_positions(
365 /*========================*/
366 	fts_match_t**		match,		/*!< in: query instance */
367 	ulint			num_match,	/*!< in: number of matching
368 						items */
369 	ulint			distance,	/*!< in: distance value
370 						for proximity search */
371 	fts_proximity_t*	qualified_pos);	/*!< out: the position info
372 						records ranges containing
373 						all matching words. */
374 #if 0
375 /********************************************************************
376 Get the total number of words in a document. */
377 static
378 ulint
379 fts_query_terms_in_document(
380 /*========================*/
381 					/*!< out: DB_SUCCESS if all go well
382 					else error code */
383 	fts_query_t*	query,		/*!< in: FTS query state */
384 	doc_id_t	doc_id,		/*!< in: the document to check */
385 	ulint*		total);		/*!< out: total words in document */
386 #endif
387 
388 /********************************************************************
389 Compare two fts_doc_freq_t doc_ids.
390 @return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
391 UNIV_INLINE
392 int
393 fts_freq_doc_id_cmp(
394 /*================*/
395 	const void*	p1,			/*!< in: id1 */
396 	const void*	p2)			/*!< in: id2 */
397 {
398 	const fts_doc_freq_t*	fq1 = (const fts_doc_freq_t*) p1;
399 	const fts_doc_freq_t*	fq2 = (const fts_doc_freq_t*) p2;
400 
401 	return((int) (fq1->doc_id - fq2->doc_id));
402 }
403 
404 #if 0
405 /*******************************************************************//**
406 Print the table used for calculating LCS. */
407 static
408 void
409 fts_print_lcs_table(
410 /*================*/
411 	const ulint*	table,		/*!< in: array to print */
412 	ulint		n_rows,		/*!< in: total no. of rows */
413 	ulint		n_cols)		/*!< in: total no. of cols */
414 {
415 	ulint		i;
416 
417 	for (i = 0; i < n_rows; ++i) {
418 		ulint	j;
419 
420 		printf("\n");
421 
422 		for (j = 0; j < n_cols; ++j) {
423 
424 			printf("%2lu ", FTS_ELEM(table, n_cols, i, j));
425 		}
426 	}
427 }
428 
429 /********************************************************************
430 Find the longest common subsequence between the query string and
431 the document. */
432 static
433 ulint
434 fts_query_lcs(
435 /*==========*/
436 					/*!< out: LCS (length) between
437 					two ilists */
438 	const	ulint*	p1,		/*!< in: word positions of query */
439 	ulint	len_p1,			/*!< in: no. of elements in p1 */
440 	const	ulint*	p2,		/*!< in: word positions within document */
441 	ulint	len_p2)			/*!< in: no. of elements in p2 */
442 {
443 	int	i;
444 	ulint	len = 0;
445 	ulint	r = len_p1;
446 	ulint	c = len_p2;
447 	ulint	size = (r + 1) * (c + 1) * sizeof(ulint);
448 	ulint*	table = (ulint*) ut_malloc_nokey(size);
449 
450 	/* Traverse the table backwards, from the last row to the first and
451 	also from the last column to the first. We compute the smaller
452 	common subsequences first, then use the calculated values to determine
453 	the longest common subsequence. The result will be in TABLE[0][0]. */
454 	for (i = r; i >= 0; --i) {
455 		int	j;
456 
457 		for (j = c; j >= 0; --j) {
458 
459 			if (p1[i] == (ulint) -1 || p2[j] == (ulint) -1) {
460 
461 				FTS_ELEM(table, c, i, j) = 0;
462 
463 			} else if (p1[i] == p2[j]) {
464 
465 				FTS_ELEM(table, c, i, j) = FTS_ELEM(
466 					table, c, i + 1, j + 1) + 1;
467 
468 			} else {
469 
470 				ulint	value;
471 
472 				value = ut_max(
473 					FTS_ELEM(table, c, i + 1, j),
474 					FTS_ELEM(table, c, i, j + 1));
475 
476 				FTS_ELEM(table, c, i, j) = value;
477 			}
478 		}
479 	}
480 
481 	len = FTS_ELEM(table, c, 0, 0);
482 
483 	fts_print_lcs_table(table, r, c);
484 	printf("\nLen=" ULINTPF "\n", len);
485 
486 	ut_free(table);
487 
488 	return(len);
489 }
490 #endif
491 
492 /*******************************************************************//**
493 Compare two fts_ranking_t instances on their rank values and doc ids, in
494 descending order of rank and ascending order of doc id.
495 @return 0 if p1 == p2, < 0 if p1 < p2, > 0 if p1 > p2 */
496 static
497 int
498 fts_query_compare_rank(
499 /*===================*/
500 	const void*	p1,		/*!< in: pointer to elem */
501 	const void*	p2)		/*!< in: pointer to elem */
502 {
503 	const fts_ranking_t*	r1 = (const fts_ranking_t*) p1;
504 	const fts_ranking_t*	r2 = (const fts_ranking_t*) p2;
505 
506 	if (r2->rank < r1->rank) {
507 		return(-1);
508 	} else if (r2->rank == r1->rank) {
509 
510 		if (r1->doc_id < r2->doc_id) {
511 			return(1);
512 		} else if (r1->doc_id > r2->doc_id) {
513 			return(1);
514 		}
515 
516 		return(0);
517 	}
518 
519 	return(1);
520 }
521 
522 /*******************************************************************//**
523 Create words in ranking */
524 static
525 void
526 fts_ranking_words_create(
527 /*=====================*/
528 	fts_query_t*	query,		/*!< in: query instance */
529 	fts_ranking_t*	ranking)	/*!< in: ranking instance */
530 {
531 	ranking->words = static_cast<byte*>(
532 		mem_heap_zalloc(query->heap, RANKING_WORDS_INIT_LEN));
533 	ranking->words_len = RANKING_WORDS_INIT_LEN;
534 }
535 
536 /*
537 The optimization here is to use a char array (bitmap) instead of a words rb tree
538 in fts_ranking_t.
539 
540 This saves a lot of memory, except in some cases of QUERY EXPANSION.
541 
542 'word_map' is used as a word dictionary, in which the key is a word and the value
543 is a number. In 'fts_ranking_words_add', we first check whether the word is in 'word_map'.
544 If not, we add it to 'word_map' and assign it a position (actually a number).
545 Then we set the bit at that position to '1' in the char array 'words'.
546 
547 'word_vector' is a backup of 'word_map' that lets us look up a word by its position
548 more quickly than searching by value in 'word_map'. We use 'word_vector'
549 in 'fts_query_calculate_ranking' and 'fts_expand_query'. In both functions, we need
550 to scan the bitmap 'words', fetch the word whose bit is '1', and then look up its
551 word_freq by that word.
552 */
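/*
A worked example with two hypothetical query words: the first call to
fts_ranking_words_add() for word "w0" finds nothing in 'word_map', so "w0"
is added with position 0 and appended to 'word_vector'; a second word "w1"
likewise gets position 1. For a document that matches both words, bits 0
and 1 of ranking->words[0] are set. fts_ranking_words_get_next() later
scans those bits and maps each set position back to its word through
'word_vector'.
*/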
553 
554 /*******************************************************************//**
555 Add a word into ranking */
556 static
557 void
558 fts_ranking_words_add(
559 /*==================*/
560 	fts_query_t*		query,		/*!< in: query instance */
561 	fts_ranking_t*		ranking,	/*!< in: ranking instance */
562 	const fts_string_t*	word)		/*!< in: term/word to add */
563 {
564 	ulint	pos;
565 	ulint	byte_offset;
566 	ulint	bit_offset;
567 	ib_rbt_bound_t	parent;
568 
569 	/* Note: we suppose the word map and vector are append-only. */
570 	ut_ad(query->word_vector->size() == rbt_size(query->word_map));
571 
572 	/* We use ib_rbt to simulate a map, f_n_char means position. */
573 	if (rbt_search(query->word_map, &parent, word) == 0) {
574 		fts_string_t*	result_word;
575 
576 		result_word = rbt_value(fts_string_t, parent.last);
577 		pos = result_word->f_n_char;
578 		ut_ad(pos < rbt_size(query->word_map));
579 	} else {
580 		/* Add the word to map. */
581 		fts_string_t	new_word;
582 
583 		pos = rbt_size(query->word_map);
584 
585 		fts_string_dup(&new_word, word, query->heap);
586 		new_word.f_n_char = pos;
587 
588 		rbt_add_node(query->word_map, &parent, &new_word);
589 		ut_ad(rbt_validate(query->word_map));
590 		query->word_vector->push_back(new_word);
591 	}
592 
593 	/* Check words len */
594 	byte_offset = pos / CHAR_BIT;
595 	if (byte_offset >= ranking->words_len) {
596 		byte*	words = ranking->words;
597 		ulint	words_len = ranking->words_len;
598 
599 		while (byte_offset >= words_len) {
600 			words_len *= 2;
601 		}
602 
603 		ranking->words = static_cast<byte*>(
604 			mem_heap_zalloc(query->heap, words_len));
605 		memcpy(ranking->words, words, ranking->words_len);
606 		ranking->words_len = words_len;
607 	}
608 
609 	/* Set ranking words */
610 	ut_ad(byte_offset < ranking->words_len);
611 	bit_offset = pos % CHAR_BIT;
612 	ranking->words[byte_offset] = static_cast<byte>(
613 		ranking->words[byte_offset] | 1 << bit_offset);
614 }
615 
616 /*******************************************************************//**
617 Get a word from a ranking
618 @return true if it's successful */
619 static
620 bool
621 fts_ranking_words_get_next(
622 /*=======================*/
623 	const	fts_query_t*	query,	/*!< in: query instance */
624 	fts_ranking_t*		ranking,/*!< in: ranking instance */
625 	ulint*			pos,	/*!< in/out: word start pos */
626 	fts_string_t*		word)	/*!< in/out: term/word to add */
627 {
628 	bool	ret = false;
629 	ulint	max_pos = ranking->words_len * CHAR_BIT;
630 
631 	/* Search for next word */
632 	while (*pos < max_pos) {
633 		ulint	byte_offset = *pos / CHAR_BIT;
634 		ulint	bit_offset = *pos % CHAR_BIT;
635 
636 		if (ranking->words[byte_offset] & (1 << bit_offset)) {
637 			ret = true;
638 			break;
639 		}
640 
641 		*pos += 1;
642 	}
643 
644 	/* Get next word from word vector */
645 	if (ret) {
646 		ut_ad(*pos < query->word_vector->size());
647 		*word = query->word_vector->at((size_t)*pos);
648 		*pos += 1;
649 	}
650 
651 	return ret;
652 }
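/* Typical iteration over the words recorded in a ranking, as done in
fts_merge_doc_ids() later in this file:

	ulint		pos = 0;
	fts_string_t	word;

	while (fts_ranking_words_get_next(query, ranking, &pos, &word)) {
		fts_query_add_word_to_document(query, ranking->doc_id, &word);
	}
*/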
653 
654 /*******************************************************************//**
655 Add a word to the term freq RB tree if it doesn't already exist.
656 The word is duplicated onto the query heap.
657 @return pointer to the word's frequency entry */
658 static
659 fts_word_freq_t*
660 fts_query_add_word_freq(
661 /*====================*/
662 	fts_query_t*		query,		/*!< in: query instance */
663 	const fts_string_t*	word)		/*!< in: term/word to add */
664 {
665 	ib_rbt_bound_t		parent;
666 
667 	/* Lookup the word in our rb tree and add if it doesn't exist. */
668 	if (rbt_search(query->word_freqs, &parent, word) != 0) {
669 		fts_word_freq_t	word_freq;
670 
671 		memset(&word_freq, 0, sizeof(word_freq));
672 
673 		fts_string_dup(&word_freq.word, word, query->heap);
674 
675 		word_freq.doc_count = 0;
676 
677 		word_freq.doc_freqs = rbt_create(
678 			sizeof(fts_doc_freq_t), fts_freq_doc_id_cmp);
679 
680 		parent.last = rbt_add_node(
681 			query->word_freqs, &parent, &word_freq);
682 
683 		query->total_size += word->f_len
684 			+ SIZEOF_RBT_CREATE
685 			+ SIZEOF_RBT_NODE_ADD
686 			+ sizeof(fts_word_freq_t);
687 	}
688 
689 	return(rbt_value(fts_word_freq_t, parent.last));
690 }
691 
692 /*******************************************************************//**
693 Add a doc id to the doc freq RB tree if it doesn't already exist.
694 @return pointer to the doc id's frequency entry */
695 static
696 fts_doc_freq_t*
697 fts_query_add_doc_freq(
698 /*===================*/
699 	fts_query_t*	query,		/*!< in: query instance	*/
700 	ib_rbt_t*	doc_freqs,	/*!< in: rb tree of fts_doc_freq_t */
701 	doc_id_t	doc_id)		/*!< in: doc id to add */
702 {
703 	ib_rbt_bound_t	parent;
704 
705 	/* Lookup the doc id in our rb tree and add if it doesn't exist. */
706 	if (rbt_search(doc_freqs, &parent, &doc_id) != 0) {
707 		fts_doc_freq_t	doc_freq;
708 
709 		memset(&doc_freq, 0, sizeof(doc_freq));
710 
711 		doc_freq.freq = 0;
712 		doc_freq.doc_id = doc_id;
713 
714 		parent.last = rbt_add_node(doc_freqs, &parent, &doc_freq);
715 
716 		query->total_size += SIZEOF_RBT_NODE_ADD
717 			+ sizeof(fts_doc_freq_t);
718 	}
719 
720 	return(rbt_value(fts_doc_freq_t, parent.last));
721 }
722 
723 /*******************************************************************//**
724 Add the doc id to the query set only if it's not in the
725 deleted array. */
726 static
727 void
728 fts_query_union_doc_id(
729 /*===================*/
730 	fts_query_t*	query,		/*!< in: query instance */
731 	doc_id_t	doc_id,		/*!< in: the doc id to add */
732 	fts_rank_t	rank)		/*!< in: if non-zero, it is the
733 					rank associated with the doc_id */
734 {
735 	ib_rbt_bound_t	parent;
736 	ulint		size = ib_vector_size(query->deleted->doc_ids);
737 	doc_id_t*	updates = (doc_id_t*) query->deleted->doc_ids->data;
738 
739 	/* Check if the doc id is deleted and it's not already in our set. */
740 	if (fts_bsearch(updates, 0, static_cast<int>(size), doc_id) < 0
741 	    && rbt_search(query->doc_ids, &parent, &doc_id) != 0) {
742 
743 		fts_ranking_t	ranking;
744 
745 		ranking.rank = rank;
746 		ranking.doc_id = doc_id;
747 		fts_ranking_words_create(query, &ranking);
748 
749 		rbt_add_node(query->doc_ids, &parent, &ranking);
750 
751 		query->total_size += SIZEOF_RBT_NODE_ADD
752 			+ sizeof(fts_ranking_t) + RANKING_WORDS_INIT_LEN;
753 	}
754 }
755 
756 /*******************************************************************//**
757 Remove the doc id from the query set only if it's not in the
758 deleted set. */
759 static
760 void
761 fts_query_remove_doc_id(
762 /*====================*/
763 	fts_query_t*	query,		/*!< in: query instance */
764 	doc_id_t	doc_id)		/*!< in: the doc id to remove */
765 {
766 	ib_rbt_bound_t	parent;
767 	ulint		size = ib_vector_size(query->deleted->doc_ids);
768 	doc_id_t*	updates = (doc_id_t*) query->deleted->doc_ids->data;
769 
770 	/* Check if the doc id is deleted and it's in our set. */
771 	if (fts_bsearch(updates, 0, static_cast<int>(size), doc_id) < 0
772 	    && rbt_search(query->doc_ids, &parent, &doc_id) == 0) {
773 		ut_free(rbt_remove_node(query->doc_ids, parent.last));
774 
775 		ut_ad(query->total_size >=
776 		      SIZEOF_RBT_NODE_ADD + sizeof(fts_ranking_t));
777 		query->total_size -= SIZEOF_RBT_NODE_ADD
778 			+ sizeof(fts_ranking_t);
779 	}
780 }
781 
782 /*******************************************************************//**
783 Find the doc id in the query set but not in the deleted set, and artificially
784 downgrade or upgrade its ranking by a fixed value, allowing the ranking to go
785 below or above its normal range of 0 to 1. This is used for Boolean Search
786 operators such as the negation operator, which makes a word's contribution to
787 the row's relevance negative. */
788 static
789 void
790 fts_query_change_ranking(
791 /*====================*/
792 	fts_query_t*	query,		/*!< in: query instance */
793 	doc_id_t	doc_id,		/*!< in: the doc id whose ranking to adjust */
794 	ibool		downgrade)	/*!< in: Whether to downgrade ranking */
795 {
796 	ib_rbt_bound_t	parent;
797 	ulint		size = ib_vector_size(query->deleted->doc_ids);
798 	doc_id_t*	updates = (doc_id_t*) query->deleted->doc_ids->data;
799 
800 	/* Check if the doc id is deleted and it's in our set. */
801 	if (fts_bsearch(updates, 0, static_cast<int>(size), doc_id) < 0
802 	    && rbt_search(query->doc_ids, &parent, &doc_id) == 0) {
803 
804 		fts_ranking_t*	ranking;
805 
806 		ranking = rbt_value(fts_ranking_t, parent.last);
807 
808 		ranking->rank += downgrade ? RANK_DOWNGRADE : RANK_UPGRADE;
809 
810 		/* The rank is adjusted by RANK_DOWNGRADE (-1.0) or
811 		RANK_UPGRADE (1.0) and clamped to the range [-1.0, 1.0]. */
812 		if (ranking->rank >= 1.0F) {
813 			ranking->rank = 1.0F;
814 		} else if (ranking->rank <= -1.0F) {
815 			ranking->rank = -1.0F;
816 		}
817 	}
818 }
819 
820 /*******************************************************************//**
821 Check the doc id in the query set only if it's not in the
822 deleted array. The doc ids that were found are stored in
823 another rb tree (fts_query_t::intersection). */
824 static
825 void
826 fts_query_intersect_doc_id(
827 /*=======================*/
828 	fts_query_t*	query,		/*!< in: query instance */
829 	doc_id_t	doc_id,		/*!< in: the doc id to add */
830 	fts_rank_t	rank)		/*!< in: if non-zero, it is the
831 					rank associated with the doc_id */
832 {
833 	ib_rbt_bound_t	parent;
834 	ulint		size = ib_vector_size(query->deleted->doc_ids);
835 	doc_id_t*	updates = (doc_id_t*) query->deleted->doc_ids->data;
836 	fts_ranking_t*	ranking= NULL;
837 
838 	/* There are three types of intersect:
839 	   1. '+a': doc_ids is empty; add a doc into intersect if it matches 'a'.
840 	   2. 'a +b': docs matching 'a' are in doc_ids; add a doc into intersect
841 	      if it matches 'b'. If the doc is also in doc_ids, then change the
842 	      doc's rank, and add 'a' to the doc's words.
843 	   3. '+a +b': docs matching '+a' are in doc_ids; add a doc into intersect
844 	      if it matches 'b' and it's in doc_ids (multi_exist = true). */
845 
846 	/* Check if the doc id is deleted and it's in our set */
847 	if (fts_bsearch(updates, 0, static_cast<int>(size), doc_id) < 0) {
848 		fts_ranking_t	new_ranking;
849 
850 		if (rbt_search(query->doc_ids, &parent, &doc_id) != 0) {
851 			if (query->multi_exist) {
852 				return;
853 			} else {
854 				new_ranking.words = NULL;
855 			}
856 		} else {
857 			ranking = rbt_value(fts_ranking_t, parent.last);
858 
859 			/* We've just checked the doc id before */
860 			if (ranking->words == NULL) {
861 				ut_ad(rbt_search(query->intersection, &parent,
862 					ranking) == 0);
863 				return;
864 			}
865 
866 			/* Merge rank */
867 			rank += ranking->rank;
868 			if (rank >= 1.0F) {
869 				rank = 1.0F;
870 			} else if (rank <= -1.0F) {
871 				rank = -1.0F;
872 			}
873 
874 			/* Take words */
875 			new_ranking.words = ranking->words;
876 			new_ranking.words_len = ranking->words_len;
877 		}
878 
879 		new_ranking.rank = rank;
880 		new_ranking.doc_id = doc_id;
881 
882 		if (rbt_search(query->intersection, &parent,
883 			       &new_ranking) != 0) {
884 			if (new_ranking.words == NULL) {
885 				fts_ranking_words_create(query, &new_ranking);
886 
887 				query->total_size += RANKING_WORDS_INIT_LEN;
888 			} else {
889 				/* Note that the intersection has taken
890 				ownership of the ranking data. */
891 				ranking->words = NULL;
892 			}
893 
894 			rbt_add_node(query->intersection,
895 				     &parent, &new_ranking);
896 
897 			query->total_size += SIZEOF_RBT_NODE_ADD
898 				+ sizeof(fts_ranking_t);
899 		}
900 	}
901 }
902 
903 /*******************************************************************//**
904 Free the document ranking rb tree. */
905 static
906 void
907 fts_query_free_doc_ids(
908 /*===================*/
909 	fts_query_t*	query,		/*!< in: query instance */
910 	ib_rbt_t*	doc_ids)	/*!< in: rb tree to free */
911 {
912 	const ib_rbt_node_t*	node;
913 
914 	for (node = rbt_first(doc_ids); node; node = rbt_first(doc_ids)) {
915 
916 		fts_ranking_t*	ranking;
917 
918 		ranking = rbt_value(fts_ranking_t, node);
919 
920 		if (ranking->words) {
921 			ranking->words = NULL;
922 		}
923 
924 		ut_free(rbt_remove_node(doc_ids, node));
925 
926 		ut_ad(query->total_size >=
927 		      SIZEOF_RBT_NODE_ADD + sizeof(fts_ranking_t));
928 		query->total_size -= SIZEOF_RBT_NODE_ADD
929 			+ sizeof(fts_ranking_t);
930 	}
931 
932 	rbt_free(doc_ids);
933 
934 	ut_ad(query->total_size >= SIZEOF_RBT_CREATE);
935 	query->total_size -= SIZEOF_RBT_CREATE;
936 }
937 
938 /*******************************************************************//**
939 Add the word to the document's "list" of matching words from
940 the query. We make a copy of the word on the query heap. */
941 static
942 void
943 fts_query_add_word_to_document(
944 /*===========================*/
945 	fts_query_t*		query,	/*!< in: query to update */
946 	doc_id_t		doc_id,	/*!< in: the document to update */
947 	const fts_string_t*	word)	/*!< in: the token to add */
948 {
949 	ib_rbt_bound_t		parent;
950 	fts_ranking_t*		ranking = NULL;
951 
952 	if (query->flags == FTS_OPT_RANKING) {
953 		return;
954 	}
955 
956 	/* First we search the intersection RB tree as it could have
957 	taken ownership of the words rb tree instance. */
958 	if (query->intersection
959 	    && rbt_search(query->intersection, &parent, &doc_id) == 0) {
960 
961 		ranking = rbt_value(fts_ranking_t, parent.last);
962 	}
963 
964 	if (ranking == NULL
965 	    && rbt_search(query->doc_ids, &parent, &doc_id) == 0) {
966 
967 		ranking = rbt_value(fts_ranking_t, parent.last);
968 	}
969 
970 	if (ranking != NULL) {
971 		fts_ranking_words_add(query, ranking, word);
972 	}
973 }
974 
975 /*******************************************************************//**
976 Check the node ilist. */
977 static
978 void
979 fts_query_check_node(
980 /*=================*/
981 	fts_query_t*		query,	/*!< in: query to update */
982 	const fts_string_t*	token,	/*!< in: the token to search */
983 	const fts_node_t*	node)	/*!< in: node to check */
984 {
985 	/* Skip nodes whose doc ids are out of range. */
986 	if (query->oper == FTS_EXIST
987 	    && ((query->upper_doc_id > 0
988 		&& node->first_doc_id > query->upper_doc_id)
989 		|| (query->lower_doc_id > 0
990 		    && node->last_doc_id < query->lower_doc_id))) {
991 
992 		/* Ignore */
993 
994 	} else {
995 		int		ret;
996 		ib_rbt_bound_t	parent;
997 		ulint		ilist_size = node->ilist_size;
998 		fts_word_freq_t*word_freqs;
999 
1000 		/* The word must exist. */
1001 		ret = rbt_search(query->word_freqs, &parent, token);
1002 		ut_a(ret == 0);
1003 
1004 		word_freqs = rbt_value(fts_word_freq_t, parent.last);
1005 
1006 		query->error = fts_query_filter_doc_ids(
1007 					query, token, word_freqs, node,
1008 					node->ilist, ilist_size, TRUE);
1009 	}
1010 }
1011 
1012 /*****************************************************************//**
1013 Search index cache for word with wildcard match.
1014 @return number of words matched */
1015 static
1016 ulint
1017 fts_cache_find_wildcard(
1018 /*====================*/
1019 	fts_query_t*		query,		/*!< in: query instance */
1020 	const fts_index_cache_t*index_cache,	/*!< in: cache to search */
1021 	const fts_string_t*	token)		/*!< in: token to search */
1022 {
1023 	ib_rbt_bound_t		parent;
1024 	const ib_vector_t*	nodes = NULL;
1025 	fts_string_t		srch_text;
1026 	byte			term[FTS_MAX_WORD_LEN + 1];
1027 	ulint			num_word = 0;
1028 
1029 	srch_text.f_len = (token->f_str[token->f_len - 1] == '%')
1030 			? token->f_len - 1
1031 			: token->f_len;
1032 
1033 	strncpy((char*) term, (char*) token->f_str, srch_text.f_len);
1034 	term[srch_text.f_len] = '\0';
1035 	srch_text.f_str = term;
1036 
1037 	/* Lookup the word in the rb tree */
1038 	if (rbt_search_cmp(index_cache->words, &parent, &srch_text, NULL,
1039 			   innobase_fts_text_cmp_prefix) == 0) {
1040 		const fts_tokenizer_word_t*     word;
1041 		ulint				i;
1042 		const ib_rbt_node_t*		cur_node;
1043 		ibool				forward = FALSE;
1044 
1045 		word = rbt_value(fts_tokenizer_word_t, parent.last);
1046 		cur_node = parent.last;
1047 
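		/* Starting from the initial prefix match, walk the words
		rb tree backwards (rbt_prev) first, then forwards (rbt_next)
		from the same starting node, so that every cached word
		sharing the prefix is visited. */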
1048 		while (innobase_fts_text_cmp_prefix(
1049 			index_cache->charset, &srch_text, &word->text) == 0) {
1050 
1051 			nodes = word->nodes;
1052 
1053 			for (i = 0; nodes && i < ib_vector_size(nodes); ++i) {
1054 				int                     ret;
1055 				const fts_node_t*       node;
1056 				ib_rbt_bound_t          freq_parent;
1057 				fts_word_freq_t*	word_freqs;
1058 
1059 				node = static_cast<const fts_node_t*>(
1060 					ib_vector_get_const(nodes, i));
1061 
1062 				ret = rbt_search(query->word_freqs,
1063 						 &freq_parent,
1064 						 &srch_text);
1065 
1066 				ut_a(ret == 0);
1067 
1068 				word_freqs = rbt_value(
1069 					fts_word_freq_t,
1070 					freq_parent.last);
1071 
1072 				query->error = fts_query_filter_doc_ids(
1073 					query, &srch_text,
1074 					word_freqs, node,
1075 					node->ilist, node->ilist_size, TRUE);
1076 
1077 				if (query->error != DB_SUCCESS) {
1078 					return(0);
1079 				}
1080 			}
1081 
1082 			num_word++;
1083 
1084 			if (!forward) {
1085 				cur_node = rbt_prev(
1086 					index_cache->words, cur_node);
1087 			} else {
1088 cont_search:
1089 				cur_node = rbt_next(
1090 					index_cache->words, cur_node);
1091 			}
1092 
1093 			if (!cur_node) {
1094 				break;
1095 			}
1096 
1097 			word = rbt_value(fts_tokenizer_word_t, cur_node);
1098 		}
1099 
1100 		if (!forward) {
1101 			forward = TRUE;
1102 			cur_node = parent.last;
1103 			goto cont_search;
1104 		}
1105 	}
1106 
1107 	return(num_word);
1108 }
1109 
1110 /*****************************************************************//**
1111 Set difference.
1112 @return DB_SUCCESS if all goes well */
1113 static MY_ATTRIBUTE((nonnull, warn_unused_result))
1114 dberr_t
1115 fts_query_difference(
1116 /*=================*/
1117 	fts_query_t*		query,	/*!< in: query instance */
1118 	const fts_string_t*	token)	/*!< in: token to search */
1119 {
1120 	ulint			n_doc_ids= 0;
1121 	trx_t*			trx = query->trx;
1122 	dict_table_t*		table = query->index->table;
1123 
1124 	ut_a(query->oper == FTS_IGNORE);
1125 
1126 #ifdef FTS_INTERNAL_DIAG_PRINT
1127 	{
1128 		ib::info	out;
1129 		out << "DIFFERENCE: Searching: '";
1130 		out.write(token->f_str, token->f_len);
1131 		out << "'";
1132 	}
1133 #endif
1134 
1135 	if (query->doc_ids) {
1136 		n_doc_ids = rbt_size(query->doc_ids);
1137 	}
1138 
1139 	/* There is nothing we can subtract from an empty set. */
1140 	if (query->doc_ids && !rbt_empty(query->doc_ids)) {
1141 		ulint			i;
1142 		fts_fetch_t		fetch;
1143 		const ib_vector_t*	nodes;
1144 		const fts_index_cache_t*index_cache;
1145 		que_t*			graph = NULL;
1146 		fts_cache_t*		cache = table->fts->cache;
1147 		dberr_t			error;
1148 
1149 		rw_lock_x_lock(&cache->lock);
1150 
1151 		index_cache = fts_find_index_cache(cache, query->index);
1152 
1153 		/* Must find the index cache */
1154 		ut_a(index_cache != NULL);
1155 
1156 		/* Search the cache for a matching word first. */
1157 		if (query->cur_node->term.wildcard
1158 		    && query->flags != FTS_PROXIMITY
1159 		    && query->flags != FTS_PHRASE) {
1160 			fts_cache_find_wildcard(query, index_cache, token);
1161 		} else {
1162 			nodes = fts_cache_find_word(index_cache, token);
1163 
1164 			for (i = 0; nodes && i < ib_vector_size(nodes)
1165 			     && query->error == DB_SUCCESS; ++i) {
1166 				const fts_node_t*	node;
1167 
1168 				node = static_cast<const fts_node_t*>(
1169 					ib_vector_get_const(nodes, i));
1170 
1171 				fts_query_check_node(query, token, node);
1172 			}
1173 		}
1174 
1175 		rw_lock_x_unlock(&cache->lock);
1176 
1177 		/* error is passed by 'query->error' */
1178 		if (query->error != DB_SUCCESS) {
1179 			ut_ad(query->error == DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
1180 			return(query->error);
1181 		}
1182 
1183 		/* Setup the callback args for filtering and
1184 		consolidating the ilist. */
1185 		fetch.read_arg = query;
1186 		fetch.read_record = fts_query_index_fetch_nodes;
1187 
1188 		error = fts_index_fetch_nodes(
1189 			trx, &graph, &query->fts_index_table, token, &fetch);
1190 
1191 		/* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */
1192 		ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS));
1193 		if (error != DB_SUCCESS) {
1194 			query->error = error;
1195 		}
1196 
1197 		fts_que_graph_free(graph);
1198 	}
1199 
1200 	/* The size can't increase. */
1201 	ut_a(rbt_size(query->doc_ids) <= n_doc_ids);
1202 
1203 	return(query->error);
1204 }
1205 
1206 /*****************************************************************//**
1207 Intersect the token doc ids with the current set.
1208 @return DB_SUCCESS if all goes well */
1209 static MY_ATTRIBUTE((nonnull, warn_unused_result))
1210 dberr_t
1211 fts_query_intersect(
1212 /*================*/
1213 	fts_query_t*		query,	/*!< in: query instance */
1214 	const fts_string_t*	token)	/*!< in: the token to search */
1215 {
1216 	trx_t*			trx = query->trx;
1217 	dict_table_t*		table = query->index->table;
1218 
1219 	ut_a(query->oper == FTS_EXIST);
1220 
1221 #ifdef FTS_INTERNAL_DIAG_PRINT
1222 	{
1223 		ib::info	out;
1224 		out << "INTERSECT: Searching: '";
1225 		out.write(token->f_str, token->f_len);
1226 		out << "'";
1227 	}
1228 #endif
1229 
1230 	/* If the current doc id set is empty and multi_exist is true,
1231 	we know in advance that the intersection will be empty, so skip it. */
1232 	if (!(rbt_empty(query->doc_ids) && query->multi_exist)) {
1233 		ulint                   n_doc_ids = 0;
1234 		ulint			i;
1235 		fts_fetch_t		fetch;
1236 		const ib_vector_t*	nodes;
1237 		const fts_index_cache_t*index_cache;
1238 		que_t*			graph = NULL;
1239 		fts_cache_t*		cache = table->fts->cache;
1240 		dberr_t			error;
1241 
1242 		ut_a(!query->intersection);
1243 
1244 		n_doc_ids = rbt_size(query->doc_ids);
1245 
1246 		/* Create the rb tree that will hold the doc ids of
1247 		the intersection. */
1248 		query->intersection = rbt_create(
1249 			sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
1250 
1251 		query->total_size += SIZEOF_RBT_CREATE;
1252 
1253 		/* This is to avoid decompressing the ilist if the
1254 		node's ilist doc ids are out of range. */
1255 		if (!rbt_empty(query->doc_ids) && query->multi_exist) {
1256 			const ib_rbt_node_t*	node;
1257 			doc_id_t*		doc_id;
1258 
1259 			node = rbt_first(query->doc_ids);
1260 			doc_id = rbt_value(doc_id_t, node);
1261 			query->lower_doc_id = *doc_id;
1262 
1263 			node = rbt_last(query->doc_ids);
1264 			doc_id = rbt_value(doc_id_t, node);
1265 			query->upper_doc_id = *doc_id;
1266 
1267 		} else {
1268 			query->lower_doc_id = 0;
1269 			query->upper_doc_id = 0;
1270 		}
1271 
1272 		/* Search the cache for a matching word first. */
1273 
1274 		rw_lock_x_lock(&cache->lock);
1275 
1276 		/* Search for the index specific cache. */
1277 		index_cache = fts_find_index_cache(cache, query->index);
1278 
1279 		/* Must find the index cache. */
1280 		ut_a(index_cache != NULL);
1281 
1282 		if (query->cur_node->term.wildcard) {
1283 			/* Wildcard search the index cache */
1284 			fts_cache_find_wildcard(query, index_cache, token);
1285 		} else {
1286 			nodes = fts_cache_find_word(index_cache, token);
1287 
1288 			for (i = 0; nodes && i < ib_vector_size(nodes)
1289 			     && query->error == DB_SUCCESS; ++i) {
1290 				const fts_node_t*	node;
1291 
1292 				node = static_cast<const fts_node_t*>(
1293 					ib_vector_get_const(nodes, i));
1294 
1295 				fts_query_check_node(query, token, node);
1296 			}
1297 		}
1298 
1299 		rw_lock_x_unlock(&cache->lock);
1300 
1301 		/* error is passed by 'query->error' */
1302 		if (query->error != DB_SUCCESS) {
1303 			ut_ad(query->error == DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
1304 			return(query->error);
1305 		}
1306 
1307 		/* Setup the callback args for filtering and
1308 		consolidating the ilist. */
1309 		fetch.read_arg = query;
1310 		fetch.read_record = fts_query_index_fetch_nodes;
1311 
1312 		error = fts_index_fetch_nodes(
1313 			trx, &graph, &query->fts_index_table, token, &fetch);
1314 
1315 		/* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */
1316 		ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS));
1317 		if (error != DB_SUCCESS) {
1318 			query->error = error;
1319 		}
1320 
1321 		fts_que_graph_free(graph);
1322 
1323 		if (query->error == DB_SUCCESS) {
1324 			/* Make the intersection (rb tree) the current doc id
1325 			set and free the old set. */
1326 			fts_query_free_doc_ids(query, query->doc_ids);
1327 			query->doc_ids = query->intersection;
1328 			query->intersection = NULL;
1329 
1330 			ut_a(!query->multi_exist || (query->multi_exist
1331 			     && rbt_size(query->doc_ids) <= n_doc_ids));
1332 		}
1333 	}
1334 
1335 	return(query->error);
1336 }
1337 
1338 /*****************************************************************//**
1339 Query index cache.
1340 @return DB_SUCCESS if all goes well */
1341 static
1342 dberr_t
1343 fts_query_cache(
1344 /*============*/
1345 	fts_query_t*		query,	/*!< in/out: query instance */
1346 	const fts_string_t*	token)	/*!< in: token to search */
1347 {
1348 	const fts_index_cache_t*index_cache;
1349 	dict_table_t*		table = query->index->table;
1350 	fts_cache_t*		cache = table->fts->cache;
1351 
1352 	/* Search the cache for a matching word first. */
1353 	rw_lock_x_lock(&cache->lock);
1354 
1355 	/* Search for the index specific cache. */
1356 	index_cache = fts_find_index_cache(cache, query->index);
1357 
1358 	/* Must find the index cache. */
1359 	ut_a(index_cache != NULL);
1360 
1361 	if (query->cur_node->term.wildcard
1362 	    && query->flags != FTS_PROXIMITY
1363 	    && query->flags != FTS_PHRASE) {
1364 		/* Wildcard search the index cache */
1365 		fts_cache_find_wildcard(query, index_cache, token);
1366 	} else {
1367 		const ib_vector_t*      nodes;
1368 		ulint			i;
1369 
1370 		nodes = fts_cache_find_word(index_cache, token);
1371 
1372 		for (i = 0; nodes && i < ib_vector_size(nodes)
1373 		     && query->error == DB_SUCCESS; ++i) {
1374 			const fts_node_t*	node;
1375 
1376 			node = static_cast<const fts_node_t*>(
1377 				ib_vector_get_const(nodes, i));
1378 
1379 			fts_query_check_node(query, token, node);
1380 		}
1381 	}
1382 
1383 	rw_lock_x_unlock(&cache->lock);
1384 
1385 	return(query->error);
1386 }
1387 
1388 /*****************************************************************//**
1389 Set union.
1390 @return DB_SUCCESS if all goes well */
1391 static MY_ATTRIBUTE((nonnull, warn_unused_result))
1392 dberr_t
1393 fts_query_union(
1394 /*============*/
1395 	fts_query_t*		query,	/*!< in: query instance */
1396 	fts_string_t*		token)	/*!< in: token to search */
1397 {
1398 	fts_fetch_t		fetch;
1399 	ulint			n_doc_ids = 0;
1400 	trx_t*			trx = query->trx;
1401 	que_t*			graph = NULL;
1402 	dberr_t			error;
1403 
1404 	ut_a(query->oper == FTS_NONE || query->oper == FTS_DECR_RATING ||
1405 	     query->oper == FTS_NEGATE || query->oper == FTS_INCR_RATING);
1406 
1407 #ifdef FTS_INTERNAL_DIAG_PRINT
1408 	{
1409 		ib::info	out;
1410 		out << "UNION: Searching: '";
1411 		out.write(token->f_str, token->f_len);
1412 		out << "'";
1413 	}
1414 #endif
1415 
1416 	if (query->doc_ids) {
1417 		n_doc_ids = rbt_size(query->doc_ids);
1418 	}
1419 
1420 	if (token->f_len == 0) {
1421 		return(query->error);
1422 	}
1423 
1424 	fts_query_cache(query, token);
1425 
1426 	/* Setup the callback args for filtering and
1427 	consolidating the ilist. */
1428 	fetch.read_arg = query;
1429 	fetch.read_record = fts_query_index_fetch_nodes;
1430 
1431 	/* Read the nodes from disk. */
1432 	error = fts_index_fetch_nodes(
1433 		trx, &graph, &query->fts_index_table, token, &fetch);
1434 
1435 	/* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */
1436 	ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS));
1437 	if (error != DB_SUCCESS) {
1438 		query->error = error;
1439 	}
1440 
1441 	fts_que_graph_free(graph);
1442 
1443 	if (query->error == DB_SUCCESS) {
1444 
1445 		/* The size can't decrease. */
1446 		ut_a(rbt_size(query->doc_ids) >= n_doc_ids);
1447 
1448 		/* Calculate the number of doc ids that were added to
1449 		the current doc id set. */
1450 		if (query->doc_ids) {
1451 			n_doc_ids = rbt_size(query->doc_ids) - n_doc_ids;
1452 		}
1453 	}
1454 
1455 	return(query->error);
1456 }
1457 
1458 /*****************************************************************//**
1459 Depending upon the current query operator, process the doc id.
1460 @return DB_SUCCESS if all goes well,
1461 or DB_FTS_EXCEED_RESULT_CACHE_LIMIT */
1462 static
1463 dberr_t
1464 fts_query_process_doc_id(
1465 /*=====================*/
1466 	fts_query_t*	query,		/*!< in: query instance */
1467 	doc_id_t	doc_id,		/*!< in: doc id to process */
1468 	fts_rank_t	rank)		/*!< in: if non-zero, it is the
1469 					rank associated with the doc_id */
1470 {
1471 	if (query->flags == FTS_OPT_RANKING) {
1472 		return(DB_SUCCESS);
1473 	}
1474 
1475 	switch (query->oper) {
1476 	case FTS_NONE:
1477 		fts_query_union_doc_id(query, doc_id, rank);
1478 		break;
1479 
1480 	case FTS_EXIST:
1481 		fts_query_intersect_doc_id(query, doc_id, rank);
1482 		break;
1483 
1484 	case FTS_IGNORE:
1485 		fts_query_remove_doc_id(query, doc_id);
1486 		break;
1487 
1488 	case FTS_NEGATE:
1489 		fts_query_change_ranking(query, doc_id, TRUE);
1490 		break;
1491 
1492 	case FTS_DECR_RATING:
1493 		fts_query_union_doc_id(query, doc_id, rank);
1494 		fts_query_change_ranking(query, doc_id, TRUE);
1495 		break;
1496 
1497 	case FTS_INCR_RATING:
1498 		fts_query_union_doc_id(query, doc_id, rank);
1499 		fts_query_change_ranking(query, doc_id, FALSE);
1500 		break;
1501 
1502 	default:
1503 		ut_error;
1504 	}
1505 
1506 	if (query->total_size > fts_result_cache_limit) {
1507 		return(DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
1508 	} else {
1509 		return(DB_SUCCESS);
1510 	}
1511 }
1512 
1513 /*****************************************************************//**
1514 Merge two result sets. */
1515 static
1516 dberr_t
1517 fts_merge_doc_ids(
1518 /*==============*/
1519 	fts_query_t*	query,		/*!< in,out: query instance */
1520 	const ib_rbt_t*	doc_ids)	/*!< in: result set to merge */
1521 {
1522 	const ib_rbt_node_t*	node;
1523 
1524 	DBUG_ENTER("fts_merge_doc_ids");
1525 
1526 	ut_a(!query->intersection);
1527 
1528 	/* To process FTS_EXIST operation (intersection), we need
1529 	to create a new result set for fts_query_intersect(). */
1530 	if (query->oper == FTS_EXIST) {
1531 
1532 		query->intersection = rbt_create(
1533 			sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
1534 
1535 		query->total_size += SIZEOF_RBT_CREATE;
1536 	}
1537 
1538 	/* Merge the elements to the result set. */
1539 	for (node = rbt_first(doc_ids); node; node = rbt_next(doc_ids, node)) {
1540 		fts_ranking_t*		ranking;
1541 		ulint			pos = 0;
1542 		fts_string_t		word;
1543 
1544 		ranking = rbt_value(fts_ranking_t, node);
1545 
1546 		query->error = fts_query_process_doc_id(
1547 				query, ranking->doc_id, ranking->rank);
1548 
1549 		if (query->error != DB_SUCCESS) {
1550 			DBUG_RETURN(query->error);
1551 		}
1552 
1553 		/* Merge words. Don't need to take operator into account. */
1554 		ut_a(ranking->words);
1555 		while (fts_ranking_words_get_next(query, ranking, &pos, &word)) {
1556 			fts_query_add_word_to_document(query, ranking->doc_id,
1557 						       &word);
1558 		}
1559 	}
1560 
1561 	/* If it is an intersection operation, reset query->doc_ids
1562 	to query->intersection and free the old result list. */
1563 	if (query->oper == FTS_EXIST && query->intersection != NULL) {
1564 		fts_query_free_doc_ids(query, query->doc_ids);
1565 		query->doc_ids = query->intersection;
1566 		query->intersection = NULL;
1567 	}
1568 
1569 	DBUG_RETURN(DB_SUCCESS);
1570 }
1571 
1572 /*****************************************************************//**
1573 Skip non-whitespace in a string. Move ptr to the next word boundary.
1574 @return pointer to the first whitespace or punctuation character, or end */
1575 UNIV_INLINE
1576 byte*
1577 fts_query_skip_word(
1578 /*================*/
1579 	byte*		ptr,		/*!< in: start of scan */
1580 	const byte*	end)		/*!< in: pointer to end of string */
1581 {
1582 	/* TODO: Does this have to be UTF-8 too ? */
1583 	while (ptr < end && !(ispunct(*ptr) || isspace(*ptr))) {
1584 		++ptr;
1585 	}
1586 
1587 	return(ptr);
1588 }
1589 
1590 /*****************************************************************//**
1591 Check whether the remaining terms in the phrase match the text.
1592 @return TRUE if matched else FALSE */
1593 static
1594 ibool
1595 fts_query_match_phrase_terms(
1596 /*=========================*/
1597 	fts_phrase_t*	phrase,		/*!< in: phrase to match */
1598 	byte**		start,		/*!< in/out: text to search, we can't
1599 					make this const because we need to
1600 					first convert the string to
1601 					lowercase */
1602 	const byte*	end,		/*!< in: pointer to the end of
1603 					the string to search */
1604 	mem_heap_t*	heap)		/*!< in: heap */
1605 {
1606 	ulint			i;
1607 	byte*			ptr = *start;
1608 	const ib_vector_t*	tokens = phrase->tokens;
1609 	ulint			distance = phrase->distance;
1610 
1611 	/* We check only from the second term onwards, since the first
1612 	must have matched otherwise we wouldn't be here. */
1613 	for (i = 1; ptr < end && i < ib_vector_size(tokens); /* No op */) {
1614 		fts_string_t		match;
1615 		fts_string_t		cmp_str;
1616 		const fts_string_t*	token;
1617 		int			result;
1618 		ulint			ret;
1619 
1620 		ret = innobase_mysql_fts_get_token(
1621 			phrase->charset, ptr,
1622 			const_cast<byte*>(end), &match);
1623 
1624 		if (match.f_len > 0) {
1625 			/* Get next token to match. */
1626 			token = static_cast<const fts_string_t*>(
1627 				ib_vector_get_const(tokens, i));
1628 
1629 			fts_string_dup(&cmp_str, &match, heap);
1630 
1631 			result = innobase_fts_text_case_cmp(
1632 				phrase->charset, token, &cmp_str);
1633 
1634 			/* Skip the rest of the tokens if this one doesn't
1635 			match and the proximity distance is exceeded. */
1636 			if (result
1637 			    && (distance == ULINT_UNDEFINED
1638 				|| distance == 0)) {
1639 
1640 				break;
1641 			}
1642 
1643 			/* This token matched; move to the next token. */
1644 			if (result == 0) {
1645 				/* Advance the text to search by the length
1646 				of the last token. */
1647 				ptr += ret;
1648 
1649 				/* Advance to the next token. */
1650 				++i;
1651 			} else {
1652 
1653 				ut_a(distance != ULINT_UNDEFINED);
1654 
1655 				ptr = fts_query_skip_word(ptr, end);
1656 			}
1657 
1658 			/* Distance can be 0 for exact matches. */
1659 			if (distance != ULINT_UNDEFINED && distance > 0) {
1660 				--distance;
1661 			}
1662 		} else {
1663 			ptr += ret;
1664 		}
1665 	}
1666 
1667 	*start = ptr;
1668 
1669 	/* Can't be greater than the number of elements. */
1670 	ut_a(i <= ib_vector_size(tokens));
1671 
1672 	/* This is the case for multiple words. */
1673 	if (i == ib_vector_size(tokens)) {
1674 		phrase->found = TRUE;
1675 	}
1676 
1677 	return(phrase->found);
1678 }
1679 
1680 /*****************************************************************//**
1681 Callback function to count the number of words in position ranges,
1682 and see whether the word count is within the specified "phrase->distance".
1683 @return true if the number of words does not exceed the "distance" */
1684 static
1685 bool
1686 fts_proximity_is_word_in_range(
1687 /*===========================*/
1688 	const fts_phrase_t*
1689 			phrase,		/*!< in: phrase with the search info */
1690 	byte*		start,		/*!< in: text to search */
1691 	ulint		total_len)	/*!< in: length of text */
1692 {
1693 	fts_proximity_t*	proximity_pos = phrase->proximity_pos;
1694 
1695 	ut_ad(proximity_pos->n_pos == proximity_pos->min_pos.size());
1696 	ut_ad(proximity_pos->n_pos == proximity_pos->max_pos.size());
1697 
1698 	/* Search each matched position pair (with min and max positions)
1699 	and count the number of words in the range */
1700 	for (ulint i = 0; i < proximity_pos->n_pos; i++) {
1701 		ulint		cur_pos = proximity_pos->min_pos[i];
1702 		ulint		n_word = 0;
1703 
1704 		ut_ad(proximity_pos->max_pos[i] <= total_len);
1705 
1706 		/* Walk through words in the range and count them */
1707 		while (cur_pos <= proximity_pos->max_pos[i]) {
1708 			ulint		len;
1709 			fts_string_t	str;
1710 
1711 			len = innobase_mysql_fts_get_token(
1712 				phrase->charset,
1713 				start + cur_pos,
1714 				start + total_len, &str);
1715 
1716 			if (len == 0) {
1717 				break;
1718 			}
1719 
1720 			/* Advances position with "len" bytes */
1721 			cur_pos += len;
1722 
1723 			/* Record the number of words */
1724 			if (str.f_n_char > 0) {
1725 				n_word++;
1726 			}
1727 
1728 			if (n_word > phrase->distance) {
1729 				break;
1730 			}
1731 		}
1732 
1733 		/* Check if the number of words is less than specified
1734 		"distance" */
1735 		if (n_word && n_word <= phrase->distance) {
1736 			return(true);
1737 		}
1738 	}
1739 
1740 	return(false);
1741 }
1742 
1743 /*****************************************************************//**
1744 FTS plugin parser 'mysql_add_word' callback function for phrase match
1745 Refer to 'st_mysql_ftparser_param' for more detail.
1746 @return 0 to continue matching, or non-zero to stop */
1747 static
1748 int
1749 fts_query_match_phrase_add_word_for_parser(
1750 /*=======================================*/
1751 	MYSQL_FTPARSER_PARAM*	param,		/*!< in: parser param */
1752 	const char*			word,		/*!< in: token */
1753 	int			word_len,	/*!< in: token length */
1754 	MYSQL_FTPARSER_BOOLEAN_INFO*)
1755 {
1756 	fts_phrase_param_t*	phrase_param;
1757 	fts_phrase_t*		phrase;
1758 	const ib_vector_t*	tokens;
1759 	fts_string_t		match;
1760 	fts_string_t		cmp_str;
1761 	const fts_string_t*	token;
1762 	int			result;
1763 	mem_heap_t*		heap;
1764 
1765 	phrase_param = static_cast<fts_phrase_param_t*>(param->mysql_ftparam);
1766 	heap = phrase_param->heap;
1767 	phrase = phrase_param->phrase;
1768 	tokens = phrase->tokens;
1769 
1770 	/* In case plugin parser doesn't check return value */
1771 	if (phrase_param->token_index == ib_vector_size(tokens)) {
1772 		return(1);
1773 	}
1774 
1775 	match.f_str = (uchar *)(word);
1776 	match.f_len = ulint(word_len);
1777 	match.f_n_char= fts_get_token_size(phrase->charset, word, match.f_len);
1778 
1779 	if (match.f_len > 0) {
1780 		/* Get next token to match. */
1781 		ut_a(phrase_param->token_index < ib_vector_size(tokens));
1782 		token = static_cast<const fts_string_t*>(
1783 			ib_vector_get_const(tokens, phrase_param->token_index));
1784 
1785 		fts_string_dup(&cmp_str, &match, heap);
1786 
1787 		result = innobase_fts_text_case_cmp(
1788 			phrase->charset, token, &cmp_str);
1789 
1790 		if (result == 0) {
1791 			phrase_param->token_index++;
1792 		} else {
1793 			return(1);
1794 		}
1795 	}
1796 
1797 	/* Can't be greater than the number of elements. */
1798 	ut_a(phrase_param->token_index <= ib_vector_size(tokens));
1799 
1800 	/* This is the case for multiple words. */
1801 	if (phrase_param->token_index == ib_vector_size(tokens)) {
1802 		phrase->found = TRUE;
1803 	}
1804 
1805 	return(static_cast<int>(phrase->found));
1806 }
1807 
1808 /*****************************************************************//**
1809 Check whether the terms in the phrase match the text.
1810 @return TRUE if matched else FALSE */
1811 static
1812 ibool
1813 fts_query_match_phrase_terms_by_parser(
1814 /*===================================*/
1815 	fts_phrase_param_t*	phrase_param,	/*!< in/out: phrase param */
1816 	st_mysql_ftparser*	parser,		/*!< in: plugin fts parser */
1817 	byte*			text,		/*!< in: text to check */
1818 	ulint			len)		/*!< in: text length */
1819 {
1820 	MYSQL_FTPARSER_PARAM	param;
1821 
1822 	ut_a(parser);
1823 
1824 	/* Set the parameters for the plugin parser. */
1825 	param.mysql_parse = fts_tokenize_document_internal;
1826 	param.mysql_add_word = fts_query_match_phrase_add_word_for_parser;
1827 	param.mysql_ftparam = phrase_param;
1828 	param.cs = phrase_param->phrase->charset;
1829 	param.doc = reinterpret_cast<char*>(text);
1830 	param.length = static_cast<int>(len);
1831 	param.mode= MYSQL_FTPARSER_WITH_STOPWORDS;
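	/* MYSQL_FTPARSER_WITH_STOPWORDS keeps stopwords in the token stream,
	so that the tokens reported by the plugin parser line up with the
	tokens of the original phrase. */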
1832 
1833 	PARSER_INIT(parser, &param);
1834 	parser->parse(&param);
1835 	PARSER_DEINIT(parser, &param);
1836 
1837 	return(phrase_param->phrase->found);
1838 }
1839 
1840 /*****************************************************************//**
1841 Callback function to fetch and search the document.
1842 @return TRUE if matched else FALSE */
1843 static
1844 ibool
1845 fts_query_match_phrase(
1846 /*===================*/
1847 	fts_phrase_t*	phrase,		/*!< in: phrase to match */
1848 	byte*		start,		/*!< in: text to search, we can't make
1849 					this const because we need to first
1850 					convert the string to lowercase */
1851 	ulint		cur_len,	/*!< in: length of text */
1852 	ulint		prev_len,	/*!< in: total length for searched
1853 					doc fields*/
1854 	mem_heap_t*	heap)		/*!< in: heap */
1855 {
1856 	ulint			i;
1857 	const fts_string_t*	first;
1858 	const byte*		end = start + cur_len;
1859 	const ib_vector_t*	tokens = phrase->tokens;
1860 	const ib_vector_t*	positions = phrase->match->positions;
1861 
1862 	ut_a(!phrase->found);
1863 	ut_a(phrase->match->doc_id > 0);
1864 	ut_a(ib_vector_size(tokens) > 0);
1865 	ut_a(ib_vector_size(positions) > 0);
1866 
1867 	first = static_cast<const fts_string_t*>(
1868 		ib_vector_get_const(tokens, 0));
1869 
1870 	ut_a(phrase->match->start < ib_vector_size(positions));
1871 
1872 	for (i = phrase->match->start; i < ib_vector_size(positions); ++i) {
1873 		ulint		pos;
1874 		byte*		ptr = start;
1875 
1876 		pos = *(ulint*) ib_vector_get_const(positions, i);
1877 
1878 		if (pos == ULINT_UNDEFINED) {
1879 			break;
1880 		}
1881 
1882 		if (pos < prev_len) {
1883 			continue;
1884 		}
1885 
1886 		/* Document positions are calculated from the beginning
1887 		of the first field, need to save the length for each
1888 		searched field to adjust the doc position when searching
1889 		phrases. */
1890 		pos -= prev_len;
1891 		ptr = start + pos;
1892 
1893 		/* Within limits ? */
1894 		if (ptr >= end) {
1895 			break;
1896 		}
1897 
1898 		if (phrase->parser) {
1899 			fts_phrase_param_t	phrase_param;
1900 
1901 			phrase_param.phrase = phrase;
1902 			phrase_param.token_index = 0;
1903 			phrase_param.heap = heap;
1904 
1905 			if (fts_query_match_phrase_terms_by_parser(
1906 				&phrase_param,
1907 				phrase->parser,
1908 				ptr,
1909 				ulint(end - ptr))) {
1910 				break;
1911 			}
1912 		} else {
1913 			fts_string_t	match;
1914 			fts_string_t	cmp_str;
1915 			ulint		ret;
1916 
1917 			match.f_str = ptr;
1918 			ret = innobase_mysql_fts_get_token(
1919 				phrase->charset, start + pos,
1920 				const_cast<byte*>(end), &match);
1921 
1922 			if (match.f_len == 0) {
1923 				break;
1924 			}
1925 
1926 			fts_string_dup(&cmp_str, &match, heap);
1927 
1928 			if (innobase_fts_text_case_cmp(
1929 				phrase->charset, first, &cmp_str) == 0) {
1930 
1931 				/* This is the case for the single word
1932 				in the phrase. */
1933 				if (ib_vector_size(phrase->tokens) == 1) {
1934 					phrase->found = TRUE;
1935 					break;
1936 				}
1937 
1938 				ptr += ret;
1939 
1940 				/* Match the remaining terms in the phrase. */
1941 				if (fts_query_match_phrase_terms(phrase, &ptr,
1942 								 end, heap)) {
1943 					break;
1944 				}
1945 			}
1946 		}
1947 	}
1948 
1949 	return(phrase->found);
1950 }
1951 
1952 /*****************************************************************//**
1953 Callback function to fetch and search the document.
1954 @return whether the phrase is found */
1955 static
1956 ibool
1957 fts_query_fetch_document(
1958 /*=====================*/
1959 	void*		row,		/*!< in:  sel_node_t* */
1960 	void*		user_arg)	/*!< in:  fts_doc_t* */
1961 {
1962 
1963 	que_node_t*	exp;
1964 	sel_node_t*	node = static_cast<sel_node_t*>(row);
1965 	fts_phrase_t*	phrase = static_cast<fts_phrase_t*>(user_arg);
1966 	ulint		prev_len = 0;
1967 	ulint		total_len = 0;
1968 	byte*		document_text = NULL;
1969 
1970 	exp = node->select_list;
1971 
1972 	phrase->found = FALSE;
1973 
1974 	/* For proximity search, we will need to get the whole document
1975 	from all fields, so first count the total length of the document
1976 	from all the fields */
1977 	if (phrase->proximity_pos) {
1978 		 while (exp) {
1979 			ulint		field_len;
1980 			dfield_t*	dfield = que_node_get_val(exp);
1981 			byte*		data = static_cast<byte*>(
1982 						dfield_get_data(dfield));
1983 
1984 			if (dfield_is_ext(dfield)) {
1985 				ulint	local_len = dfield_get_len(dfield);
1986 
1987 				local_len -= BTR_EXTERN_FIELD_REF_SIZE;
1988 
1989 				field_len = mach_read_from_4(
1990 					data + local_len + BTR_EXTERN_LEN + 4);
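				/* The BLOB length is stored as an 8-byte
				integer at offset BTR_EXTERN_LEN inside the
				externally stored field reference; reading
				the low 4 bytes is sufficient here. */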
1991 			} else {
1992 				field_len = dfield_get_len(dfield);
1993 			}
1994 
1995 			if (field_len != UNIV_SQL_NULL) {
1996 				total_len += field_len + 1;
1997 			}
1998 
1999 			exp = que_node_get_next(exp);
2000 		}
2001 
2002 		document_text = static_cast<byte*>(mem_heap_zalloc(
2003 					phrase->heap, total_len));
2004 
2005 		if (!document_text) {
2006 			return(FALSE);
2007 		}
2008 	}
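	/* For proximity search the field values are concatenated into
	document_text below; the "+ 1" added to each field length above
	leaves a zero byte between fields, since the buffer comes from
	mem_heap_zalloc(). */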
2009 
2010 	exp = node->select_list;
2011 
2012 	while (exp) {
2013 		dfield_t*	dfield = que_node_get_val(exp);
2014 		byte*		data = static_cast<byte*>(
2015 					dfield_get_data(dfield));
2016 		ulint		cur_len;
2017 
2018 		if (dfield_is_ext(dfield)) {
2019 			data = btr_copy_externally_stored_field(
2020 				&cur_len, data, phrase->zip_size,
2021 				dfield_get_len(dfield), phrase->heap);
2022 		} else {
2023 			cur_len = dfield_get_len(dfield);
2024 		}
2025 
2026 		if (cur_len != UNIV_SQL_NULL && cur_len != 0) {
2027 			if (phrase->proximity_pos) {
2028 				ut_ad(prev_len + cur_len <= total_len);
2029 				memcpy(document_text + prev_len, data, cur_len);
2030 			} else {
2031 				/* For phrase search */
2032 				phrase->found =
2033 					fts_query_match_phrase(
2034 						phrase,
2035 						static_cast<byte*>(data),
2036 						cur_len, prev_len,
2037 						phrase->heap);
2038 			}
2039 
2040 			/* Document positions are calculated from the beginning
2041 			of the first field, need to save the length for each
2042 			searched field to adjust the doc position when searching
2043 			phrases. */
2044 			prev_len += cur_len + 1;
2045 		}
2046 
2047 		if (phrase->found) {
2048 			break;
2049 		}
2050 
2051 		exp = que_node_get_next(exp);
2052 	}
2053 
2054 	if (phrase->proximity_pos) {
2055 		ut_ad(prev_len <= total_len);
2056 
2057 		phrase->found = fts_proximity_is_word_in_range(
2058 			phrase, document_text, total_len);
2059 	}
2060 
2061 	return(phrase->found);
2062 }
2063 
2064 #if 0
2065 /********************************************************************
2066 Callback function to check whether a record was found or not. */
2067 static
2068 ibool
2069 fts_query_select(
2070 /*=============*/
2071 	void*		row,		/*!< in:  sel_node_t* */
2072 	void*		user_arg)	/*!< in:  fts_doc_t* */
2073 {
2074 	int		i;
2075 	que_node_t*	exp;
2076 	sel_node_t*	node = row;
2077 	fts_select_t*	select = user_arg;
2078 
2079 	ut_a(select->word_freq);
2080 	ut_a(select->word_freq->doc_freqs);
2081 
2082 	exp = node->select_list;
2083 
2084 	for (i = 0; exp && !select->found; ++i) {
2085 		dfield_t*	dfield = que_node_get_val(exp);
2086 		void*		data = dfield_get_data(dfield);
2087 		ulint		len = dfield_get_len(dfield);
2088 
2089 		switch (i) {
2090 		case 0: /* DOC_COUNT */
2091 			if (len != UNIV_SQL_NULL && len != 0) {
2092 
2093 				select->word_freq->doc_count +=
2094 					mach_read_from_4(data);
2095 			}
2096 			break;
2097 
2098 		case 1: /* ILIST */
2099 			if (len != UNIV_SQL_NULL && len != 0) {
2100 
2101 				fts_query_find_doc_id(select, data, len);
2102 			}
2103 			break;
2104 
2105 		default:
2106 			ut_error;
2107 		}
2108 
2109 		exp = que_node_get_next(exp);
2110 	}
2111 
2112 	return(FALSE);
2113 }
2114 
2115 /********************************************************************
2116 Read the rows from the FTS index, that match word and where the
2117 doc id is between first and last doc id.
2118 @return DB_SUCCESS if all go well else error code */
2119 static MY_ATTRIBUTE((nonnull, warn_unused_result))
2120 dberr_t
2121 fts_query_find_term(
2122 /*================*/
2123 	fts_query_t*		query,	/*!< in: FTS query state */
2124 	que_t**			graph,	/*!< in: prepared statement */
2125 	const fts_string_t*	word,	/*!< in: the word to fetch */
2126 	doc_id_t		doc_id,	/*!< in: doc id to match */
2127 	ulint*			min_pos,/*!< in/out: pos found must be
2128 					 greater than this minimum value. */
2129 	ibool*			found)	/*!< out: TRUE if found else FALSE */
2130 {
2131 	pars_info_t*		info;
2132 	dberr_t			error;
2133 	fts_select_t		select;
2134 	doc_id_t		match_doc_id;
2135 	trx_t*			trx = query->trx;
2136 	char			table_name[MAX_FULL_NAME_LEN];
2137 
2138 	trx->op_info = "fetching FTS index matching nodes";
2139 
2140 	if (*graph) {
2141 		info = (*graph)->info;
2142 	} else {
2143 		ulint	selected;
2144 
2145 		info = pars_info_create();
2146 
2147 		selected = fts_select_index(*word->f_str);
2148 		query->fts_index_table.suffix = fts_get_suffix(selected);
2149 
2150 		fts_get_table_name(&query->fts_index_table, table_name);
2151 		pars_info_bind_id(info, "index_table_name", table_name);
2152 	}
2153 
2154 	select.found = FALSE;
2155 	select.doc_id = doc_id;
2156 	select.min_pos = *min_pos;
2157 	select.word_freq = fts_query_add_word_freq(query, word->f_str);
2158 
2159 	pars_info_bind_function(info, "my_func", fts_query_select, &select);
2160 	pars_info_bind_varchar_literal(info, "word", word->f_str, word->f_len);
2161 
2162 	/* Convert to "storage" byte order. */
2163 	fts_write_doc_id((byte*) &match_doc_id, doc_id);
2164 
2165 	fts_bind_doc_id(info, "min_doc_id", &match_doc_id);
2166 
2167 	fts_bind_doc_id(info, "max_doc_id", &match_doc_id);
2168 
2169 	if (!*graph) {
2170 
2171 		*graph = fts_parse_sql(
2172 			&query->fts_index_table,
2173 			info,
2174 			"DECLARE FUNCTION my_func;\n"
2175 			"DECLARE CURSOR c IS"
2176 			" SELECT doc_count, ilist\n"
2177 			" FROM $index_table_name\n"
2178 			" WHERE word LIKE :word AND"
2179 			" first_doc_id <= :min_doc_id AND"
2180 			" last_doc_id >= :max_doc_id\n"
2181 			" ORDER BY first_doc_id;\n"
2182 			"BEGIN\n"
2183 			"\n"
2184 			"OPEN c;\n"
2185 			"WHILE 1 = 1 LOOP\n"
2186 			"  FETCH c INTO my_func();\n"
2187 			"  IF c % NOTFOUND THEN\n"
2188 			"    EXIT;\n"
2189 			"  END IF;\n"
2190 			"END LOOP;\n"
2191 			"CLOSE c;");
2192 	}
2193 
2194 	for (;;) {
2195 		error = fts_eval_sql(trx, *graph);
2196 
2197 		if (error == DB_SUCCESS) {
2198 
2199 			break;				/* Exit the loop. */
2200 		} else {
2201 
2202 			if (error == DB_LOCK_WAIT_TIMEOUT) {
2203 				ib::warn() << "lock wait timeout reading FTS"
2204 					" index. Retrying!";
2205 
2206 				trx->error_state = DB_SUCCESS;
2207 			} else {
2208 				ib::error() << error
2209 					<< " while reading FTS index.";
2210 
2211 				break;			/* Exit the loop. */
2212 			}
2213 		}
2214 	}
2215 
2216 	/* Value to return */
2217 	*found = select.found;
2218 
2219 	if (*found) {
2220 		*min_pos = select.min_pos;
2221 	}
2222 
2223 	return(error);
2224 }
2225 
2226 /********************************************************************
2227 Callback aggregator for int columns. */
2228 static
2229 ibool
2230 fts_query_sum(
2231 /*==========*/
2232 					/*!< out: always returns TRUE */
2233 	void*		row,		/*!< in:  sel_node_t* */
2234 	void*		user_arg)	/*!< in:  ulint* */
2235 {
2236 
2237 	que_node_t*	exp;
2238 	sel_node_t*	node = row;
2239 	ulint*		total = user_arg;
2240 
2241 	exp = node->select_list;
2242 
2243 	while (exp) {
2244 		dfield_t*	dfield = que_node_get_val(exp);
2245 		void*		data = dfield_get_data(dfield);
2246 		ulint		len = dfield_get_len(dfield);
2247 
2248 		if (len != UNIV_SQL_NULL && len != 0) {
2249 			*total += mach_read_from_4(data);
2250 		}
2251 
2252 		exp = que_node_get_next(exp);
2253 	}
2254 
2255 	return(TRUE);
2256 }
2257 
2258 /********************************************************************
2259 Calculate the total documents that contain a particular word (term).
2260 @return DB_SUCCESS if all go well else error code */
2261 static MY_ATTRIBUTE((nonnull, warn_unused_result))
2262 dberr_t
2263 fts_query_total_docs_containing_term(
2264 /*=================================*/
2265 	fts_query_t*		query,	/*!< in: FTS query state */
2266 	const fts_string_t*	word,	/*!< in: the word to check */
2267 	ulint*			total)	/*!< out: documents containing word */
2268 {
2269 	pars_info_t*		info;
2270 	dberr_t			error;
2271 	que_t*			graph;
2272 	ulint			selected;
2273 	trx_t*			trx = query->trx;
2274 	char			table_name[MAX_FULL_NAME_LEN];
2275 
2276 	trx->op_info = "fetching FTS index document count";
2277 
2278 	*total = 0;
2279 
2280 	info = pars_info_create();
2281 
2282 	pars_info_bind_function(info, "my_func", fts_query_sum, total);
2283 	pars_info_bind_varchar_literal(info, "word", word->f_str, word->f_len);
2284 
2285 	selected = fts_select_index(*word->f_str);
2286 
2287 	query->fts_index_table.suffix = fts_get_suffix(selected);
2288 
2289 	fts_get_table_name(&query->fts_index_table, table_name);
2290 
2291 	pars_info_bind_id(info, "index_table_name", table_name);
2292 
2293 	graph = fts_parse_sql(
2294 		&query->fts_index_table,
2295 		info,
2296 		"DECLARE FUNCTION my_func;\n"
2297 		"DECLARE CURSOR c IS"
2298 		" SELECT doc_count\n"
2299 		" FROM $index_table_name\n"
2300 		" WHERE word = :word"
2301 		" ORDER BY first_doc_id;\n"
2302 		"BEGIN\n"
2303 		"\n"
2304 		"OPEN c;\n"
2305 		"WHILE 1 = 1 LOOP\n"
2306 		"  FETCH c INTO my_func();\n"
2307 		"  IF c % NOTFOUND THEN\n"
2308 		"    EXIT;\n"
2309 		"  END IF;\n"
2310 		"END LOOP;\n"
2311 		"CLOSE c;");
2312 
2313 	for (;;) {
2314 		error = fts_eval_sql(trx, graph);
2315 
2316 		if (error == DB_SUCCESS) {
2317 
2318 			break;				/* Exit the loop. */
2319 		} else {
2320 
2321 			if (error == DB_LOCK_WAIT_TIMEOUT) {
2322 				ib::warn() << "lock wait timeout reading FTS"
2323 					" index. Retrying!";
2324 
2325 				trx->error_state = DB_SUCCESS;
2326 			} else {
2327 				ib::error() << error
2328 					<< " while reading FTS index.";
2329 
2330 				break;			/* Exit the loop. */
2331 			}
2332 		}
2333 	}
2334 
2335 	fts_que_graph_free(graph);
2336 
2337 	return(error);
2338 }
2339 
2340 /********************************************************************
2341 Get the total number of words in a documents.
2342 @return DB_SUCCESS if all go well else error code */
2343 static MY_ATTRIBUTE((nonnull, warn_unused_result))
2344 dberr_t
2345 fts_query_terms_in_document(
2346 /*========================*/
2347 	fts_query_t*	query,		/*!< in: FTS query state */
2348 	doc_id_t	doc_id,		/*!< in: the word to check */
2349 	ulint*		total)		/*!< out: total words in document */
2350 {
2351 	pars_info_t*	info;
2352 	dberr_t		error;
2353 	que_t*		graph;
2354 	doc_id_t	read_doc_id;
2355 	trx_t*		trx = query->trx;
2356 	char		table_name[MAX_FULL_NAME_LEN];
2357 
2358 	trx->op_info = "fetching FTS document term count";
2359 
2360 	*total = 0;
2361 
2362 	info = pars_info_create();
2363 
2364 	pars_info_bind_function(info, "my_func", fts_query_sum, total);
2365 
2366 	/* Convert to "storage" byte order. */
2367 	fts_write_doc_id((byte*) &read_doc_id, doc_id);
2368 	fts_bind_doc_id(info, "doc_id", &read_doc_id);
2369 
2370 	query->fts_index_table.suffix = "DOC_ID";
2371 
2372 	fts_get_table_name(&query->fts_index_table, table_name);
2373 
2374 	pars_info_bind_id(info, "index_table_name", table_name);
2375 
2376 	graph = fts_parse_sql(
2377 		&query->fts_index_table,
2378 		info,
2379 		"DECLARE FUNCTION my_func;\n"
2380 		"DECLARE CURSOR c IS"
2381 		" SELECT count\n"
2382 		" FROM $index_table_name\n"
2383 		" WHERE doc_id = :doc_id;\n"
2384 		"BEGIN\n"
2385 		"\n"
2386 		"OPEN c;\n"
2387 		"WHILE 1 = 1 LOOP\n"
2388 		"  FETCH c INTO my_func();\n"
2389 		"  IF c % NOTFOUND THEN\n"
2390 		"    EXIT;\n"
2391 		"  END IF;\n"
2392 		"END LOOP;\n"
2393 		"CLOSE c;");
2394 
2395 	for (;;) {
2396 		error = fts_eval_sql(trx, graph);
2397 
2398 		if (error == DB_SUCCESS) {
2399 
2400 			break;				/* Exit the loop. */
2401 		} else {
2402 
2403 			if (error == DB_LOCK_WAIT_TIMEOUT) {
2404 				ib::warn() << "lock wait timeout reading FTS"
2405 					" doc id table. Retrying!";
2406 
2407 				trx->error_state = DB_SUCCESS;
2408 			} else {
2409 				ib::error() << error << " while reading FTS"
2410 					" doc id table.";
2411 
2412 				break;			/* Exit the loop. */
2413 			}
2414 		}
2415 	}
2416 
2417 	fts_que_graph_free(graph);
2418 
2419 	return(error);
2420 }
2421 #endif
2422 
2423 /*****************************************************************//**
2424 Retrieve the document and match the phrase tokens.
2425 @return DB_SUCCESS or error code */
2426 MY_ATTRIBUTE((nonnull(1,2,3,6), warn_unused_result))
2427 static
2428 dberr_t
2429 fts_query_match_document(
2430 /*=====================*/
2431 	ib_vector_t*	tokens,		/*!< in: phrase tokens */
2432 	fts_get_doc_t*	get_doc,	/*!< in: table and prepared statements */
2433 	fts_match_t*	match,		/*!< in: doc id and positions */
2434 	ulint		distance,	/*!< in: proximity distance */
2435 	st_mysql_ftparser* parser,	/*!< in: fts plugin parser */
2436 	ibool*		found)		/*!< out: TRUE if phrase found */
2437 {
2438 	dberr_t		error;
2439 	fts_phrase_t	phrase(get_doc->index_cache->index->table);
2440 
2441 	phrase.match = match;		/* Positions to match */
2442 	phrase.tokens = tokens;		/* Tokens to match */
2443 	phrase.distance = distance;
2444 	phrase.charset = get_doc->index_cache->charset;
2445 	phrase.heap = mem_heap_create(512);
2446 	phrase.parser = parser;
2447 
2448 	*found = phrase.found = FALSE;
2449 
2450 	error = fts_doc_fetch_by_doc_id(
2451 		get_doc, match->doc_id, NULL, FTS_FETCH_DOC_BY_ID_EQUAL,
2452 		fts_query_fetch_document, &phrase);
2453 
2454 	if (UNIV_UNLIKELY(error != DB_SUCCESS)) {
2455 		ib::error() << "(" << error << ") matching document.";
2456 	} else {
2457 		*found = phrase.found;
2458 	}
2459 
2460 	mem_heap_free(phrase.heap);
2461 
2462 	return(error);
2463 }
2464 
2465 /*****************************************************************//**
2466 This function fetches the original document and counts the words
2467 between the matching words to check they are in the specified distance
2468 @return true if the words are within the specified distance */
2469 static MY_ATTRIBUTE((nonnull, warn_unused_result))
2470 bool
2471 fts_query_is_in_proximity_range(
2472 /*============================*/
2473 	const fts_query_t*	query,		/*!< in:  query instance */
2474 	fts_match_t**		match,		/*!< in: matching doc ids and positions */
2475 	fts_proximity_t*	qualified_pos)	/*!< in: position info for
2476 						qualified ranges */
2477 {
2478 	fts_get_doc_t	get_doc;
2479 	fts_cache_t*	cache = query->index->table->fts->cache;
2480 	dberr_t		err;
2481 
2482 	memset(&get_doc, 0x0, sizeof(get_doc));
2483 
2484 	rw_lock_x_lock(&cache->lock);
2485 	get_doc.index_cache = fts_find_index_cache(cache, query->index);
2486 	rw_lock_x_unlock(&cache->lock);
2487 	ut_a(get_doc.index_cache != NULL);
2488 
2489 	fts_phrase_t	phrase(get_doc.index_cache->index->table);
2490 
2491 	phrase.distance = query->distance;
2492 	phrase.charset = get_doc.index_cache->charset;
2493 	phrase.heap = mem_heap_create(512);
2494 	phrase.proximity_pos = qualified_pos;
2495 	phrase.found = FALSE;
2496 
2497 	err = fts_doc_fetch_by_doc_id(
2498 		&get_doc, match[0]->doc_id, NULL, FTS_FETCH_DOC_BY_ID_EQUAL,
2499 		fts_query_fetch_document, &phrase);
2500 
2501 	if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
2502 		ib::error() << "(" << err << ") in verification"
2503 			" phase of proximity search";
2504 	}
2505 
2506 	/* Free the prepared statement. */
2507 	if (get_doc.get_document_graph) {
2508 		fts_que_graph_free(get_doc.get_document_graph);
2509 		get_doc.get_document_graph = NULL;
2510 	}
2511 
2512 	mem_heap_free(phrase.heap);
2513 
2514 	return(err == DB_SUCCESS && phrase.found);
2515 }
2516 
2517 /*****************************************************************//**
2518 Iterate over the matched document ids and search for the
2519 actual phrase in the text.
2520 @return DB_SUCCESS if all OK */
2521 static MY_ATTRIBUTE((nonnull, warn_unused_result))
2522 dberr_t
2523 fts_query_search_phrase(
2524 /*====================*/
2525 	fts_query_t*		query,		/*!< in: query instance */
2526 	ib_vector_t*		orig_tokens,	/*!< in: tokens to search,
2527 						with any stopwords in the
2528 						original phrase */
2529 	ib_vector_t*		tokens)		/*!< in: tokens that do
2530 						not include stopwords and
2531 						can be used to calculate
2532 						ranking */
2533 {
2534 	ulint			i;
2535 	fts_get_doc_t		get_doc;
2536 	ulint			n_matched;
2537 	fts_cache_t*		cache = query->index->table->fts->cache;
2538 
2539 	n_matched = ib_vector_size(query->matched);
2540 
2541 	/* Setup the doc retrieval infrastructure. */
2542 	memset(&get_doc, 0x0, sizeof(get_doc));
2543 
2544 	rw_lock_x_lock(&cache->lock);
2545 
2546 	get_doc.index_cache = fts_find_index_cache(cache, query->index);
2547 
2548 	/* Must find the index cache */
2549 	ut_a(get_doc.index_cache != NULL);
2550 
2551 	rw_lock_x_unlock(&cache->lock);
2552 
2553 #ifdef FTS_INTERNAL_DIAG_PRINT
2554 	ib::info() << "Start phrase search";
2555 #endif
2556 
2557 	/* Read the document from disk and do the actual
2558 	match, matching documents will be added to the current
2559 	doc id set. */
2560 	for (i = 0; i < n_matched && query->error == DB_SUCCESS; ++i) {
2561 		fts_match_t*	match;
2562 		ibool		found = FALSE;
2563 
2564 		match = static_cast<fts_match_t*>(
2565 			ib_vector_get(query->matched, i));
2566 
2567 		/* Skip the document ids that were filtered out by
2568 		an earlier pass. */
2569 		if (match->doc_id != 0) {
2570 
2571 			query->error = fts_query_match_document(
2572 				orig_tokens, &get_doc, match,
2573 				query->distance, query->parser, &found);
2574 
2575 			if (query->error == DB_SUCCESS && found) {
2576 				ulint	z;
2577 
2578 				query->error = fts_query_process_doc_id(query,
2579 							 match->doc_id, 0);
2580 				if (query->error != DB_SUCCESS) {
2581 					goto func_exit;
2582 				}
2583 
2584 				for (z = 0; z < ib_vector_size(tokens); z++) {
2585 					fts_string_t*   token;
2586 					token = static_cast<fts_string_t*>(
2587 						ib_vector_get(tokens, z));
2588 					fts_query_add_word_to_document(
2589 						query, match->doc_id, token);
2590 				}
2591 			}
2592 		}
2593 	}
2594 
2595 func_exit:
2596 	/* Free the prepared statement. */
2597 	if (get_doc.get_document_graph) {
2598 		fts_que_graph_free(get_doc.get_document_graph);
2599 		get_doc.get_document_graph = NULL;
2600 	}
2601 
2602 	return(query->error);
2603 }
2604 
2605 /** Split the phrase into tokens
2606 @param[in,out]	query		query instance
2607 @param[in]	node		query node to search
2608 @param[in,out]	tokens		token vector
2609 @param[in,out]	orig_tokens	original node tokens include stopword
2610 @param[in,out]	heap	mem heap */
2611 static
2612 void
2613 fts_query_phrase_split(
2614 	fts_query_t*		query,
2615 	const fts_ast_node_t*	node,
2616 	ib_vector_t*		tokens,
2617 	ib_vector_t*		orig_tokens,
2618 	mem_heap_t*		heap)
2619 {
2620 	fts_string_t		phrase;
2621 	ulint			len = 0;
2622 	ulint			cur_pos = 0;
2623 	fts_ast_node_t*		term_node = NULL;
2624 
2625 	if (node->type == FTS_AST_TEXT) {
2626 		phrase.f_str = node->text.ptr->str;
2627 		phrase.f_len = node->text.ptr->len;
2628 		len = phrase.f_len;
2629 	} else {
2630 		ut_ad(node->type == FTS_AST_PARSER_PHRASE_LIST);
2631 		phrase.f_str = NULL;
2632 		phrase.f_len = 0;
2633 		term_node = node->list.head;
2634 	}
2635 
2636 	while (true) {
2637 		fts_cache_t*	cache = query->index->table->fts->cache;
2638 		ulint		cur_len;
2639 		fts_string_t	result_str;
2640 
2641 		if (node->type == FTS_AST_TEXT) {
2642 			if (cur_pos >= len) {
2643 				break;
2644 			}
2645 
2646 			cur_len = innobase_mysql_fts_get_token(
2647 				query->fts_index_table.charset,
2648 				reinterpret_cast<const byte*>(phrase.f_str)
2649 				+ cur_pos,
2650 				reinterpret_cast<const byte*>(phrase.f_str)
2651 				+ len,
2652 				&result_str);
2653 
2654 			if (cur_len == 0) {
2655 				break;
2656 			}
2657 
2658 			cur_pos += cur_len;
2659 		} else {
2660 			ut_ad(node->type == FTS_AST_PARSER_PHRASE_LIST);
2661 			/* Term node in parser phrase list */
2662 			if (term_node == NULL) {
2663 				break;
2664 			}
2665 
2666 			ut_a(term_node->type == FTS_AST_TERM);
2667 			result_str.f_str = term_node->term.ptr->str;
2668 			result_str.f_len = term_node->term.ptr->len;
2669 			result_str.f_n_char = fts_get_token_size(
2670 				query->fts_index_table.charset,
2671 				reinterpret_cast<char*>(result_str.f_str),
2672 				result_str.f_len);
2673 
2674 			term_node = term_node->next;
2675 		}
2676 
2677 		if (result_str.f_n_char == 0) {
2678 			continue;
2679 		}
2680 
2681 		fts_string_t*	token = static_cast<fts_string_t*>(
2682 			ib_vector_push(tokens, NULL));
2683 		fts_string_dup(token, &result_str, heap);
2684 
2685 		if (fts_check_token(
2686 			   &result_str,
2687 			   cache->stopword_info.cached_stopword,
2688 			   query->fts_index_table.charset)) {
2689 			/* Add the word to the RB tree so that we can
2690 			calculate its frequency within a document. */
2691 			fts_query_add_word_freq(query, token);
2692 		} else {
2693 			ib_vector_pop(tokens);
2694 		}
2695 
2696 		/* we will start to store all words including stopwords
2697 		in the "orig_tokens" vector, but skip any leading words
2698 		that are stopwords */
2699 		if (!ib_vector_is_empty(tokens)) {
2700 			fts_string_t*	orig_token = static_cast<fts_string_t*>(
2701 				ib_vector_push(orig_tokens, NULL));
2702 
2703 			orig_token->f_str = token->f_str;
2704 			orig_token->f_len = token->f_len;
2705 		}
2706 	}
2707 }
2708 
2709 /*****************************************************************//**
2710 Text/Phrase search.
2711 @return DB_SUCCESS or error code */
2712 static MY_ATTRIBUTE((warn_unused_result))
2713 dberr_t
2714 fts_query_phrase_search(
2715 /*====================*/
2716 	fts_query_t*		query,	/*!< in: query instance */
2717 	const fts_ast_node_t*	node)	/*!< in: node to search */
2718 {
2719 	ib_vector_t*		tokens;
2720 	ib_vector_t*		orig_tokens;
2721 	mem_heap_t*		heap = mem_heap_create(sizeof(fts_string_t));
2722 	ib_alloc_t*		heap_alloc;
2723 	ulint			num_token;
2724 
2725 	heap_alloc = ib_heap_allocator_create(heap);
2726 
2727 	tokens = ib_vector_create(heap_alloc, sizeof(fts_string_t), 4);
2728 	orig_tokens = ib_vector_create(heap_alloc, sizeof(fts_string_t), 4);
2729 
2730 	if (query->distance != ULINT_UNDEFINED && query->distance > 0) {
2731 		query->flags = FTS_PROXIMITY;
2732 	} else {
2733 		query->flags = FTS_PHRASE;
2734 	}
2735 
2736 	/* Split the phrase into tokens. */
2737 	fts_query_phrase_split(query, node, tokens, orig_tokens, heap);
2738 
2739 	num_token = ib_vector_size(tokens);
2740 	if (num_token > MAX_PROXIMITY_ITEM) {
2741 		query->error = DB_FTS_TOO_MANY_WORDS_IN_PHRASE;
2742 		goto func_exit;
2743 	}
2744 
2745 	ut_ad(ib_vector_size(orig_tokens) >= num_token);
2746 
2747 	/* Ignore empty strings. */
2748 	if (num_token > 0) {
2749 		fts_string_t*	token = NULL;
2750 		fts_fetch_t	fetch;
2751 		trx_t*		trx = query->trx;
2752 		fts_ast_oper_t	oper = query->oper;
2753 		que_t*		graph = NULL;
2754 		ulint		i;
2755 		dberr_t		error;
2756 
2757 		/* Create the vector for storing matching document ids
2758 		and the positions of the first token of the phrase. */
2759 		if (!query->matched) {
2760 			ib_alloc_t*	heap_alloc;
2761 
2762 			heap_alloc = ib_heap_allocator_create(heap);
2763 
2764 			if (!(query->flags & FTS_PROXIMITY)
2765 			    && !(query->flags & FTS_PHRASE)) {
2766 				query->matched = ib_vector_create(
2767 					heap_alloc, sizeof(fts_match_t),
2768 					64);
2769 			} else {
2770 				ut_a(num_token <= MAX_PROXIMITY_ITEM);
2771 				query->match_array =
2772 					(ib_vector_t**) mem_heap_alloc(
2773 						heap,
2774 						num_token *
2775 						sizeof(query->matched));
2776 
2777 				for (i = 0; i < num_token; i++) {
2778 					query->match_array[i] =
2779 					ib_vector_create(
2780 						heap_alloc, sizeof(fts_match_t),
2781 						64);
2782 				}
2783 
2784 				query->matched = query->match_array[0];
2785 			}
2786 		}
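		/* For phrase and proximity searches one match vector is kept
		per token (match_array[i]); fts_phrase_or_proximity_search()
		later cross-checks token positions across these vectors. */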
2787 
2788 		/* Setup the callback args for filtering and consolidating
2789 		the ilist. */
2790 		fetch.read_arg = query;
2791 		fetch.read_record = fts_query_index_fetch_nodes;
2792 
2793 		for (i = 0; i < num_token; i++) {
2794 			/* Search for the first word from the phrase. */
2795 			token = static_cast<fts_string_t*>(
2796 				ib_vector_get(tokens, i));
2797 
2798 			if (query->flags & FTS_PROXIMITY
2799 			    || query->flags & FTS_PHRASE) {
2800 				query->matched = query->match_array[i];
2801 			}
2802 
2803 			error = fts_index_fetch_nodes(
2804 				trx, &graph, &query->fts_index_table,
2805 				token, &fetch);
2806 
2807 			/* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */
2808 			ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS));
2809 			if (error != DB_SUCCESS) {
2810 				query->error = error;
2811 			}
2812 
2813 			fts_que_graph_free(graph);
2814 			graph = NULL;
2815 
2816 			fts_query_cache(query, token);
2817 
2818 			if (!(query->flags & FTS_PHRASE)
2819 			    && !(query->flags & FTS_PROXIMITY)) {
2820 				break;
2821 			}
2822 
2823 			/* If any of the tokens cannot be found,
2824 			there is no need to continue the match. */
2825 			if (ib_vector_is_empty(query->match_array[i])
2826 			    || query->error != DB_SUCCESS) {
2827 				goto func_exit;
2828 			}
2829 		}
2830 
2831 		/* Just a single word, no need to fetch the original
2832 		documents to do phrase matching */
2833 		if (ib_vector_size(orig_tokens) == 1
2834 		    && !ib_vector_is_empty(query->match_array[0])) {
2835 			fts_match_t*    match;
2836 			ulint		n_matched;
2837 
2838 			n_matched = ib_vector_size(query->match_array[0]);
2839 
2840 			for (i = 0; i < n_matched; i++) {
2841 				match = static_cast<fts_match_t*>(
2842 					ib_vector_get(
2843 						query->match_array[0], i));
2844 
2845 				query->error = fts_query_process_doc_id(
2846 						query, match->doc_id, 0);
2847 				if (query->error != DB_SUCCESS) {
2848 					goto func_exit;
2849 				}
2850 
2851 				fts_query_add_word_to_document(
2852 					query, match->doc_id, token);
2853 			}
2854 			query->oper = oper;
2855 			goto func_exit;
2856 		}
2857 
2858 		/* If we are doing a proximity search, verify the distance
2859 		between all words, and check that they are within the specified distance. */
2860 		if (query->flags & FTS_PROXIMITY) {
2861 			fts_phrase_or_proximity_search(query, tokens);
2862 		} else {
2863 			ibool	matched;
2864 
2865 			/* Phrase Search case:
2866 			We filter out the doc ids that don't contain
2867 			all the tokens in the phrase. It's cheaper to
2868 			search the ilist than bringing the documents in
2869 			and then doing a search through the text. Isolated
2870 			testing shows this also helps in mitigating disruption
2871 			of the buffer cache. */
2872 			matched = fts_phrase_or_proximity_search(query, tokens);
2873 			query->matched = query->match_array[0];
2874 
2875 			/* Read the actual text in and search for the phrase. */
2876 			if (matched) {
2877 				ut_ad(query->error == DB_SUCCESS);
2878 				query->error = fts_query_search_phrase(
2879 					query, orig_tokens, tokens);
2880 			}
2881 		}
2882 
2883 		/* Restore original operation. */
2884 		query->oper = oper;
2885 
2886 		if (query->error != DB_SUCCESS) {
2887 			goto func_exit;
2888 		}
2889 	}
2890 
2891 func_exit:
2892 	mem_heap_free(heap);
2893 
2894 	/* Don't need it anymore. */
2895 	query->matched = NULL;
2896 
2897 	return(query->error);
2898 }
2899 
2900 /*****************************************************************//**
2901 Find the word and evaluate.
2902 @return DB_SUCCESS if all go well */
2903 static MY_ATTRIBUTE((nonnull, warn_unused_result))
2904 dberr_t
2905 fts_query_execute(
2906 /*==============*/
2907 	fts_query_t*		query,	/*!< in: query instance */
2908 	fts_string_t*		token)	/*!< in: token to search */
2909 {
2910 	switch (query->oper) {
2911 	case FTS_NONE:
2912 	case FTS_NEGATE:
2913 	case FTS_INCR_RATING:
2914 	case FTS_DECR_RATING:
2915 		query->error = fts_query_union(query, token);
2916 		break;
2917 
2918 	case FTS_EXIST:
2919 		query->error = fts_query_intersect(query, token);
2920 		break;
2921 
2922 	case FTS_IGNORE:
2923 		query->error = fts_query_difference(query, token);
2924 		break;
2925 
2926 	default:
2927 		ut_error;
2928 	}
2929 
2930 	return(query->error);
2931 }
2932 
2933 /*****************************************************************//**
2934 Create a wildcard string. It's the responsibility of the caller to
2935 free the byte* pointer. It's allocated using ut_malloc_nokey().
2936 @return pointer to allocated memory, or NULL if no wildcard copy was made */
2937 static
2938 byte*
2939 fts_query_get_token(
2940 /*================*/
2941 	fts_ast_node_t*	node,		/*!< in: the current sub tree */
2942 	fts_string_t*	token)		/*!< in: token to create */
2943 {
2944 	ulint		str_len;
2945 	byte*		new_ptr = NULL;
2946 
2947 	str_len = node->term.ptr->len;
2948 
2949 	ut_a(node->type == FTS_AST_TERM);
2950 
2951 	token->f_len = str_len;
2952 	token->f_str = node->term.ptr->str;
2953 
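	/* For a wildcard term, a copy with a trailing '%' is built so that
	the FTS index can be scanned with WHERE word LIKE :word; the caller
	must ut_free() the returned pointer. */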
2954 	if (node->term.wildcard) {
2955 
2956 		token->f_str = static_cast<byte*>(ut_malloc_nokey(str_len + 2));
2957 		token->f_len = str_len + 1;
2958 
2959 		memcpy(token->f_str, node->term.ptr->str, str_len);
2960 
2961 		token->f_str[str_len] = '%';
2962 		token->f_str[token->f_len] = 0;
2963 
2964 		new_ptr = token->f_str;
2965 	}
2966 
2967 	return(new_ptr);
2968 }
2969 
2970 static dberr_t fts_ast_visit_sub_exp(fts_ast_node_t*, fts_ast_callback, void*);
2971 
2972 /*****************************************************************//**
2973 Visit every node of the AST. */
2974 static
2975 dberr_t
2976 fts_query_visitor(
2977 /*==============*/
2978 	fts_ast_oper_t	oper,		/*!< in: current operator */
2979 	fts_ast_node_t*	node,		/*!< in: The root of the current subtree*/
2980 	void*		arg)		/*!< in: callback arg*/
2981 {
2982 	byte*		ptr;
2983 	fts_string_t	token;
2984 	fts_query_t*	query = static_cast<fts_query_t*>(arg);
2985 
2986 	ut_a(node);
2987 	DBUG_ENTER("fts_query_visitor");
2988 	DBUG_PRINT("fts", ("nodetype: %s", fts_ast_node_type_get(node->type)));
2989 
2990 	token.f_n_char = 0;
2991 	query->oper = oper;
2992 	query->cur_node = node;
2993 
2994 	switch (node->type) {
2995 	case FTS_AST_TEXT:
2996 	case FTS_AST_PARSER_PHRASE_LIST:
2997 
2998 		if (query->oper == FTS_EXIST) {
2999 			ut_ad(query->intersection == NULL);
3000 			query->intersection = rbt_create(
3001 				sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
3002 
3003 			query->total_size += SIZEOF_RBT_CREATE;
3004 		}
3005 
3006 		/* Set the current proximity distance. */
3007 		query->distance = node->text.distance;
3008 
3009 		/* Force collection of doc ids and the positions. */
3010 		query->collect_positions = TRUE;
3011 
3012 		query->error = fts_query_phrase_search(query, node);
3013 
3014 		query->collect_positions = FALSE;
3015 
3016 		if (query->oper == FTS_EXIST) {
3017 			fts_query_free_doc_ids(query, query->doc_ids);
3018 			query->doc_ids = query->intersection;
3019 			query->intersection = NULL;
3020 		}
3021 
3022 		break;
3023 
3024 	case FTS_AST_TERM:
3025 		token.f_str = node->term.ptr->str;
3026 		token.f_len = node->term.ptr->len;
3027 
3028 		/* Collect wildcard words for QUERY EXPANSION. */
3029 		if (node->term.wildcard && query->wildcard_words != NULL) {
3030 			ib_rbt_bound_t          parent;
3031 
3032 			if (rbt_search(query->wildcard_words, &parent, &token)
3033 			     != 0) {
3034 				fts_string_t	word;
3035 
3036 				fts_string_dup(&word, &token, query->heap);
3037 				rbt_add_node(query->wildcard_words, &parent,
3038 					     &word);
3039 			}
3040 		}
3041 
3042 		/* Add the word to our RB tree that will be used to
3043 		calculate this term's per-document frequency. */
3044 		fts_query_add_word_freq(query, &token);
3045 
3046 		ptr = fts_query_get_token(node, &token);
3047 		query->error = fts_query_execute(query, &token);
3048 
3049 		if (ptr) {
3050 			ut_free(ptr);
3051 		}
3052 
3053 		break;
3054 
3055 	case FTS_AST_SUBEXP_LIST:
3056 		query->error = fts_ast_visit_sub_exp(node, fts_query_visitor, arg);
3057 		break;
3058 
3059 	default:
3060 		ut_error;
3061 	}
3062 
3063 	if (query->oper == FTS_EXIST) {
3064 		query->multi_exist = true;
3065 	}
3066 
3067 	DBUG_RETURN(query->error);
3068 }
3069 
3070 /** Process (nested) sub-expression, create a new result set to store the
3071 sub-expression result by processing nodes under current sub-expression
3072 list. Merge the sub-expression result with that of parent expression list.
3073 @param[in,out]	node	current root node
3074 @param[in,out]	visitor	callback function
3075 @param[in,out]	arg	argument for callback
3076 @return DB_SUCCESS if all go well */
3077 static
3078 dberr_t
3079 fts_ast_visit_sub_exp(
3080 	fts_ast_node_t*		node,
3081 	fts_ast_callback	visitor,
3082 	void*			arg)
3083 {
3084 	fts_ast_oper_t		cur_oper;
3085 	fts_query_t*		query = static_cast<fts_query_t*>(arg);
3086 	ib_rbt_t*		parent_doc_ids;
3087 	ib_rbt_t*		subexpr_doc_ids;
3088 	dberr_t			error = DB_SUCCESS;
3089 	bool			will_be_ignored = false;
3090 	bool			multi_exist;
3091 
3092 	DBUG_ENTER("fts_ast_visit_sub_exp");
3093 
3094 	ut_a(node->type == FTS_AST_SUBEXP_LIST);
3095 
3096 	/* To avoid stack overflow, we limit the mutual recursion
3097 	depth between fts_ast_visit(), fts_query_visitor() and
3098 	fts_ast_visit_sub_exp(). */
3099 	if (query->visiting_sub_exp++ > 31) {
3100 		query->error = DB_OUT_OF_MEMORY;
3101 		DBUG_RETURN(query->error);
3102 	}
3103 
3104 	cur_oper = query->oper;
3105 
3106 	/* Save current result set */
3107 	parent_doc_ids = query->doc_ids;
3108 
3109 	/* Create new result set to store the sub-expression result. We
3110 	will merge this result set with the parent after processing. */
3111 	query->doc_ids = rbt_create(sizeof(fts_ranking_t),
3112 				    fts_ranking_doc_id_cmp);
3113 
3114 	query->total_size += SIZEOF_RBT_CREATE;
3115 
3116 	multi_exist = query->multi_exist;
3117 	query->multi_exist = false;
3118 	/* Process nodes in current sub-expression and store its
3119 	result set in query->doc_ids we created above. */
3120 	error = fts_ast_visit(FTS_NONE, node, visitor,
3121 			      arg, &will_be_ignored);
3122 
3123 	/* Reinstate parent node state */
3124 	query->multi_exist = multi_exist;
3125 	query->oper = cur_oper;
3126 	query->visiting_sub_exp--;
3127 
3128 	/* Merge the sub-expression result with the parent result set. */
3129 	subexpr_doc_ids = query->doc_ids;
3130 	query->doc_ids = parent_doc_ids;
3131 	if (error == DB_SUCCESS) {
3132 		error = fts_merge_doc_ids(query, subexpr_doc_ids);
3133 	}
3134 
3135 	/* Free current result set. Result already merged into parent. */
3136 	fts_query_free_doc_ids(query, subexpr_doc_ids);
3137 
3138 	DBUG_RETURN(error);
3139 }
3140 
3141 #if 0
3142 /*****************************************************************//**
3143 Check if the doc id exists in the ilist.
3144 @return TRUE if doc id found */
3145 static
3146 ulint
3147 fts_query_find_doc_id(
3148 /*==================*/
3149 	fts_select_t*	select,		/*!< in/out: contains the doc id to
3150 					find, we update the word freq if
3151 					document found */
3152 	void*		data,		/*!< in: doc id ilist */
3153 	ulint		len)		/*!< in: doc id ilist size */
3154 {
3155 	byte*		ptr = data;
3156 	doc_id_t	doc_id = 0;
3157 	ulint		decoded = 0;
3158 
3159 	/* Decode the ilist and search for selected doc_id. We also
3160 	calculate the frequency of the word in the document if found. */
3161 	while (decoded < len && !select->found) {
3162 		ulint		freq = 0;
3163 		ulint		min_pos = 0;
3164 		ulint		last_pos = 0;
3165 		ulint		pos = fts_decode_vlc(&ptr);
3166 
3167 		/* Add the delta. */
3168 		doc_id += pos;
3169 
3170 		while (*ptr) {
3171 			++freq;
3172 			last_pos += fts_decode_vlc(&ptr);
3173 
3174 			/* Only if min_pos is not set and the current
3175 			term exists in a position greater than the
3176 			min_pos of the previous term. */
3177 			if (min_pos == 0 && last_pos > select->min_pos) {
3178 				min_pos = last_pos;
3179 			}
3180 		}
3181 
3182 		/* Skip the end of word position marker. */
3183 		++ptr;
3184 
3185 		/* Bytes decoded so far. */
3186 		decoded = ptr - (byte*) data;
3187 
3188 		/* A word may exist in the document but we only consider a
3189 		match if it exists in a position that is greater than the
3190 		position of the previous term. */
3191 		if (doc_id == select->doc_id && min_pos > 0) {
3192 			fts_doc_freq_t*	doc_freq;
3193 
3194 			/* Add the doc id to the doc freq rb tree, if
3195 			the doc id doesn't exist it will be created. */
3196 			doc_freq = fts_query_add_doc_freq(
3197 				select->word_freq->doc_freqs, doc_id);
3198 
3199 			/* Avoid duplicating the frequency tally */
3200 			if (doc_freq->freq == 0) {
3201 				doc_freq->freq = freq;
3202 			}
3203 
3204 			select->found = TRUE;
3205 			select->min_pos = min_pos;
3206 		}
3207 	}
3208 
3209 	return(select->found);
3210 }
3211 #endif
3212 
3213 /*****************************************************************//**
3214 Read and filter nodes.
3215 @return DB_SUCCESS if all go well,
3216 or return DB_FTS_EXCEED_RESULT_CACHE_LIMIT */
3217 static
3218 dberr_t
3219 fts_query_filter_doc_ids(
3220 /*=====================*/
3221 	fts_query_t*		query,		/*!< in: query instance */
3222 	const fts_string_t*	word,		/*!< in: the current word */
3223 	fts_word_freq_t*	word_freq,	/*!< in/out: word frequency */
3224 	const fts_node_t*	node,		/*!< in: current FTS node */
3225 	void*			data,		/*!< in: doc id ilist */
3226 	ulint			len,		/*!< in: doc id ilist size */
3227 	ibool			calc_doc_count)	/*!< in: whether to remember doc count */
3228 {
3229 	const byte*	ptr = static_cast<byte*>(data);
3230 	doc_id_t	doc_id = 0;
3231 	ulint		decoded = 0;
3232 	ib_rbt_t*	doc_freqs = word_freq->doc_freqs;
3233 
3234 	/* Decode the ilist and add the doc ids to the query doc_id set. */
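	/* ilist layout as consumed here: a VLC-encoded doc id delta,
	followed by VLC-encoded position deltas for that doc id, then a
	single 0 terminator byte; this repeats until 'len' bytes have
	been decoded. */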
3235 	while (decoded < len) {
3236 		ulint		freq = 0;
3237 		fts_doc_freq_t*	doc_freq;
3238 		fts_match_t*	match = NULL;
3239 		doc_id_t	last_pos = 0;
3240 		doc_id_t	pos = fts_decode_vlc(&ptr);
3241 
3242 		/* Some sanity checks. */
3243 		if (doc_id == 0) {
3244 			ut_a(pos == node->first_doc_id);
3245 		}
3246 
3247 		/* Add the delta. */
3248 		doc_id += pos;
3249 
3250 		if (calc_doc_count) {
3251 			word_freq->doc_count++;
3252 		}
3253 
3254 		/* We simply collect the matching instances here. */
3255 		if (query->collect_positions) {
3256 			ib_alloc_t*	heap_alloc;
3257 
3258 			/* Create a new fts_match_t instance. */
3259 			match = static_cast<fts_match_t*>(
3260 				ib_vector_push(query->matched, NULL));
3261 
3262 			match->start = 0;
3263 			match->doc_id = doc_id;
3264 			heap_alloc = ib_vector_allocator(query->matched);
3265 
3266 			/* Allocate from the same heap as the
3267 			parent container. */
3268 			match->positions = ib_vector_create(
3269 				heap_alloc, sizeof(ulint), 64);
3270 
3271 			query->total_size += sizeof(fts_match_t)
3272 				+ sizeof(ib_vector_t)
3273 				+ sizeof(ulint) * 64;
3274 		}
3275 
3276 		/* Unpack the positions within the document. */
3277 		while (*ptr) {
3278 			last_pos += fts_decode_vlc(&ptr);
3279 
3280 			/* Collect the matching word positions, for phrase
3281 			matching later. */
3282 			if (query->collect_positions) {
3283 				ib_vector_push(match->positions, &last_pos);
3284 			}
3285 
3286 			++freq;
3287 		}
3288 
3289 		/* End of list marker. */
3290 		last_pos = (ulint) -1;
3291 
3292 		if (query->collect_positions) {
3293 			ut_a(match != NULL);
3294 			ib_vector_push(match->positions, &last_pos);
3295 		}
3296 
3297 		/* Add the doc id to the doc freq rb tree, if the doc id
3298 		doesn't exist it will be created. */
3299 		doc_freq = fts_query_add_doc_freq(query, doc_freqs, doc_id);
3300 
3301 		/* Avoid duplicating frequency tally. */
3302 		if (doc_freq->freq == 0) {
3303 			doc_freq->freq = freq;
3304 		}
3305 
3306 		/* Skip the end of word position marker. */
3307 		++ptr;
3308 
3309 		/* Bytes decoded so far */
3310 		decoded = ulint(ptr - (byte*) data);
3311 
3312 		/* We simply collect the matching documents and the
3313 		positions here and match later. */
3314 		if (!query->collect_positions) {
3315 			/* We ignore error here and will check it later */
3316 			fts_query_process_doc_id(query, doc_id, 0);
3317 
3318 			/* Add the word to the document's matched RB tree. */
3319 			fts_query_add_word_to_document(query, doc_id, word);
3320 		}
3321 	}
3322 
3323 	/* Some sanity checks. */
3324 	ut_a(doc_id == node->last_doc_id);
3325 
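	/* fts_result_cache_limit reflects the innodb_ft_result_cache_limit
	setting; exceeding it makes the query fail with
	DB_FTS_EXCEED_RESULT_CACHE_LIMIT. */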
3326 	if (query->total_size > fts_result_cache_limit) {
3327 		return(DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
3328 	} else {
3329 		return(DB_SUCCESS);
3330 	}
3331 }
3332 
3333 /*****************************************************************//**
3334 Read the FTS INDEX row.
3335 @return DB_SUCCESS if all go well. */
3336 static
3337 dberr_t
3338 fts_query_read_node(
3339 /*================*/
3340 	fts_query_t*		query,	/*!< in: query instance */
3341 	const fts_string_t*	word,	/*!< in: current word */
3342 	que_node_t*		exp)	/*!< in: query graph node */
3343 {
3344 	int			i;
3345 	int			ret;
3346 	fts_node_t		node;
3347 	ib_rbt_bound_t		parent;
3348 	fts_word_freq_t*	word_freq;
3349 	ibool			skip = FALSE;
3350 	fts_string_t		term;
3351 	byte			buf[FTS_MAX_WORD_LEN + 1];
3352 	dberr_t			error = DB_SUCCESS;
3353 
3354 	ut_a(query->cur_node->type == FTS_AST_TERM
3355 	     || query->cur_node->type == FTS_AST_TEXT
3356 	     || query->cur_node->type == FTS_AST_PARSER_PHRASE_LIST);
3357 
3358 	memset(&node, 0, sizeof(node));
3359 	term.f_str = buf;
3360 
3361 	/* Need to consider the wildcard search case: the word frequency
3362 	is keyed on the search string, not on the actual matched word, so
3363 	we assign the frequency to the search string instead. */
3364 	if (query->cur_node->type == FTS_AST_TERM
3365 	    && query->cur_node->term.wildcard) {
3366 
3367 		term.f_len = query->cur_node->term.ptr->len;
3368 		ut_ad(FTS_MAX_WORD_LEN >= term.f_len);
3369 		memcpy(term.f_str, query->cur_node->term.ptr->str, term.f_len);
3370 	} else {
3371 		term.f_len = word->f_len;
3372 		ut_ad(FTS_MAX_WORD_LEN >= word->f_len);
3373 		memcpy(term.f_str, word->f_str, word->f_len);
3374 	}
3375 
3376 	/* Lookup the word in our rb tree, it must exist. */
3377 	ret = rbt_search(query->word_freqs, &parent, &term);
3378 
3379 	ut_a(ret == 0);
3380 
3381 	word_freq = rbt_value(fts_word_freq_t, parent.last);
3382 
3383 	/* Start from 1 since the first column has been read by the caller.
3384 	Also, we rely on the order of the columns projected, to filter
3385 	out ilists that are out of range and we always want to read
3386 	the doc_count irrespective of the suitability of the row. */
3387 
3388 	for (i = 1; exp && !skip; exp = que_node_get_next(exp), ++i) {
3389 
3390 		dfield_t*	dfield = que_node_get_val(exp);
3391 		byte*		data = static_cast<byte*>(
3392 			dfield_get_data(dfield));
3393 		ulint		len = dfield_get_len(dfield);
3394 
3395 		ut_a(len != UNIV_SQL_NULL);
3396 
3397 		/* Note: The column numbers below must match the SELECT. */
3398 
3399 		switch (i) {
3400 		case 1: /* DOC_COUNT */
3401 			word_freq->doc_count += mach_read_from_4(data);
3402 			break;
3403 
3404 		case 2: /* FIRST_DOC_ID */
3405 			node.first_doc_id = fts_read_doc_id(data);
3406 
3407 			/* Skip nodes whose doc ids are out of range. */
3408 			if (query->oper == FTS_EXIST
3409 			    && query->upper_doc_id > 0
3410 			    && node.first_doc_id > query->upper_doc_id) {
3411 				skip = TRUE;
3412 			}
3413 			break;
3414 
3415 		case 3: /* LAST_DOC_ID */
3416 			node.last_doc_id = fts_read_doc_id(data);
3417 
3418 			/* Skip nodes whose doc ids are out of range. */
3419 			if (query->oper == FTS_EXIST
3420 			    && query->lower_doc_id > 0
3421 			    && node.last_doc_id < query->lower_doc_id) {
3422 				skip = TRUE;
3423 			}
3424 			break;
3425 
3426 		case 4: /* ILIST */
3427 
3428 			error = fts_query_filter_doc_ids(
3429 					query, &word_freq->word, word_freq,
3430 					&node, data, len, FALSE);
3431 
3432 			break;
3433 
3434 		default:
3435 			ut_error;
3436 		}
3437 	}
3438 
3439 	if (!skip) {
3440 		/* Make sure all columns were read. */
3441 
3442 		ut_a(i == 5);
3443 	}
3444 
3445 	return error;
3446 }
3447 
3448 /*****************************************************************//**
3449 Callback function to fetch the rows in an FTS INDEX record.
3450 @return TRUE to continue reading, FALSE on error */
3451 static
3452 ibool
3453 fts_query_index_fetch_nodes(
3454 /*========================*/
3455 	void*		row,		/*!< in: sel_node_t* */
3456 	void*		user_arg)	/*!< in: pointer to fts_fetch_t */
3457 {
3458 	fts_string_t	key;
3459 	sel_node_t*	sel_node = static_cast<sel_node_t*>(row);
3460 	fts_fetch_t*	fetch = static_cast<fts_fetch_t*>(user_arg);
3461 	fts_query_t*	query = static_cast<fts_query_t*>(fetch->read_arg);
3462 	que_node_t*	exp = sel_node->select_list;
3463 	dfield_t*	dfield = que_node_get_val(exp);
3464 	void*		data = dfield_get_data(dfield);
3465 	ulint		dfield_len = dfield_get_len(dfield);
3466 
3467 	key.f_str = static_cast<byte*>(data);
3468 	key.f_len = dfield_len;
3469 
3470 	ut_a(dfield_len <= FTS_MAX_WORD_LEN);
3471 
3472 	/* Note: we pass error out by 'query->error' */
3473 	query->error = fts_query_read_node(query, &key, que_node_get_next(exp));
3474 
3475 	if (query->error != DB_SUCCESS) {
3476 		ut_ad(query->error == DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
3477 		return(FALSE);
3478 	} else {
3479 		return(TRUE);
3480 	}
3481 }
3482 
3483 /*****************************************************************//**
3484 Calculate the inverse document frequency (IDF) for all the terms. */
3485 static
3486 void
3487 fts_query_calculate_idf(
3488 /*====================*/
3489 	fts_query_t*	query)	/*!< in: Query state */
3490 {
3491 	const ib_rbt_node_t*	node;
3492 	ib_uint64_t		total_docs = query->total_docs;
3493 
3494 	/* Iterate over all the terms in the query and calculate
3495 	the IDF value for each of them. */
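	/* Worked illustration (hypothetical numbers): with total_docs = 1000
	and a term that occurs in 10 documents, idf = log10(1000 / 10) = 2;
	a term occurring in every document gets log10(1.0001) instead of 0. */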
3496 	for (node = rbt_first(query->word_freqs);
3497 	     node;
3498 	     node = rbt_next(query->word_freqs, node)) {
3499 
3500 		fts_word_freq_t*	word_freq;
3501 
3502 		word_freq = rbt_value(fts_word_freq_t, node);
3503 
3504 		if (word_freq->doc_count > 0) {
3505 			if (total_docs == word_freq->doc_count) {
3506 				/* Query processing assumes ranking > 0
3507 				if we find a match. Since log10(1) = 0,
3508 				we cannot let the IDF become zero when a
3509 				word appears in all documents, so use an
3510 				arbitrarily small positive value instead. */
3511 				word_freq->idf = log10(1.0001);
3512 			} else {
3513 				word_freq->idf = log10(
3514 					static_cast<double>(total_docs)
3515 					/ static_cast<double>(
3516 						word_freq->doc_count));
3517 			}
3518 		}
3519 	}
3520 }
3521 
3522 /*****************************************************************//**
3523 Calculate the ranking of the document. */
3524 static
3525 void
3526 fts_query_calculate_ranking(
3527 /*========================*/
3528 	const fts_query_t*	query,		/*!< in: query state */
3529 	fts_ranking_t*		ranking)	/*!< in: Document to rank */
3530 {
3531 	ulint	pos = 0;
3532 	fts_string_t	word;
3533 
3534 	/* At this stage, ranking->rank should not exceed the 1.0
3535 	bound */
3536 	ut_ad(ranking->rank <= 1.0 && ranking->rank >= -1.0);
3537 	ut_ad(rbt_size(query->word_map) == query->word_vector->size());
3538 
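	/* Each matching word contributes freq * idf * idf to the rank,
	i.e. a TF-IDF style score in which the term frequency is weighted
	by the squared inverse document frequency. */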
3539 	while (fts_ranking_words_get_next(query, ranking, &pos, &word)) {
3540 		int			ret;
3541 		ib_rbt_bound_t		parent;
3542 		double			weight;
3543 		fts_doc_freq_t*		doc_freq;
3544 		fts_word_freq_t*	word_freq;
3545 
3546 		ret = rbt_search(query->word_freqs, &parent, &word);
3547 
3548 		/* It must exist. */
3549 		ut_a(ret == 0);
3550 
3551 		word_freq = rbt_value(fts_word_freq_t, parent.last);
3552 
3553 		ret = rbt_search(
3554 			word_freq->doc_freqs, &parent, &ranking->doc_id);
3555 
3556 		/* It must exist. */
3557 		ut_a(ret == 0);
3558 
3559 		doc_freq = rbt_value(fts_doc_freq_t, parent.last);
3560 
3561 		weight = (double) doc_freq->freq * word_freq->idf;
3562 
3563 		ranking->rank += (fts_rank_t) (weight * word_freq->idf);
3564 	}
3565 }
3566 
3567 /*****************************************************************//**
3568 Add ranking to the result set. */
3569 static
3570 void
3571 fts_query_add_ranking(
3572 /*==================*/
3573 	fts_query_t*		query,		/*!< in: query state */
3574 	ib_rbt_t*		ranking_tree,	/*!< in: ranking tree */
3575 	const fts_ranking_t*	new_ranking)	/*!< in: ranking of a document */
3576 {
3577 	ib_rbt_bound_t		parent;
3578 
3579 	/* Lookup the ranking in our rb tree and add if it doesn't exist. */
3580 	if (rbt_search(ranking_tree, &parent, new_ranking) == 0) {
3581 		fts_ranking_t*	ranking;
3582 
3583 		ranking = rbt_value(fts_ranking_t, parent.last);
3584 
3585 		ranking->rank += new_ranking->rank;
3586 
3587 		ut_a(ranking->words == NULL);
3588 	} else {
3589 		rbt_add_node(ranking_tree, &parent, new_ranking);
3590 
3591 		query->total_size += SIZEOF_RBT_NODE_ADD
3592 			+ sizeof(fts_ranking_t);
3593 	}
3594 }
3595 
3596 /*****************************************************************//**
3597 Retrieve the FTS Relevance Ranking result for doc with doc_id
3598 @return the relevance ranking value, 0 if no ranking value
3599 present. */
3600 float
3601 fts_retrieve_ranking(
3602 /*=================*/
3603 	fts_result_t*	result,	/*!< in: FTS result structure */
3604 	doc_id_t	doc_id)	/*!< in: doc_id of the item to retrieve */
3605 {
3606 	ib_rbt_bound_t		parent;
3607 	fts_ranking_t		new_ranking;
3608 
3609 	DBUG_ENTER("fts_retrieve_ranking");
3610 
3611 	if (!result || !result->rankings_by_id) {
3612 		DBUG_RETURN(0);
3613 	}
3614 
3615 	new_ranking.doc_id = doc_id;
3616 
3617 	/* Lookup the ranking in our rb tree */
3618 	if (rbt_search(result->rankings_by_id, &parent, &new_ranking) == 0) {
3619 		fts_ranking_t*  ranking;
3620 
3621 		ranking = rbt_value(fts_ranking_t, parent.last);
3622 
3623 		DBUG_RETURN(ranking->rank);
3624 	}
3625 
3626 	DBUG_RETURN(0);
3627 }
3628 
3629 /*****************************************************************//**
3630 Create the result and copy the data to it. */
3631 static
3632 fts_result_t*
3633 fts_query_prepare_result(
3634 /*=====================*/
3635 	fts_query_t*	query,	/*!< in: Query state */
3636 	fts_result_t*	result)	/*!< in: result; this can contain
3637 				data from a previous search on
3638 				another FTS index */
3639 {
3640 	const ib_rbt_node_t*	node;
3641 	bool			result_is_null = false;
3642 
3643 	DBUG_ENTER("fts_query_prepare_result");
3644 
3645 	if (result == NULL) {
3646 		result = static_cast<fts_result_t*>(
3647 			ut_zalloc_nokey(sizeof(*result)));
3648 
3649 		result->rankings_by_id = rbt_create(
3650 			sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
3651 
3652 		query->total_size += sizeof(fts_result_t) + SIZEOF_RBT_CREATE;
3653 		result_is_null = true;
3654 	}
3655 
3656 	if (query->flags == FTS_OPT_RANKING) {
3657 		fts_word_freq_t*	word_freq;
3658 		ulint		size = ib_vector_size(query->deleted->doc_ids);
3659 		doc_id_t*	updates =
3660 			(doc_id_t*) query->deleted->doc_ids->data;
3661 
3662 		node = rbt_first(query->word_freqs);
3663 		ut_ad(node);
3664 		word_freq = rbt_value(fts_word_freq_t, node);
3665 
3666 		for (node = rbt_first(word_freq->doc_freqs);
3667 		     node;
3668 		     node = rbt_next(word_freq->doc_freqs, node)) {
3669 			fts_doc_freq_t* doc_freq;
3670 			fts_ranking_t	ranking;
3671 
3672 			doc_freq = rbt_value(fts_doc_freq_t, node);
3673 
3674 			/* Don't put deleted docs into result */
3675 			if (fts_bsearch(updates, 0, static_cast<int>(size),
3676 					doc_freq->doc_id) >= 0) {
3677 				/* one less matching doc count */
3678 				--word_freq->doc_count;
3679 				continue;
3680 			}
3681 
3682 			ranking.doc_id = doc_freq->doc_id;
3683 			ranking.rank = static_cast<fts_rank_t>(doc_freq->freq);
3684 			ranking.words = NULL;
3685 
3686 			fts_query_add_ranking(query, result->rankings_by_id,
3687 					      &ranking);
3688 
3689 			if (query->total_size > fts_result_cache_limit) {
3690 				query->error = DB_FTS_EXCEED_RESULT_CACHE_LIMIT;
3691 				fts_query_free_result(result);
3692 				DBUG_RETURN(NULL);
3693 			}
3694 		}
3695 
3696 		/* Calculate IDF only after we exclude the deleted items */
3697 		fts_query_calculate_idf(query);
3698 
3699 		node = rbt_first(query->word_freqs);
3700 		word_freq = rbt_value(fts_word_freq_t, node);
3701 
3702 		/* Calculate the ranking for each doc */
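		/* This applies the same weighting as
		fts_query_calculate_ranking(): rank = freq * idf * idf. */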
3703 		for (node = rbt_first(result->rankings_by_id);
3704 		     node != NULL;
3705 		     node = rbt_next(result->rankings_by_id, node)) {
3706 
3707 			fts_ranking_t*  ranking;
3708 
3709 			ranking = rbt_value(fts_ranking_t, node);
3710 
3711 			ranking->rank = static_cast<fts_rank_t>(
3712 				ranking->rank * word_freq->idf * word_freq->idf);
3713 		}
3714 
3715 		DBUG_RETURN(result);
3716 	}
3717 
3718 	ut_a(rbt_size(query->doc_ids) > 0);
3719 
3720 	for (node = rbt_first(query->doc_ids);
3721 	     node;
3722 	     node = rbt_next(query->doc_ids, node)) {
3723 
3724 		fts_ranking_t*	ranking;
3725 
3726 		ranking = rbt_value(fts_ranking_t, node);
3727 		fts_query_calculate_ranking(query, ranking);
3728 
3729 		// FIXME: We may require this information to improve the
3730 		// ranking of doc ids which have more word matches from
3731 		// different FTS indexes.
3732 
3733 		/* We don't need these anymore; free the resources. */
3734 		ranking->words = NULL;
3735 
3736 		if (!result_is_null) {
3737 			fts_query_add_ranking(query, result->rankings_by_id, ranking);
3738 
3739 			if (query->total_size > fts_result_cache_limit) {
3740 				query->error = DB_FTS_EXCEED_RESULT_CACHE_LIMIT;
3741 				fts_query_free_result(result);
3742 				DBUG_RETURN(NULL);
3743 			}
3744 		}
3745 	}
3746 
3747 	if (result_is_null) {
3748 		/* Use doc_ids directly */
3749 		rbt_free(result->rankings_by_id);
3750 		result->rankings_by_id = query->doc_ids;
3751 		query->doc_ids = NULL;
3752 	}
3753 
3754 	DBUG_RETURN(result);
3755 }
3756 
3757 /*****************************************************************//**
3758 Get the result of the query. Calculate the similarity coefficient. */
3759 static
3760 fts_result_t*
3761 fts_query_get_result(
3762 /*=================*/
3763 	fts_query_t*		query,	/*!< in: query instance */
3764 	fts_result_t*		result)	/*!< in: result */
3765 {
3766 	DBUG_ENTER("fts_query_get_result");
3767 
3768 	if (rbt_size(query->doc_ids) > 0 || query->flags == FTS_OPT_RANKING) {
3769 		/* Copy the doc ids to the result. */
3770 		result = fts_query_prepare_result(query, result);
3771 	} else {
3772 		/* Create an empty result instance. */
3773 		result = static_cast<fts_result_t*>(
3774 			ut_zalloc_nokey(sizeof(*result)));
3775 	}
3776 
3777 	DBUG_RETURN(result);
3778 }
3779 
3780 /*****************************************************************//**
3781 FTS Query free resources and reset. */
3782 static
3783 void
3784 fts_query_free(
3785 /*===========*/
3786 	fts_query_t*	query)		/*!< in: query instance to free*/
3787 {
3788 
3789 	if (query->read_nodes_graph) {
3790 		fts_que_graph_free(query->read_nodes_graph);
3791 	}
3792 
3793 	if (query->root) {
3794 		fts_ast_free_node(query->root);
3795 	}
3796 
3797 	if (query->deleted) {
3798 		fts_doc_ids_free(query->deleted);
3799 	}
3800 
3801 	if (query->intersection) {
3802 		fts_query_free_doc_ids(query, query->intersection);
3803 	}
3804 
3805 	if (query->doc_ids) {
3806 		fts_query_free_doc_ids(query, query->doc_ids);
3807 	}
3808 
3809 	if (query->word_freqs) {
3810 		const ib_rbt_node_t*	node;
3811 
3812 		/* We need to free any instances of fts_doc_freq_t that we
3813 		may have allocated. */
3814 		for (node = rbt_first(query->word_freqs);
3815 		     node;
3816 		     node = rbt_next(query->word_freqs, node)) {
3817 
3818 			fts_word_freq_t*	word_freq;
3819 
3820 			word_freq = rbt_value(fts_word_freq_t, node);
3821 
3822 			/* Free the per-word document frequency tree. */
3823 			rbt_free(word_freq->doc_freqs);
3824 		}
3825 
3826 		rbt_free(query->word_freqs);
3827 	}
3828 
3829 	if (query->wildcard_words != NULL) {
3830 		rbt_free(query->wildcard_words);
3831 	}
3832 
3833 	ut_a(!query->intersection);
3834 
3835 	if (query->word_map) {
3836 		rbt_free(query->word_map);
3837 	}
3838 
3839 	if (query->word_vector != NULL) {
3840 		UT_DELETE(query->word_vector);
3841 	}
3842 
3843 	if (query->heap) {
3844 		mem_heap_free(query->heap);
3845 	}
3846 
3847 	memset(query, 0, sizeof(*query));
3848 }
3849 
3850 /*****************************************************************//**
3851 Parse the query using flex/bison or plugin parser.
3852 @return parse tree node. */
3853 static
3854 fts_ast_node_t*
3855 fts_query_parse(
3856 /*============*/
3857 	fts_query_t*	query,		/*!< in: query instance */
3858 	byte*		query_str,	/*!< in: query string */
3859 	ulint		query_len)	/*!< in: query string length */
3860 {
3861 	int		error;
3862 	fts_ast_state_t state;
3863 	bool		mode = query->boolean_mode;
3864 	DBUG_ENTER("fts_query_parse");
3865 
3866 	memset(&state, 0x0, sizeof(state));
3867 
3868 	state.charset = query->fts_index_table.charset;
3869 
3870 	DBUG_EXECUTE_IF("fts_instrument_query_disable_parser",
3871 		query->parser = NULL;);
3872 
3873 	if (query->parser) {
3874 		state.root = state.cur_node =
3875 			fts_ast_create_node_list(&state, NULL);
3876 		error = fts_parse_by_parser(mode, query_str, query_len,
3877 					    query->parser, &state);
3878 	} else {
3879 		/* Setup the scanner to use, this depends on the mode flag. */
3880 		state.lexer = fts_lexer_create(mode, query_str, query_len);
3881 		state.charset = query->fts_index_table.charset;
3882 		error = fts_parse(&state);
3883 		fts_lexer_free(state.lexer);
3884 		state.lexer = NULL;
3885 	}
3886 
3887 	/* Error during parsing ? */
3888 	if (error) {
3889 		/* Free the nodes that were allocated during parsing. */
3890 		fts_ast_state_free(&state);
3891 	} else {
3892 		query->root = state.root;
3893 
3894 		if (UNIV_UNLIKELY(fts_enable_diag_print) && query->root) {
3895 			fts_ast_node_print(query->root);
3896 		}
3897 	}
3898 
3899 	DBUG_RETURN(state.root);
3900 }
3901 
3902 /*******************************************************************//**
3903 FTS Query optimization
3904 Set FTS_OPT_RANKING if it is a simple term query */
3905 static
3906 void
3907 fts_query_can_optimize(
3908 /*===================*/
3909 	fts_query_t*	query,		/*!< in/out: query instance */
3910 	uint		flags)		/*!< In: FTS search mode */
3911 {
3912 	fts_ast_node_t*	node = query->root;
3913 
3914 	if (flags & FTS_EXPAND) {
3915 		return;
3916 	}
3917 
3918 	/* Check whether the query is a single term without any operator. */
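	/* For example, the query 'apple' parses to a single FTS_AST_TERM
	node under the list root and is optimized here, whereas
	'apple banana' (two terms) or any boolean expression is not. */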
3919 	ut_ad(node->type == FTS_AST_LIST);
3920 	node = node->list.head;
3921 	if (node != NULL && node->type == FTS_AST_TERM && node->next == NULL) {
3922 		query->flags = FTS_OPT_RANKING;
3923 	}
3924 }
3925 
3926 /** FTS Query entry point.
3927 @param[in,out]	trx		transaction
3928 @param[in]	index		fts index to search
3929 @param[in]	flags		FTS search mode
3930 @param[in]	query_str	FTS query
3931 @param[in]	query_len	FTS query string len in bytes
3932 @param[in,out]	result		result doc ids
3933 @return DB_SUCCESS if successful otherwise error code */
3934 dberr_t
3935 fts_query(
3936 	trx_t*		trx,
3937 	dict_index_t*	index,
3938 	uint		flags,
3939 	const byte*	query_str,
3940 	ulint		query_len,
3941 	fts_result_t**	result)
3942 {
3943 	fts_query_t	query;
3944 	dberr_t		error = DB_SUCCESS;
3945 	byte*		lc_query_str;
3946 	ulint		lc_query_str_len;
3947 	ulint		result_len;
3948 	bool		boolean_mode;
3949 	trx_t*		query_trx; /* FIXME: use provided trx */
3950 	CHARSET_INFO*	charset;
3951 	ulint		start_time_ms;
3952 	bool		will_be_ignored = false;
3953 
3954 	boolean_mode = flags & FTS_BOOL;
3955 
3956 	*result = NULL;
3957 	memset(&query, 0x0, sizeof(query));
3958 	query_trx = trx_create();
3959 	query_trx->op_info = "FTS query";
3960 
3961 	start_time_ms = ut_time_ms();
3962 
3963 	query.trx = query_trx;
3964 	query.index = index;
3965 	query.boolean_mode = boolean_mode;
3966 	query.deleted = fts_doc_ids_create();
3967 	query.cur_node = NULL;
3968 
3969 	query.fts_common_table.type = FTS_COMMON_TABLE;
3970 	query.fts_common_table.table_id = index->table->id;
3971 	query.fts_common_table.table = index->table;
3972 
3973 	charset = fts_index_get_charset(index);
3974 
3975 	query.fts_index_table.type = FTS_INDEX_TABLE;
3976 	query.fts_index_table.index_id = index->id;
3977 	query.fts_index_table.table_id = index->table->id;
3978 	query.fts_index_table.charset = charset;
3979 	query.fts_index_table.table = index->table;
3980 
3981 	query.word_map = rbt_create_arg_cmp(
3982 		sizeof(fts_string_t), innobase_fts_text_cmp, (void*)charset);
3983 	query.word_vector = UT_NEW_NOKEY(word_vector_t());
3984 	query.error = DB_SUCCESS;
3985 
3986 	/* Setup the RB tree that will be used to collect per term
3987 	statistics. */
3988 	query.word_freqs = rbt_create_arg_cmp(
3989 		sizeof(fts_word_freq_t), innobase_fts_text_cmp,
3990 		(void*) charset);
3991 
3992 	if (flags & FTS_EXPAND) {
3993 		query.wildcard_words = rbt_create_arg_cmp(
3994 			sizeof(fts_string_t), innobase_fts_text_cmp, (void *)charset);
3995 	}
3996 
3997 	query.total_size += SIZEOF_RBT_CREATE;
3998 
3999 	query.total_docs = dict_table_get_n_rows(index->table);
4000 
4001 	query.fts_common_table.suffix = "DELETED";
4002 
4003 	/* Read the deleted doc_ids; we need these for filtering. */
4004 	error = fts_table_fetch_doc_ids(
4005 		NULL, &query.fts_common_table, query.deleted);
4006 
4007 	if (error != DB_SUCCESS) {
4008 		goto func_exit;
4009 	}
4010 
4011 	query.fts_common_table.suffix = "DELETED_CACHE";
4012 
4013 	error = fts_table_fetch_doc_ids(
4014 		NULL, &query.fts_common_table, query.deleted);
4015 
4016 	if (error != DB_SUCCESS) {
4017 		goto func_exit;
4018 	}
4019 
4020 	/* Get the deleted doc ids that are in the cache. */
4021 	fts_cache_append_deleted_doc_ids(
4022 		index->table->fts->cache, query.deleted->doc_ids);
4023 	DEBUG_SYNC_C("fts_deleted_doc_ids_append");
4024 
4025 	/* Sort the vector so that we can do a binary search over the ids. */
4026 	ib_vector_sort(query.deleted->doc_ids, fts_doc_id_cmp);
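	/* At this point query.deleted->doc_ids holds the doc ids from the
	DELETED and DELETED_CACHE auxiliary tables plus those still in the
	in-memory cache; sorting the vector lets fts_bsearch() be used
	when filtering the results later. */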
4027 
4028 	/* Convert the query string to lower case before parsing. We own
4029 	the ut_malloc'ed result and must free it before returning. */
4030 
4031 	lc_query_str_len = query_len * charset->casedn_multiply + 1;
4032 	lc_query_str = static_cast<byte*>(ut_malloc_nokey(lc_query_str_len));
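	/* casedn_multiply accounts for charsets where converting to lower
	case can grow the byte length of the string; the extra byte is for
	the terminating NUL. */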
4033 
4034 	/* For binary collations, a case sensitive search is
4035 	performed. Hence don't convert to lower case. */
4036 	if (my_binary_compare(charset)) {
4037 		memcpy(lc_query_str, query_str, query_len);
4038 		lc_query_str[query_len] = 0;
4039 		result_len = query_len;
4040 	} else {
4041 		result_len = innobase_fts_casedn_str(
4042 			charset, (char*) query_str, query_len,
4043 			(char*) lc_query_str, lc_query_str_len);
4044 	}
4045 
4046 	ut_ad(result_len < lc_query_str_len);
4047 
4048 	lc_query_str[result_len] = 0;
4049 
4050 	query.heap = mem_heap_create(128);
4051 
4052 	/* Create the rb tree for the doc id (current) set. */
4053 	query.doc_ids = rbt_create(
4054 		sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
4055 	query.parser = index->parser;
4056 
4057 	query.total_size += SIZEOF_RBT_CREATE;
4058 
4059 	/* Parse the input query string. */
4060 	if (fts_query_parse(&query, lc_query_str, result_len)) {
4061 		fts_ast_node_t*	ast = query.root;
4062 		ast->trx = trx;
4063 
4064 		/* Optimize query to check if it's a single term */
4065 		fts_query_can_optimize(&query, flags);
4066 
4067 		DBUG_EXECUTE_IF("fts_instrument_result_cache_limit",
4068 			        fts_result_cache_limit = 2048;
4069 		);
4070 
4071 		/* Traverse the Abstract Syntax Tree (AST) and execute
4072 		the query. */
4073 		query.error = fts_ast_visit(
4074 			FTS_NONE, ast, fts_query_visitor,
4075 			&query, &will_be_ignored);
4076 		if (query.error == DB_INTERRUPTED) {
4077 			error = DB_INTERRUPTED;
4078 			ut_free(lc_query_str);
4079 			goto func_exit;
4080 		}
4081 
4082 		/* If query expansion is requested, extend the search
4083 		with first search pass result */
4084 		if (query.error == DB_SUCCESS && (flags & FTS_EXPAND)) {
4085 			query.error = fts_expand_query(index, &query);
4086 		}
4087 
4088 		/* Calculate the inverse document frequency of the terms. */
4089 		if (query.error == DB_SUCCESS
4090 		    && query.flags != FTS_OPT_RANKING) {
4091 			fts_query_calculate_idf(&query);
4092 		}
4093 
4094 		/* Copy the result from the query state, so that we can
4095 		return it to the caller. */
4096 		if (query.error == DB_SUCCESS) {
4097 			*result = fts_query_get_result(&query, *result);
4098 		}
4099 
4100 		error = query.error;
4101 	} else {
4102 		/* still return an empty result set */
4103 		*result = static_cast<fts_result_t*>(
4104 			ut_zalloc_nokey(sizeof(**result)));
4105 	}
4106 
4107 	if (trx_is_interrupted(trx)) {
4108 		error = DB_INTERRUPTED;
4109 		ut_free(lc_query_str);
4110 		if (*result) {
4111 			fts_query_free_result(*result);
4112 		}
4113 		goto func_exit;
4114 	}
4115 
4116 	ut_free(lc_query_str);
4117 
4118 	if (UNIV_UNLIKELY(fts_enable_diag_print) && (*result)) {
4119 		ulint	diff_time = ut_time_ms() - start_time_ms;
4120 
4121 		ib::info() << "FTS Search Processing time: "
4122 			<< diff_time / 1000 << " secs: " << diff_time % 1000
4123 			<< " millisec: row(s) "
4124 			<< ((*result)->rankings_by_id
4125 			    ? lint(rbt_size((*result)->rankings_by_id))
4126 			    : -1);
4127 
4128 		/* Log memory consumption & result size */
4129 		ib::info() << "Full Search Memory: " << query.total_size
4130 			<< " (bytes),  Row: "
4131 			<< ((*result)->rankings_by_id
4132 			    ? rbt_size((*result)->rankings_by_id)
4133 			    : 0)
4134 			<< ".";
4135 	}
4136 
4137 func_exit:
4138 	fts_query_free(&query);
4139 
4140 	query_trx->free();
4141 
4142 	return(error);
4143 }
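
/* Illustrative caller sketch (hypothetical, for documentation only):

	fts_result_t*	result = NULL;
	dberr_t		err = fts_query(trx, fts_index, FTS_BOOL,
					(const byte*) "+apple -banana", 14,
					&result);

	if (err == DB_SUCCESS && result->rankings_by_id != NULL) {
		fts_query_sort_result_on_rank(result);
	}

	fts_query_free_result(result);
*/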
4144 
4145 /*****************************************************************//**
4146 FTS Query free result, returned by fts_query(). */
4147 void
4148 fts_query_free_result(
4149 /*==================*/
4150 	fts_result_t*	result)		/*!< in: result instance to free.*/
4151 {
4152 	if (result) {
4153 		if (result->rankings_by_id != NULL) {
4154 			rbt_free(result->rankings_by_id);
4155 			result->rankings_by_id = NULL;
4156 		}
4157 		if (result->rankings_by_rank != NULL) {
4158 			rbt_free(result->rankings_by_rank);
4159 			result->rankings_by_rank = NULL;
4160 		}
4161 
4162 		ut_free(result);
4163 		result = NULL;
4164 	}
4165 }
4166 
4167 /*****************************************************************//**
4168 FTS Query sort result, returned by fts_query() on fts_ranking_t::rank. */
4169 void
4170 fts_query_sort_result_on_rank(
4171 /*==========================*/
4172 	fts_result_t*	result)		/*!< out: result instance to sort.*/
4173 {
4174 	const ib_rbt_node_t*	node;
4175 	ib_rbt_t*		ranked;
4176 
4177 	ut_a(result->rankings_by_id != NULL);
4178 	if (result->rankings_by_rank) {
4179 		rbt_free(result->rankings_by_rank);
4180 	}
4181 
4182 	ranked = rbt_create(sizeof(fts_ranking_t), fts_query_compare_rank);
4183 
4184 	/* Copy the rankings into a new tree ordered by rank
4185 	rather than by doc id. */
4186 	for (node = rbt_first(result->rankings_by_id);
4187 	     node;
4188 	     node = rbt_next(result->rankings_by_id, node)) {
4189 
4190 		fts_ranking_t*	ranking;
4191 
4192 		ranking = rbt_value(fts_ranking_t, node);
4193 
4194 		ut_a(ranking->words == NULL);
4195 
4196 		rbt_insert(ranked, ranking, ranking);
4197 	}
4198 
4199 	/* Reset the current node too. */
4200 	result->current = NULL;
4201 	result->rankings_by_rank = ranked;
4202 }
4203 
4204 /*******************************************************************//**
4205 A debug function to print result doc_id set. */
4206 static
4207 void
4208 fts_print_doc_id(
4209 /*=============*/
4210 	fts_query_t*	query)	/*!< in : tree that stores doc_ids.*/
4211 {
4212 	const ib_rbt_node_t*	node;
4213 
4214 	/* Iterate each member of the doc_id set */
4215 	for (node = rbt_first(query->doc_ids);
4216 	     node;
4217 	     node = rbt_next(query->doc_ids, node)) {
4218 		fts_ranking_t*	ranking;
4219 		ranking = rbt_value(fts_ranking_t, node);
4220 
4221 		ib::info() << "doc_ids info, doc_id: " << ranking->doc_id;
4222 
4223 		ulint		pos = 0;
4224 		fts_string_t	word;
4225 
4226 		while (fts_ranking_words_get_next(query, ranking, &pos, &word)) {
4227 			ib::info() << "doc_ids info, value: " << word.f_str;
4228 		}
4229 	}
4230 }
4231 
4232 /*************************************************************//**
4233 This function implements a simple "blind" query expansion search:
4234 words in documents found in the first search pass are used as
4235 additional search arguments to search the documents again, thus
4236 "expanding" the search result set.
4237 @return DB_SUCCESS if success, otherwise the error code */
4238 static MY_ATTRIBUTE((nonnull, warn_unused_result))
4239 dberr_t
4240 fts_expand_query(
4241 /*=============*/
4242 	dict_index_t*	index,		/*!< in: FTS index to search */
4243 	fts_query_t*	query)		/*!< in: FTS query instance */
4244 {
4245 	const ib_rbt_node_t*	node;
4246 	const ib_rbt_node_t*	token_node;
4247 	fts_doc_t		result_doc;
4248 	dberr_t			error = DB_SUCCESS;
4249 	const fts_index_cache_t*index_cache;
4250 
4251 	/* If no documents were found in the first search pass, return. */
4252 	if (!rbt_size(query->doc_ids)) {
4253 		return(error);
4254 	}
4255 
4256 	/* Init "result_doc", to hold words from the first search pass */
4257 	fts_doc_init(&result_doc);
4258 
4259 	rw_lock_x_lock(&index->table->fts->cache->lock);
4260 	index_cache = fts_find_index_cache(index->table->fts->cache, index);
4261 	rw_lock_x_unlock(&index->table->fts->cache->lock);
4262 
4263 	ut_a(index_cache);
4264 
4265 	result_doc.tokens = rbt_create_arg_cmp(
4266 		sizeof(fts_token_t), innobase_fts_text_cmp,
4267 		(void*) index_cache->charset);
4268 
4269 	result_doc.charset = index_cache->charset;
4270 	result_doc.parser = index_cache->index->parser;
4271 
4272 	query->total_size += SIZEOF_RBT_CREATE;
4273 
4274 	if (UNIV_UNLIKELY(fts_enable_diag_print)) {
4275 		fts_print_doc_id(query);
4276 	}
4277 
4278 	for (node = rbt_first(query->doc_ids);
4279 	     node;
4280 	     node = rbt_next(query->doc_ids, node)) {
4281 
4282 		fts_ranking_t*	ranking;
4283 		ulint		prev_token_size;
4284 		ulint		estimate_size;
4285 
4286 		prev_token_size = rbt_size(result_doc.tokens);
4287 
4288 		ranking = rbt_value(fts_ranking_t, node);
4289 
4290 		/* Fetch the document with this doc_id from the
4291 		result of the first search pass. Since we do not
4292 		store a document-to-word mapping, we need to
4293 		fetch the original document and parse it.
4294 		This could be optimized in the future if we
4295 		supported some form of document-to-word mapping. */
4296 		fts_doc_fetch_by_doc_id(NULL, ranking->doc_id, index,
4297 					FTS_FETCH_DOC_BY_ID_EQUAL,
4298 					fts_query_expansion_fetch_doc,
4299 					&result_doc);
4300 
4301 		/* Estimate memory used, see fts_process_token and fts_token_t.
4302 		   We ignore token size here. */
4303 		estimate_size = (rbt_size(result_doc.tokens) - prev_token_size)
4304 			* (SIZEOF_RBT_NODE_ADD + sizeof(fts_token_t)
4305 			+ sizeof(ib_vector_t) + sizeof(ulint) * 32);
4306 		query->total_size += estimate_size;
4307 
4308 		if (query->total_size > fts_result_cache_limit) {
4309 			error = DB_FTS_EXCEED_RESULT_CACHE_LIMIT;
4310 			goto	func_exit;
4311 		}
4312 	}
4313 
4314 	/* Remove words that have already been searched in the first pass */
4315 	for (ulint i = 0; i < query->word_vector->size(); i++) {
4316 		fts_string_t	word = query->word_vector->at(i);
4317 		ib_rbt_bound_t	parent;
4318 
4319 		if (query->wildcard_words
4320 		    && rbt_search(query->wildcard_words, &parent, &word) == 0) {
4321 			/* If it's a wildcard word, remove words having
4322 			it as prefix. */
4323 			while (rbt_search_cmp(result_doc.tokens,
4324 					      &parent, &word, NULL,
4325 					      innobase_fts_text_cmp_prefix)
4326 			       == 0) {
4327 				ut_free(rbt_remove_node(result_doc.tokens,
4328 							parent.last));
4329 			}
4330 		} else {
4331 			/* We don't check the return value, because the word
4332 			may already have been deleted by a previous wildcard
4333 			word that is its prefix, e.g. 'g*' and 'good'. */
4334 			rbt_delete(result_doc.tokens, &word);
4335 		}
4336 	}
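	/* At this point result_doc.tokens holds only the newly discovered
	expansion terms; for example, if the original query was 'database'
	and the matching documents also contain 'index' and 'btree', those
	two words are added to the second search pass below. */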
4337 
4338 	/* Search the table the second time with expanded search list */
4339 	for (token_node = rbt_first(result_doc.tokens);
4340 	     token_node;
4341 	     token_node = rbt_next(result_doc.tokens, token_node)) {
4342 		fts_token_t*	mytoken;
4343 		mytoken = rbt_value(fts_token_t, token_node);
4344 
4345 		/* A '%' at the end is treated as a prefix search;
4346 		it can cause an assertion failure, so we skip it. */
4347 		if (mytoken->text.f_str[mytoken->text.f_len - 1] == '%') {
4348 			continue;
4349 		}
4350 
4351 		ut_ad(mytoken->text.f_str[mytoken->text.f_len] == 0);
4352 		fts_query_add_word_freq(query, &mytoken->text);
4353 		error = fts_query_union(query, &mytoken->text);
4354 
4355 		if (error != DB_SUCCESS) {
4356 			break;
4357 		}
4358 	}
4359 
4360 func_exit:
4361 	fts_doc_free(&result_doc);
4362 
4363 	return(error);
4364 }
4365 /*************************************************************//**
4366 This function finds documents that contain all words in a
4367 phrase or proximity search. And if proximity search, verify
4368 the words are close enough to each other, as in specified distance.
4369 This function is called for phrase and proximity search.
4370 @return TRUE if documents are found, FALSE if otherwise */
4371 static
4372 ibool
4373 fts_phrase_or_proximity_search(
4374 /*===========================*/
4375 	fts_query_t*	query,		/*!< in/out:  query instance.
4376 					query->doc_ids might be instantiated
4377 					with qualified doc IDs */
4378 	ib_vector_t*	tokens)		/*!< in: Tokens contain words */
4379 {
4380 	ulint		n_matched;
4381 	ulint		i;
4382 	ibool		matched = FALSE;
4383 	ulint		num_token = ib_vector_size(tokens);
4384 	fts_match_t*	match[MAX_PROXIMITY_ITEM];
4385 	ibool		end_list = FALSE;
4386 
4387 	/* Number of matched documents for the first token */
4388 	n_matched = ib_vector_size(query->match_array[0]);
4389 
4390 	/* We have a match list for each word; walk through
4391 	the lists and find the common documents that contain
4392 	all the matching words. */
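	/* For example, for the proximity query '"apple banana" @ 4',
	query->match_array[0] holds the per-document matches for 'apple'
	and query->match_array[1] those for 'banana'; the loop below
	intersects these lists on doc_id. */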
4393 	for (i = 0; i < n_matched; i++) {
4394 		ulint		j;
4395 		ulint		k = 0;
4396 		fts_proximity_t	qualified_pos;
4397 
4398 		match[0] = static_cast<fts_match_t*>(
4399 			ib_vector_get(query->match_array[0], i));
4400 
4401 		/* For the match lists of the remaining tokens
4402 		(words), check whether there is a document with
4403 		the same doc id. */
4404 		for (j = 1; j < num_token; j++) {
4405 			match[j] = static_cast<fts_match_t*>(
4406 				ib_vector_get(query->match_array[j], k));
4407 
4408 			while (match[j]->doc_id < match[0]->doc_id
4409 			       && k < ib_vector_size(query->match_array[j])) {
4410 				 match[j] = static_cast<fts_match_t*>(
4411 					ib_vector_get(
4412 						query->match_array[j], k));
4413 				k++;
4414 			}
4415 
4416 			if (match[j]->doc_id > match[0]->doc_id) {
4417 				/* no match */
4418 				if (query->flags & FTS_PHRASE) {
4419 					match[0]->doc_id = 0;
4420 				}
4421 				break;
4422 			}
4423 
4424 			if (k == ib_vector_size(query->match_array[j])) {
4425 				end_list = TRUE;
4426 
4427 				if (query->flags & FTS_PHRASE) {
4428 					ulint	s;
4429 					/* We have reached the end of
4430 					match_array[j]; invalidate all entries
4431 					in match_array[0] beyond index i. */
4432 					fts_match_t*    match_temp;
4433 					for (s = i + 1; s < n_matched; s++) {
4434 						match_temp = static_cast<
4435 						fts_match_t*>(ib_vector_get(
4436 						query->match_array[0], s));
4437 						match_temp->doc_id = 0;
4438 					}
4439 
4440 					if (match[j]->doc_id !=
4441 						match[0]->doc_id) {
4442 						/* no match */
4443 						match[0]->doc_id = 0;
4444 					}
4445 				}
4446 
4447 				if (match[j]->doc_id != match[0]->doc_id) {
4448 					goto func_exit;
4449 				}
4450 			}
4451 
4452 			/* FIXME: A better solution would be a counter array
4453 			that remembers each run's last position, so that we
4454 			do not have to reset it here every time. */
4455 			k = 0;
4456 		}
4457 
4458 		if (j != num_token) {
4459 			continue;
4460 		}
4461 
4462 		/* For this matching doc, we need to further
4463 		verify whether the words in the doc are close
4464 		to each other, and within the distance specified
4465 		in the proximity search */
4466 		if (query->flags & FTS_PHRASE) {
4467 			matched = TRUE;
4468 		} else if (fts_proximity_get_positions(
4469 			match, num_token, ULINT_MAX, &qualified_pos)) {
4470 
4471 			/* Fetch the original documents and count the
4472 			words between the matching words to verify that
4473 			they are within the specified distance. */
4474 			if (fts_query_is_in_proximity_range(
4475 				query, match, &qualified_pos)) {
4476 				/* If so, mark we find a matching doc */
4477 				query->error = fts_query_process_doc_id(
4478 					query, match[0]->doc_id, 0);
4479 				if (query->error != DB_SUCCESS) {
4480 					matched = FALSE;
4481 					goto func_exit;
4482 				}
4483 
4484 				matched = TRUE;
4485 				for (ulint z = 0; z < num_token; z++) {
4486 					fts_string_t*	token;
4487 					token = static_cast<fts_string_t*>(
4488 						ib_vector_get(tokens, z));
4489 					fts_query_add_word_to_document(
4490 						query, match[0]->doc_id, token);
4491 				}
4492 			}
4493 		}
4494 
4495 		if (end_list) {
4496 			break;
4497 		}
4498 	}
4499 
4500 func_exit:
4501 	return(matched);
4502 }
4503 
4504 /*************************************************************//**
4505 This function checks whether words in result documents are close to
4506 each other (within proximity range as specified by "distance").
4507 If "distance" is ULINT_MAX, then it will find all combinations of
4508 positions of matching words and store min and max positions
4509 in the "qualified_pos" for later verification.
4510 @return true if words are close to each other, false if otherwise */
4511 static
4512 bool
4513 fts_proximity_get_positions(
4514 /*========================*/
4515 	fts_match_t**		match,		/*!< in: query instance */
4516 	ulint			num_match,	/*!< in: number of matching
4517 						items */
4518 	ulint			distance,	/*!< in: distance value
4519 						for proximity search */
4520 	fts_proximity_t*	qualified_pos)	/*!< out: the position info
4521 						records ranges containing
4522 						all matching words. */
4523 {
4524 	ulint	i;
4525 	ulint	idx[MAX_PROXIMITY_ITEM];
4526 	ulint	num_pos[MAX_PROXIMITY_ITEM];
4527 	ulint	min_idx;
4528 
4529 	qualified_pos->n_pos = 0;
4530 
4531 	ut_a(num_match <= MAX_PROXIMITY_ITEM);
4532 
4533 	/* Each word can appear multiple times in a doc, so
4534 	we need to walk through each word's position list and find
4535 	the closest distance between the different words to see
4536 	whether they are within the proximity distance. */
4537 
4538 	/* Assuming each word's position list is sorted, we
4539 	simply walk through all the words' lists, similar
4540 	to the merge phase of a merge sort. */
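	/* Worked example: with positions {3, 50} for word A, {7, 20} for
	word B and distance = 10, the walk examines (3, 7) with span 4
	(qualifies, advance A), then (50, 7) with span 43 (advance B),
	then (50, 20) with span 30 (advance B) and stops. */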
4541 	for (i = 0; i < num_match; i++) {
4542 		/* idx is the current position we are checking
4543 		for a particular word */
4544 		idx[i] = 0;
4545 
4546 		/* Number of positions for this word */
4547 		num_pos[i] = ib_vector_size(match[i]->positions);
4548 	}
4549 
4550 	/* Start with the first word */
4551 	min_idx = 0;
4552 
4553 	while (idx[min_idx] < num_pos[min_idx]) {
4554 		ulint	position[MAX_PROXIMITY_ITEM];
4555 		ulint	min_pos = ULINT_MAX;
4556 		ulint	max_pos = 0;
4557 
4558 		/* Check positions in each word position list, and
4559 		record the max/min position */
4560 		for (i = 0; i < num_match; i++) {
4561 			position[i] = *(ulint*) ib_vector_get_const(
4562 				match[i]->positions, idx[i]);
4563 
4564 			if (position[i] == ULINT_UNDEFINED) {
4565 				break;
4566 			}
4567 
4568 			if (position[i] < min_pos) {
4569 				min_pos = position[i];
4570 				min_idx = i;
4571 			}
4572 
4573 			if (position[i] > max_pos) {
4574 				max_pos = position[i];
4575 			}
4576 		}
4577 
4578 		/* If the max and min positions are within range,
4579 		we have found a good match. */
4580 		if (max_pos - min_pos <= distance
4581 		    && (i >= num_match || position[i] != ULINT_UNDEFINED)) {
4582 			/* The charset may use a variable-length
4583 			character encoding, so record min_pos and
4584 			max_pos; the actual number of characters
4585 			will be verified later. */
4586 			qualified_pos->min_pos.push_back(min_pos);
4587 			qualified_pos->max_pos.push_back(max_pos);
4588 			qualified_pos->n_pos++;
4589 		}
4590 
4591 		/* Otherwise, move to the next position in the
4592 		list for the word with the smallest position. */
4593 		idx[min_idx]++;
4594 	}
4595 
4596 	return(qualified_pos->n_pos != 0);
4597 }
4598