1 /*****************************************************************************
2 
3 Copyright (c) 2007, 2020, Oracle and/or its affiliates. All Rights Reserved.
4 
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License, version 2.0,
7 as published by the Free Software Foundation.
8 
9 This program is also distributed with certain software (including
10 but not limited to OpenSSL) that is licensed under separate terms,
11 as designated in a particular file or component or in included license
12 documentation.  The authors of MySQL hereby grant you an additional
13 permission to link the program and your derivative works with the
14 separately licensed software that they have included with MySQL.
15 
16 This program is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19 GNU General Public License, version 2.0, for more details.
20 
21 You should have received a copy of the GNU General Public License along with
22 this program; if not, write to the Free Software Foundation, Inc.,
23 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
24 
25 *****************************************************************************/
26 
27 /**************************************************//**
28 @file fts/fts0que.cc
29 Full Text Search functionality.
30 
31 Created 2007/03/27 Sunny Bains
32 Completed 2011/7/10 Sunny and Jimmy Yang
33 *******************************************************/
34 
35 #include "ha_prototypes.h"
36 
37 #include "dict0dict.h"
38 #include "ut0rbt.h"
39 #include "row0sel.h"
40 #include "fts0fts.h"
41 #include "fts0priv.h"
42 #include "fts0ast.h"
43 #include "fts0pars.h"
44 #include "fts0types.h"
45 #include "fts0plugin.h"
46 #include "ut0new.h"
47 
48 #ifdef UNIV_NONINL
49 #include "fts0types.ic"
50 #include "fts0vlc.ic"
51 #endif
52 
53 #include <iomanip>
54 #include <vector>
55 
56 #define FTS_ELEM(t, n, i, j) (t[(i) * n + (j)])
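
/* For example, in a row-major table with n == 4 columns, FTS_ELEM(t, 4, 2, 3)
expands to t[(2) * 4 + (3)], i.e. t[11]. */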
57 
58 #define RANK_DOWNGRADE		(-1.0F)
59 #define RANK_UPGRADE		(1.0F)
60 
61 /* Maximum number of words supported in a phrase or proximity search. */
62 #define MAX_PROXIMITY_ITEM	128
63 
64 /* Memory used by rbt itself for create and node add */
65 #define SIZEOF_RBT_CREATE	sizeof(ib_rbt_t) + sizeof(ib_rbt_node_t) * 2
66 #define SIZEOF_RBT_NODE_ADD	sizeof(ib_rbt_node_t)
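
/* These sizes are added to fts_query_t::total_size whenever an rb tree or a
tree node is allocated, so that a query can be aborted with
DB_FTS_EXCEED_RESULT_CACHE_LIMIT once total_size exceeds
fts_result_cache_limit (see fts_query_process_doc_id() below). */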
67 
68 /* Initial byte length for 'words' in fts_ranking_t */
69 #define RANKING_WORDS_INIT_LEN	4
70 
71 // FIXME: Need to have a generic iterator that traverses the ilist.
72 
73 typedef std::vector<fts_string_t, ut_allocator<fts_string_t> >	word_vector_t;
74 
75 struct fts_word_freq_t;
76 
77 /** State of an FTS query. */
78 struct fts_query_t {
79 	mem_heap_t*	heap;		/*!< Heap to use for allocations */
80 
81 	trx_t*		trx;		/*!< The query transaction */
82 
83 	dict_index_t*	index;		/*!< The FTS index to search */
84 					/*!< FTS auxiliary common table def */
85 
86 	fts_table_t	fts_common_table;
87 
88 	fts_table_t	fts_index_table;/*!< FTS auxiliary index table def */
89 
90 	ulint		total_size;	/*!< total memory size used by query */
91 
92 	fts_doc_ids_t*	deleted;	/*!< Deleted doc ids that need to be
93 					filtered from the output */
94 
95 	fts_ast_node_t*	root;		/*!< Abstract syntax tree */
96 
97 	fts_ast_node_t* cur_node;	/*!< Current tree node */
98 
99 	ib_rbt_t*	word_map;	/*!< Matched word map for
100 					searching by word */
101 
102 	word_vector_t*	word_vector;	/*!< Matched word vector for
103 					searching by index */
104 
105 	ib_rbt_t*       doc_ids;	/*!< The current set of matching
106 					doc ids, elements are of
107 					type fts_ranking_t */
108 
109 	ib_rbt_t*	intersection;	/*!< The doc ids that were found in
110 					doc_ids, this tree will become
111 					the new doc_ids, elements are of type
112 					fts_ranking_t */
113 
114 					/*!< Prepared statement to read the
115 					nodes from the FTS INDEX */
116 	que_t*		read_nodes_graph;
117 
118 	fts_ast_oper_t	oper;		/*!< Current boolean mode operator */
119 
120 					/*!< TRUE if we want to collect the
121 					word positions within the document */
122 	ibool		collect_positions;
123 
124 	ulint		flags;		/*!< Specify the full text search type,
125 					such as  boolean search, phrase
126 					search, proximity search etc. */
127 
128 	ulint		distance;	/*!< The proximity distance of a
129 					phrase search. */
130 
131 					/*!< These doc ids are used as a
132 					boundary condition when searching the
133 					FTS index rows */
134 
135 	doc_id_t	lower_doc_id;	/*!< Lowest doc id in doc_ids */
136 
137 	doc_id_t	upper_doc_id;	/*!< Highest doc id in doc_ids */
138 
139 	bool		boolean_mode;	/*!< TRUE if boolean mode query */
140 
141 	ib_vector_t*	matched;	/*!< Array of matching documents
142 					(fts_match_t) to search for a phrase */
143 
144 	ib_vector_t**	match_array;	/*!< Used for proximity search, contains
145 					position info for each matched word
146 					in the word list */
147 
148 	ib_uint64_t	total_docs;	/*!< The total number of documents */
149 
150 	ulint		total_words;	/*!< The total number of words */
151 
152 	dberr_t		error;		/*!< Error code if any, that is
153 					encountered during query processing */
154 
155 	ib_rbt_t*	word_freqs;	/*!< RB tree of word frequencies per
156 					document, its elements are of type
157 					fts_word_freq_t */
158 
159 	ib_rbt_t*	wildcard_words;	/*!< words with wildcard */
160 
161 	bool		multi_exist;	/*!< multiple FTS_EXIST oper */
162 
163 	st_mysql_ftparser*	parser;	/*!< fts plugin parser */
164 
165 	/** limit value for the fts query */
166 	ulonglong		limit;
167 
168 	/** number of docs fetched by query. This is to restrict the
169 	result with limit value */
170 	ulonglong		n_docs;
171 
172 	ulint		nested_exp_count; /*!< number of nested sub-expressions;
173 					  checked against the nesting limit */
174 };
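
/* Roughly, a query is processed as follows: the search string is parsed into
an abstract syntax tree (root), each visited term sets 'oper' and fetches the
matching FTS index nodes from the in-memory index cache and the auxiliary
index tables, the decoded doc ids are merged into 'doc_ids' (via
'intersection' for FTS_EXIST terms), and the surviving doc ids are finally
ranked using the word frequencies collected in 'word_freqs'. */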
175 
176 /** For phrase matching, first we collect the documents and the positions
177 then we match. */
178 struct fts_match_t {
179 	doc_id_t	doc_id;		/*!< Document id */
180 
181 	ulint		start;		/*!< Start the phrase match from
182 					this offset within the positions
183 					vector. */
184 
185 	ib_vector_t*	positions;	/*!< Offsets of a word in a
186 					document */
187 };
188 
189 /** For matching tokens in a phrase search. We use this data structure in
190 the callback that determines whether a document should be accepted or
191 rejected for a phrase search. */
192 struct fts_select_t {
193 	doc_id_t	doc_id;		/*!< The document id to match */
194 
195 	ulint		min_pos;	/*!< For found to be TRUE at least
196 					one position must be greater than
197 					min_pos. */
198 
199 	ibool		found;		/*!< TRUE if found */
200 
201 	fts_word_freq_t*
202 			word_freq;	/*!< Word frequency instance of the
203 					current word being looked up in
204 					the FTS index */
205 };
206 
207 typedef std::vector<ulint, ut_allocator<ulint> >       pos_vector_t;
208 
209 /** This structure defines a set of ranges over the original documents, each
210 of which has a minimum and a maximum position. The text in such a range
211 should contain all the words of the proximity search. We then count the
212 words in the range to make sure the count does not exceed the distance
213 specified by the proximity search. */
214 struct fts_proximity_t {
215 	ulint		n_pos;		/*!< number of position set, defines
216 					a range (min to max) containing all
217 					matching words */
218 	pos_vector_t	min_pos;	/*!< the minimum position (in bytes)
219 					of the range */
220 	pos_vector_t	max_pos;	/*!< the maximum position (in bytes)
221 					of the range */
222 };
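
/* For example, with n_pos == 2 the ranges could be [min_pos[0], max_pos[0]]
== [120, 170] and [min_pos[1], max_pos[1]] == [400, 480] (byte offsets). A
range qualifies only if it spans every queried word and contains no more
words than the proximity distance allows; see
fts_proximity_is_word_in_range(). */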
223 
224 /** The match positions and tokens to match */
225 struct fts_phrase_t {
226 	fts_phrase_t(const dict_table_t* table)
227 		:
228 		found(false),
229 		match(NULL),
230 		tokens(NULL),
231 		distance(0),
232 		charset(NULL),
233 		heap(NULL),
234 		page_size(dict_table_page_size(table)),
235 		proximity_pos(NULL),
236 		parser(NULL)
237 	{
238 	}
239 
240 	/** Match result */
241 	ibool			found;
242 
243 	/** Positions within text */
244 	const fts_match_t*	match;
245 
246 	/** Tokens to match */
247 	const ib_vector_t*	tokens;
248 
249 	/** For matching on proximity distance. Can be 0 for exact match */
250 	ulint			distance;
251 
252 	/** Phrase match charset */
253 	CHARSET_INFO*		charset;
254 
255 	/** Heap for word processing */
256 	mem_heap_t*		heap;
257 
258 	/** Row page size */
259 	const page_size_t	page_size;
260 
261 	/** Position info for proximity search verification. Records the
262 	min and max position of words matched */
263 	fts_proximity_t*	proximity_pos;
264 
265 	/** FTS plugin parser */
266 	st_mysql_ftparser*	parser;
267 };
268 
269 /** Parameter passed to fts phrase match by parser */
270 struct fts_phrase_param_t {
271 	fts_phrase_t*	phrase;		/*!< Match phrase instance */
272 	ulint		token_index;	/*!< Index of token to match next */
273 	mem_heap_t*	heap;		/*!< Heap for word processing */
274 };
275 
276 /** For storing the frequency of a word/term in a document */
277 struct fts_doc_freq_t {
278 	doc_id_t	doc_id;		/*!< Document id */
279 	ulint		freq;		/*!< Frequency of a word in a document */
280 };
281 
282 /** To determine the word frequency per document. */
283 struct fts_word_freq_t {
284 	fts_string_t	word;		/*!< Word for which we need the freq,
285 					it's allocated on the query heap */
286 
287 	ib_rbt_t*	doc_freqs;	/*!< RB Tree for storing per document
288 					word frequencies. The elements are
289 					of type fts_doc_freq_t */
290 	ib_uint64_t	doc_count;	/*!< Total number of documents that
291 					contain this word */
292 	double		idf;		/*!< Inverse document frequency */
293 };
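
/* The ranking code later in this file derives idf from doc_count and
fts_query_t::total_docs (roughly log10(total_docs / doc_count), see
fts_query_calculate_idf()) and combines it with the per-document frequencies
stored in doc_freqs to compute each matching row's rank. */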
294 
295 /********************************************************************
296 Callback function to fetch the rows in an FTS INDEX record.
297 @return always TRUE */
298 static
299 ibool
300 fts_query_index_fetch_nodes(
301 /*========================*/
302 	void*		row,		/*!< in: sel_node_t* */
303 	void*		user_arg);	/*!< in: pointer to ib_vector_t */
304 
305 /********************************************************************
306 Read and filter nodes.
307 @return DB_SUCCESS or error code */
308 static
309 dberr_t
310 fts_query_filter_doc_ids(
311 /*=====================*/
312 	fts_query_t*		query,		/*!< in: query instance */
313 	const fts_string_t*	word,		/*!< in: the current word */
314 	fts_word_freq_t*	word_freq,	/*!< in/out: word frequency */
315 	const fts_node_t*	node,		/*!< in: current FTS node */
316 	void*			data,		/*!< in: doc id ilist */
317 	ulint			len,		/*!< in: doc id ilist size */
318 	ibool			calc_doc_count);/*!< in: whether to remember doc
319 						count */
320 
321 #if 0
322 /*****************************************************************//***
323 Find a doc_id in a word's ilist.
324 @return TRUE if found. */
325 static
326 ibool
327 fts_query_find_doc_id(
328 /*==================*/
329 	fts_select_t*	select,		/*!< in/out: search the doc id selected,
330 					update the frequency if found. */
331 	void*		data,		/*!< in: doc id ilist */
332 	ulint		len);		/*!< in: doc id ilist size */
333 #endif
334 
335 /*************************************************************//**
336 This function implements a simple "blind" query expansion search:
337 words in documents found in the first search pass will be used as
338 search arguments to search the documents again, thus "expanding"
339 the search result set.
340 @return DB_SUCCESS if success, otherwise the error code */
341 static
342 dberr_t
343 fts_expand_query(
344 /*=============*/
345 	dict_index_t*	index,		/*!< in: FTS index to search */
346 	fts_query_t*	query)		/*!< in: query result, to be freed
347 					by the client */
348 	MY_ATTRIBUTE((nonnull, warn_unused_result));
349 /*************************************************************//**
350 This function finds documents that contain all the words in a
351 phrase or proximity search. For a proximity search, it also verifies
352 that the words are close enough to each other, within the specified
353 distance. It is called for both phrase and proximity searches.
354 @return TRUE if documents are found, FALSE if otherwise */
355 static
356 ibool
357 fts_phrase_or_proximity_search(
358 /*===========================*/
359 	fts_query_t*	query,		/*!< in/out:  query instance
360 					query->doc_ids might be instantiated
361 					with qualified doc IDs */
362 	ib_vector_t*	tokens);	/*!< in: Tokens contain words */
363 /*************************************************************//**
364 This function checks whether words in result documents are close to
365 each other (within proximity range as specified by "distance").
366 If "distance" is MAX_ULINT, then it will find all combinations of
367 positions of matching words and store min and max positions
368 in the "qualified_pos" for later verification.
369 @return true if words are close to each other, false if otherwise */
370 static
371 bool
372 fts_proximity_get_positions(
373 /*========================*/
374 	fts_match_t**		match,		/*!< in: array of matching items */
375 	ulint			num_match,	/*!< in: number of matching
376 						items */
377 	ulint			distance,	/*!< in: distance value
378 						for proximity search */
379 	fts_proximity_t*	qualified_pos);	/*!< out: the position info
380 						records ranges containing
381 						all matching words. */
382 #if 0
383 /********************************************************************
384 Get the total number of words in a documents. */
385 static
386 ulint
387 fts_query_terms_in_document(
388 /*========================*/
389 					/*!< out: DB_SUCCESS if all go well
390 					else error code */
391 	fts_query_t*	query,		/*!< in: FTS query state */
392 	doc_id_t	doc_id,		/*!< in: the doc id to check */
393 	ulint*		total);		/*!< out: total words in document */
394 #endif
395 
396 /********************************************************************
397 Compare two fts_doc_freq_t doc_ids.
398 @return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
399 UNIV_INLINE
400 int
401 fts_freq_doc_id_cmp(
402 /*================*/
403 	const void*	p1,			/*!< in: id1 */
404 	const void*	p2)			/*!< in: id2 */
405 {
406 	const fts_doc_freq_t*	fq1 = (const fts_doc_freq_t*) p1;
407 	const fts_doc_freq_t*	fq2 = (const fts_doc_freq_t*) p2;
408 
409 	return((int) (fq1->doc_id - fq2->doc_id));
410 }
411 
412 #if 0
413 /*******************************************************************//**
414 Print the table used for calculating LCS. */
415 static
416 void
417 fts_print_lcs_table(
418 /*================*/
419 	const ulint*	table,		/*!< in: array to print */
420 	ulint		n_rows,		/*!< in: total no. of rows */
421 	ulint		n_cols)		/*!< in: total no. of cols */
422 {
423 	ulint		i;
424 
425 	for (i = 0; i < n_rows; ++i) {
426 		ulint	j;
427 
428 		printf("\n");
429 
430 		for (j = 0; j < n_cols; ++j) {
431 
432 			printf("%2lu ", FTS_ELEM(table, n_cols, i, j));
433 		}
434 	}
435 }
436 
437 /********************************************************************
438 Find the longest common subsequence between the query string and
439 the document. */
440 static
441 ulint
442 fts_query_lcs(
443 /*==========*/
444 					/*!< out: LCS (length) between
445 					two ilists */
446 	const	ulint*	p1,		/*!< in: word positions of query */
447 	ulint	len_p1,			/*!< in: no. of elements in p1 */
448 	const	ulint*	p2,		/*!< in: word positions within document */
449 	ulint	len_p2)			/*!< in: no. of elements in p2 */
450 {
451 	int	i;
452 	ulint	len = 0;
453 	ulint	r = len_p1;
454 	ulint	c = len_p2;
455 	ulint	size = (r + 1) * (c + 1) * sizeof(ulint);
456 	ulint*	table = (ulint*) ut_malloc_nokey(size);
457 
458 	/* Traverse the table backwards, from the last row to the first and
459 	also from the last column to the first. We compute the smaller
460 	common subsequences first, then use the calculated values to determine
461 	the longest common subsequence. The result will be in TABLE[0][0]. */
462 	for (i = r; i >= 0; --i) {
463 		int	j;
464 
465 		for (j = c; j >= 0; --j) {
466 
467 			if (p1[i] == (ulint) -1 || p2[j] == (ulint) -1) {
468 
469 				FTS_ELEM(table, c, i, j) = 0;
470 
471 			} else if (p1[i] == p2[j]) {
472 
473 				FTS_ELEM(table, c, i, j) = FTS_ELEM(
474 					table, c, i + 1, j + 1) + 1;
475 
476 			} else {
477 
478 				ulint	value;
479 
480 				value = ut_max(
481 					FTS_ELEM(table, c, i + 1, j),
482 					FTS_ELEM(table, c, i, j + 1));
483 
484 				FTS_ELEM(table, c, i, j) = value;
485 			}
486 		}
487 	}
488 
489 	len = FTS_ELEM(table, c, 0, 0);
490 
491 	fts_print_lcs_table(table, r, c);
492 	printf("\nLen=%lu\n", len);
493 
494 	ut_free(table);
495 
496 	return(len);
497 }
498 #endif
499 
500 /*******************************************************************//**
501 Compare two fts_ranking_t instances on their rank value and doc ids, in
502 descending order of rank and ascending order of doc id.
503 @return 0 if p1 == p2, < 0 if p1 < p2, > 0 if p1 > p2 */
504 static
505 int
506 fts_query_compare_rank(
507 /*===================*/
508 	const void*	p1,		/*!< in: pointer to elem */
509 	const void*	p2)		/*!< in: pointer to elem */
510 {
511 	const fts_ranking_t*	r1 = (const fts_ranking_t*) p1;
512 	const fts_ranking_t*	r2 = (const fts_ranking_t*) p2;
513 
514 	if (r2->rank < r1->rank) {
515 		return(-1);
516 	} else if (r2->rank == r1->rank) {
517 
518 		if (r1->doc_id < r2->doc_id) {
519 			return(-1);
520 		} else if (r1->doc_id > r2->doc_id) {
521 			return(1);
522 		}
523 
524 		return(0);
525 	}
526 
527 	return(1);
528 }
529 
530 /*******************************************************************//**
531 Create words in ranking */
532 static
533 void
534 fts_ranking_words_create(
535 /*=====================*/
536 	fts_query_t*	query,		/*!< in: query instance */
537 	fts_ranking_t*	ranking)	/*!< in: ranking instance */
538 {
539 	ranking->words = static_cast<byte*>(
540 		mem_heap_zalloc(query->heap, RANKING_WORDS_INIT_LEN));
541 	ranking->words_len = RANKING_WORDS_INIT_LEN;
542 }
543 
544 /*
545 The optimization here is to use a char array (bitmap) in place of the words
546 rb tree in fts_ranking_t.
547 
548 This saves a lot of memory, except in some cases of QUERY EXPANSION.
549 
550 'word_map' is used as a word dictionary, in which the key is a word and the
551 value is a number. In 'fts_ranking_words_add' we first check whether the word
552 is already in 'word_map'. If not, we add it to 'word_map', assign it a
553 position (actually a number), and set the corresponding bit at that position
554 in the char array 'words' to '1'.
555 
556 'word_vector' is a backup of 'word_map': it lets us look up a word by its
557 position more quickly than searching 'word_map' by value. We use 'word_vector'
558 in 'fts_query_calculate_ranking' and 'fts_expand_query'; both scan the bitmap
559 'words' and, for each bit that is '1', fetch the word and then its word_freq.
560 */
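
/* For illustration: if a word is assigned position pos == 10 in 'word_map',
fts_ranking_words_add() below records it in the bitmap as

	byte_offset = pos / CHAR_BIT;		(10 / 8 == 1)
	bit_offset  = pos % CHAR_BIT;		(10 % 8 == 2)
	ranking->words[byte_offset] |= 1 << bit_offset;

and fts_ranking_words_get_next() recovers the word by scanning 'words' for
set bits and indexing 'word_vector' with the bit position. */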
561 
562 /*******************************************************************//**
563 Add a word into ranking */
564 static
565 void
566 fts_ranking_words_add(
567 /*==================*/
568 	fts_query_t*		query,		/*!< in: query instance */
569 	fts_ranking_t*		ranking,	/*!< in: ranking instance */
570 	const fts_string_t*	word)		/*!< in: term/word to add */
571 {
572 	ulint	pos;
573 	ulint	byte_offset;
574 	ulint	bit_offset;
575 	ib_rbt_bound_t	parent;
576 
577 	/* Note: we suppose the word map and vector are append-only. */
578 	ut_ad(query->word_vector->size() == rbt_size(query->word_map));
579 
580 	/* We use ib_rbt to simulate a map, f_n_char means position. */
581 	if (rbt_search(query->word_map, &parent, word) == 0) {
582 		fts_string_t*	result_word;
583 
584 		result_word = rbt_value(fts_string_t, parent.last);
585 		pos = result_word->f_n_char;
586 		ut_ad(pos < rbt_size(query->word_map));
587 	} else {
588 		/* Add the word to map. */
589 		fts_string_t	new_word;
590 
591 		pos = rbt_size(query->word_map);
592 
593 		fts_string_dup(&new_word, word, query->heap);
594 		new_word.f_n_char = pos;
595 
596 		rbt_add_node(query->word_map, &parent, &new_word);
597 		ut_ad(rbt_validate(query->word_map));
598 		query->word_vector->push_back(new_word);
599 	}
600 
601 	/* Check words len */
602 	byte_offset = pos / CHAR_BIT;
603 	if (byte_offset >= ranking->words_len) {
604 		byte*	words = ranking->words;
605 		ulint	words_len = ranking->words_len;
606 
607 		while (byte_offset >= words_len) {
608 			words_len *= 2;
609 		}
610 
611 		ranking->words = static_cast<byte*>(
612 			mem_heap_zalloc(query->heap, words_len));
613 		ut_memcpy(ranking->words, words, ranking->words_len);
614 		ranking->words_len = words_len;
615 	}
616 
617 	/* Set ranking words */
618 	ut_ad(byte_offset < ranking->words_len);
619 	bit_offset = pos % CHAR_BIT;
620 	ranking->words[byte_offset] |= 1 << bit_offset;
621 }
622 
623 /*******************************************************************//**
624 Get a word from a ranking
625 @return true if it's successful */
626 static
627 bool
628 fts_ranking_words_get_next(
629 /*=======================*/
630 	const	fts_query_t*	query,	/*!< in: query instance */
631 	fts_ranking_t*		ranking,/*!< in: ranking instance */
632 	ulint*			pos,	/*!< in/out: word start pos */
633 	fts_string_t*		word)	/*!< in/out: term/word to add */
634 {
635 	bool	ret = false;
636 	ulint	max_pos = ranking->words_len * CHAR_BIT;
637 
638 	/* Search for next word */
639 	while (*pos < max_pos) {
640 		ulint	byte_offset = *pos / CHAR_BIT;
641 		ulint	bit_offset = *pos % CHAR_BIT;
642 
643 		if (ranking->words[byte_offset] & (1 << bit_offset)) {
644 			ret = true;
645 			break;
646 		}
647 
648 		*pos += 1;
649 	};
650 
651 	/* Get next word from word vector */
652 	if (ret) {
653 		ut_ad(*pos < query->word_vector->size());
654 		*word = query->word_vector->at((size_t)*pos);
655 		*pos += 1;
656 	}
657 
658 	return ret;
659 }
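
#if 0
/* Illustrative only: the typical iteration pattern over a ranking's words,
as used for example by fts_merge_doc_ids() further below. */
static
void
fts_ranking_words_print(
/*====================*/
	const fts_query_t*	query,		/*!< in: query instance */
	fts_ranking_t*		ranking)	/*!< in: ranking to walk */
{
	ulint		pos = 0;
	fts_string_t	word;

	while (fts_ranking_words_get_next(query, ranking, &pos, &word)) {
		fprintf(stderr, "%.*s\n",
			(int) word.f_len, (const char*) word.f_str);
	}
}
#endif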
660 
661 /*******************************************************************//**
662 Add a word to the term frequency RB tree if it is not already there.
663 The word itself is duplicated onto the query heap.
664 @return pointer to the word's frequency entry */
665 static
666 fts_word_freq_t*
667 fts_query_add_word_freq(
668 /*====================*/
669 	fts_query_t*		query,		/*!< in: query instance */
670 	const fts_string_t*	word)		/*!< in: term/word to add */
671 {
672 	ib_rbt_bound_t		parent;
673 
674 	/* Lookup the word in our rb tree and add if it doesn't exist. */
675 	if (rbt_search(query->word_freqs, &parent, word) != 0) {
676 		fts_word_freq_t	word_freq;
677 
678 		memset(&word_freq, 0, sizeof(word_freq));
679 
680 		fts_string_dup(&word_freq.word, word, query->heap);
681 
682 		word_freq.doc_count = 0;
683 
684 		word_freq.doc_freqs = rbt_create(
685 			sizeof(fts_doc_freq_t), fts_freq_doc_id_cmp);
686 
687 		parent.last = rbt_add_node(
688 			query->word_freqs, &parent, &word_freq);
689 
690 		query->total_size += word->f_len
691 			+ SIZEOF_RBT_CREATE
692 			+ SIZEOF_RBT_NODE_ADD
693 			+ sizeof(fts_word_freq_t);
694 	}
695 
696 	return(rbt_value(fts_word_freq_t, parent.last));
697 }
698 
699 /*******************************************************************//**
700 Add a doc id to the doc frequency RB tree if it is not already there.
701 @return pointer to the doc id's frequency entry */
702 static
703 fts_doc_freq_t*
704 fts_query_add_doc_freq(
705 /*===================*/
706 	fts_query_t*	query,		/*!< in: query instance	*/
707 	ib_rbt_t*	doc_freqs,	/*!< in: rb tree of fts_doc_freq_t */
708 	doc_id_t	doc_id)		/*!< in: doc id to add */
709 {
710 	ib_rbt_bound_t	parent;
711 
712 	/* Lookup the doc id in our rb tree and add if it doesn't exist. */
713 	if (rbt_search(doc_freqs, &parent, &doc_id) != 0) {
714 		fts_doc_freq_t	doc_freq;
715 
716 		memset(&doc_freq, 0, sizeof(doc_freq));
717 
718 		doc_freq.freq = 0;
719 		doc_freq.doc_id = doc_id;
720 
721 		parent.last = rbt_add_node(doc_freqs, &parent, &doc_freq);
722 
723 		query->total_size += SIZEOF_RBT_NODE_ADD
724 			+ sizeof(fts_doc_freq_t);
725 	}
726 
727 	return(rbt_value(fts_doc_freq_t, parent.last));
728 }
729 
730 /*******************************************************************//**
731 Add the doc id to the query set only if it's not in the
732 deleted array. */
733 static
734 void
735 fts_query_union_doc_id(
736 /*===================*/
737 	fts_query_t*	query,		/*!< in: query instance */
738 	doc_id_t	doc_id,		/*!< in: the doc id to add */
739 	fts_rank_t	rank)		/*!< in: if non-zero, it is the
740 					rank associated with the doc_id */
741 {
742 	ib_rbt_bound_t	parent;
743 	ulint		size = ib_vector_size(query->deleted->doc_ids);
744 	fts_update_t*	array = (fts_update_t*) query->deleted->doc_ids->data;
745 
746 	/* Check if the doc id is deleted and it's not already in our set. */
747 	if (fts_bsearch(array, 0, static_cast<int>(size), doc_id) < 0
748 	    && rbt_search(query->doc_ids, &parent, &doc_id) != 0) {
749 
750 		fts_ranking_t	ranking;
751 
752 		ranking.rank = rank;
753 		ranking.doc_id = doc_id;
754 		fts_ranking_words_create(query, &ranking);
755 
756 		rbt_add_node(query->doc_ids, &parent, &ranking);
757 
758 		query->total_size += SIZEOF_RBT_NODE_ADD
759 			+ sizeof(fts_ranking_t) + RANKING_WORDS_INIT_LEN;
760 	}
761 }
762 
763 /*******************************************************************//**
764 Remove the doc id from the query set only if it's not in the
765 deleted set. */
766 static
767 void
768 fts_query_remove_doc_id(
769 /*====================*/
770 	fts_query_t*	query,		/*!< in: query instance */
771 	doc_id_t	doc_id)		/*!< in: the doc id to remove */
772 {
773 	ib_rbt_bound_t	parent;
774 	ulint		size = ib_vector_size(query->deleted->doc_ids);
775 	fts_update_t*	array = (fts_update_t*) query->deleted->doc_ids->data;
776 
777 	/* Check if the doc id is deleted and it's in our set. */
778 	if (fts_bsearch(array, 0, static_cast<int>(size), doc_id) < 0
779 	    && rbt_search(query->doc_ids, &parent, &doc_id) == 0) {
780 		ut_free(rbt_remove_node(query->doc_ids, parent.last));
781 
782 		ut_ad(query->total_size >=
783 		      SIZEOF_RBT_NODE_ADD + sizeof(fts_ranking_t));
784 		query->total_size -= SIZEOF_RBT_NODE_ADD
785 			+ sizeof(fts_ranking_t);
786 	}
787 }
788 
789 /*******************************************************************//**
790 Find the doc id in the query set but not in the deleted set, and artificially
791 downgrade or upgrade its ranking by a fixed value, possibly taking the ranking
792 below or above its normal range of 0 to 1. This is used for Boolean Search
793 operators such as the negation operator, which makes a word's contribution to
794 the row's relevance negative. */
795 static
796 void
797 fts_query_change_ranking(
798 /*====================*/
799 	fts_query_t*	query,		/*!< in: query instance */
800 	doc_id_t	doc_id,		/*!< in: the doc id whose ranking to change */
801 	ibool		downgrade)	/*!< in: Whether to downgrade ranking */
802 {
803 	ib_rbt_bound_t	parent;
804 	ulint		size = ib_vector_size(query->deleted->doc_ids);
805 	fts_update_t*	array = (fts_update_t*) query->deleted->doc_ids->data;
806 
807 	/* Check if the doc id is deleted and it's in our set. */
808 	if (fts_bsearch(array, 0, static_cast<int>(size), doc_id) < 0
809 	    && rbt_search(query->doc_ids, &parent, &doc_id) == 0) {
810 
811 		fts_ranking_t*	ranking;
812 
813 		ranking = rbt_value(fts_ranking_t, parent.last);
814 
815 		ranking->rank += downgrade ? RANK_DOWNGRADE : RANK_UPGRADE;
816 
817 		/* Clamp the rank to [-1.0, 1.0] after adjustment by
818 		RANK_DOWNGRADE (-1.0) or RANK_UPGRADE (1.0) */
819 		if (ranking->rank >= 1.0F) {
820 			ranking->rank = 1.0F;
821 		} else if (ranking->rank <= -1.0F) {
822 			ranking->rank = -1.0F;
823 		}
824 	}
825 }
826 
827 /*******************************************************************//**
828 Check the doc id in the query set only if it's not in the
829 deleted array. The doc ids that were found are stored in
830 another rb tree (fts_query_t::intersection). */
831 static
832 void
833 fts_query_intersect_doc_id(
834 /*=======================*/
835 	fts_query_t*	query,		/*!< in: query instance */
836 	doc_id_t	doc_id,		/*!< in: the doc id to add */
837 	fts_rank_t	rank)		/*!< in: if non-zero, it is the
838 					rank associated with the doc_id */
839 {
840 	ib_rbt_bound_t	parent;
841 	ulint		size = ib_vector_size(query->deleted->doc_ids);
842 	fts_update_t*	array = (fts_update_t*) query->deleted->doc_ids->data;
843 	fts_ranking_t*	ranking= NULL;
844 
845 	/* There are three types of intersect:
846 	   1. '+a': doc_ids is empty; add a doc to the intersection if it matches 'a'.
847 	   2. 'a +b': docs matching 'a' are in doc_ids; add a doc to the intersection
848 	      if it matches 'b'. If the doc is also in doc_ids, then change the
849 	      doc's rank and add 'a' to the doc's words.
850 	   3. '+a +b': docs matching '+a' are in doc_ids; add a doc to the intersection
851 	      if it matches 'b' and it is in doc_ids (multi_exist = true). */
852 
853 	/* Check if the doc id is deleted and it's in our set */
854 	if (fts_bsearch(array, 0, static_cast<int>(size), doc_id) < 0) {
855 		fts_ranking_t	new_ranking;
856 
857 		if (rbt_search(query->doc_ids, &parent, &doc_id) != 0) {
858 			if (query->multi_exist) {
859 				return;
860 			} else {
861 				new_ranking.words = NULL;
862 			}
863 		} else {
864 			ranking = rbt_value(fts_ranking_t, parent.last);
865 
866 			/* We've just checked the doc id before */
867 			if (ranking->words == NULL) {
868 				ut_ad(rbt_search(query->intersection, &parent,
869 					ranking) == 0);
870 				return;
871 			}
872 
873 			/* Merge rank */
874 			rank += ranking->rank;
875 			if (rank >= 1.0F) {
876 				rank = 1.0F;
877 			} else if (rank <= -1.0F) {
878 				rank = -1.0F;
879 			}
880 
881 			/* Take words */
882 			new_ranking.words = ranking->words;
883 			new_ranking.words_len = ranking->words_len;
884 		}
885 
886 		new_ranking.rank = rank;
887 		new_ranking.doc_id = doc_id;
888 
889 		if (rbt_search(query->intersection, &parent,
890 			       &new_ranking) != 0) {
891 			if (new_ranking.words == NULL) {
892 				fts_ranking_words_create(query, &new_ranking);
893 
894 				query->total_size += RANKING_WORDS_INIT_LEN;
895 			} else {
896 				/* Note that the intersection has taken
897 				ownership of the ranking data. */
898 				ranking->words = NULL;
899 			}
900 
901 			rbt_add_node(query->intersection,
902 				     &parent, &new_ranking);
903 
904 			query->total_size += SIZEOF_RBT_NODE_ADD
905 				+ sizeof(fts_ranking_t);
906 		}
907 	}
908 }
909 
910 /*******************************************************************//**
911 Free the document ranking rb tree. */
912 static
913 void
914 fts_query_free_doc_ids(
915 /*===================*/
916 	fts_query_t*	query,		/*!< in: query instance */
917 	ib_rbt_t*	doc_ids)	/*!< in: rb tree to free */
918 {
919 	const ib_rbt_node_t*	node;
920 
921 	for (node = rbt_first(doc_ids); node; node = rbt_first(doc_ids)) {
922 
923 		fts_ranking_t*	ranking;
924 
925 		ranking = rbt_value(fts_ranking_t, node);
926 
927 		if (ranking->words) {
928 			ranking->words = NULL;
929 		}
930 
931 		ut_free(rbt_remove_node(doc_ids, node));
932 
933 		ut_ad(query->total_size >=
934 		      SIZEOF_RBT_NODE_ADD + sizeof(fts_ranking_t));
935 		query->total_size -= SIZEOF_RBT_NODE_ADD
936 			+ sizeof(fts_ranking_t);
937 	}
938 
939 	rbt_free(doc_ids);
940 
941 	ut_ad(query->total_size >= SIZEOF_RBT_CREATE);
942 	query->total_size -= SIZEOF_RBT_CREATE;
943 }
944 
945 /*******************************************************************//**
946 Add the word to the document's "list" of matching words from
947 the query. We make a copy of the word from the query heap. */
948 static
949 void
950 fts_query_add_word_to_document(
951 /*===========================*/
952 	fts_query_t*		query,	/*!< in: query to update */
953 	doc_id_t		doc_id,	/*!< in: the document to update */
954 	const fts_string_t*	word)	/*!< in: the token to add */
955 {
956 	ib_rbt_bound_t		parent;
957 	fts_ranking_t*		ranking = NULL;
958 
959 	if (query->flags == FTS_OPT_RANKING) {
960 		return;
961 	}
962 
963 	/* First we search the intersection RB tree as it could have
964 	taken ownership of the words rb tree instance. */
965 	if (query->intersection
966 	    && rbt_search(query->intersection, &parent, &doc_id) == 0) {
967 
968 		ranking = rbt_value(fts_ranking_t, parent.last);
969 	}
970 
971 	if (ranking == NULL
972 	    && rbt_search(query->doc_ids, &parent, &doc_id) == 0) {
973 
974 		ranking = rbt_value(fts_ranking_t, parent.last);
975 	}
976 
977 	if (ranking != NULL) {
978 		fts_ranking_words_add(query, ranking, word);
979 	}
980 }
981 
982 /*******************************************************************//**
983 Check the node ilist. */
984 static
985 void
986 fts_query_check_node(
987 /*=================*/
988 	fts_query_t*		query,	/*!< in: query to update */
989 	const fts_string_t*	token,	/*!< in: the token to search */
990 	const fts_node_t*	node)	/*!< in: node to check */
991 {
992 	/* Skip nodes whose doc ids are out of range. */
993 	if (query->oper == FTS_EXIST
994 	    && ((query->upper_doc_id > 0
995 		&& node->first_doc_id > query->upper_doc_id)
996 		|| (query->lower_doc_id > 0
997 		    && node->last_doc_id < query->lower_doc_id))) {
998 
999 		/* Ignore */
1000 
1001 	} else {
1002 		int		ret;
1003 		ib_rbt_bound_t	parent;
1004 		ulint		ilist_size = node->ilist_size;
1005 		fts_word_freq_t*word_freqs;
1006 
1007 		/* The word must exist. */
1008 		ret = rbt_search(query->word_freqs, &parent, token);
1009 		ut_a(ret == 0);
1010 
1011 		word_freqs = rbt_value(fts_word_freq_t, parent.last);
1012 
1013 		query->error = fts_query_filter_doc_ids(
1014 					query, token, word_freqs, node,
1015 					node->ilist, ilist_size, TRUE);
1016 	}
1017 }
1018 
1019 /*****************************************************************//**
1020 Search index cache for word with wildcard match.
1021 @return number of words matched */
1022 static
1023 ulint
1024 fts_cache_find_wildcard(
1025 /*====================*/
1026 	fts_query_t*		query,		/*!< in: query instance */
1027 	const fts_index_cache_t*index_cache,	/*!< in: cache to search */
1028 	const fts_string_t*	token)		/*!< in: token to search */
1029 {
1030 	ib_rbt_bound_t		parent;
1031 	const ib_vector_t*	nodes = NULL;
1032 	fts_string_t		srch_text;
1033 	byte			term[FTS_MAX_WORD_LEN + 1];
1034 	ulint			num_word = 0;
1035 
1036 	srch_text.f_len = (token->f_str[token->f_len - 1] == '%')
1037 			? token->f_len - 1
1038 			: token->f_len;
1039 
1040 	strncpy((char*) term, (char*) token->f_str, srch_text.f_len);
1041 	term[srch_text.f_len] = '\0';
1042 	srch_text.f_str = term;
1043 
1044 	/* Lookup the word in the rb tree */
1045 	if (rbt_search_cmp(index_cache->words, &parent, &srch_text, NULL,
1046 			   innobase_fts_text_cmp_prefix) == 0) {
1047 		const fts_tokenizer_word_t*     word;
1048 		ulint				i;
1049 		const ib_rbt_node_t*		cur_node;
1050 		ibool				forward = FALSE;
1051 
1052 		word = rbt_value(fts_tokenizer_word_t, parent.last);
1053 		cur_node = parent.last;
1054 
1055 		while (innobase_fts_text_cmp_prefix(
1056 			index_cache->charset, &srch_text, &word->text) == 0) {
1057 
1058 			nodes = word->nodes;
1059 
1060 			for (i = 0; nodes && i < ib_vector_size(nodes); ++i) {
1061 				int                     ret;
1062 				const fts_node_t*       node;
1063 				ib_rbt_bound_t          freq_parent;
1064 				fts_word_freq_t*	word_freqs;
1065 
1066 				node = static_cast<const fts_node_t*>(
1067 					ib_vector_get_const(nodes, i));
1068 
1069 				ret = rbt_search(query->word_freqs,
1070 						 &freq_parent,
1071 						 &srch_text);
1072 
1073 				ut_a(ret == 0);
1074 
1075 				word_freqs = rbt_value(
1076 					fts_word_freq_t,
1077 					freq_parent.last);
1078 
1079 				query->error = fts_query_filter_doc_ids(
1080 					query, &srch_text,
1081 					word_freqs, node,
1082 					node->ilist, node->ilist_size, TRUE);
1083 
1084 				if (query->error != DB_SUCCESS) {
1085 					return(0);
1086 				}
1087 			}
1088 
1089 			num_word++;
1090 
1091 			if (!forward) {
1092 				cur_node = rbt_prev(
1093 					index_cache->words, cur_node);
1094 			} else {
1095 cont_search:
1096 				cur_node = rbt_next(
1097 					index_cache->words, cur_node);
1098 			}
1099 
1100 			if (!cur_node) {
1101 				break;
1102 			}
1103 
1104 			word = rbt_value(fts_tokenizer_word_t, cur_node);
1105 		}
1106 
1107 		if (!forward) {
1108 			forward = TRUE;
1109 			cur_node = parent.last;
1110 			goto cont_search;
1111 		}
1112 	}
1113 
1114 	return(num_word);
1115 }
1116 
1117 /*****************************************************************//**
1118 Set difference.
1119 @return DB_SUCCESS if all goes well */
1120 static MY_ATTRIBUTE((nonnull, warn_unused_result))
1121 dberr_t
1122 fts_query_difference(
1123 /*=================*/
1124 	fts_query_t*		query,	/*!< in: query instance */
1125 	const fts_string_t*	token)	/*!< in: token to search */
1126 {
1127 	ulint			n_doc_ids= 0;
1128 	trx_t*			trx = query->trx;
1129 	dict_table_t*		table = query->index->table;
1130 
1131 	ut_a(query->oper == FTS_IGNORE);
1132 
1133 #ifdef FTS_INTERNAL_DIAG_PRINT
1134 	{
1135 		ib::info	out;
1136 		out << "DIFFERENCE: Searching: '";
1137 		out.write(token->f_str, token->f_len);
1138 		out << "'";
1139 	}
1140 #endif
1141 
1142 	if (query->doc_ids) {
1143 		n_doc_ids = rbt_size(query->doc_ids);
1144 	}
1145 
1146 	/* There is nothing we can subtract from an empty set. */
1147 	if (query->doc_ids && !rbt_empty(query->doc_ids)) {
1148 		ulint			i;
1149 		fts_fetch_t		fetch;
1150 		const ib_vector_t*	nodes;
1151 		const fts_index_cache_t*index_cache;
1152 		que_t*			graph = NULL;
1153 		fts_cache_t*		cache = table->fts->cache;
1154 		dberr_t			error;
1155 
1156 		rw_lock_x_lock(&cache->lock);
1157 
1158 		index_cache = fts_find_index_cache(cache, query->index);
1159 
1160 		/* Must find the index cache */
1161 		ut_a(index_cache != NULL);
1162 
1163 		/* Search the cache for a matching word first. */
1164 		if (query->cur_node->term.wildcard
1165 		    && query->flags != FTS_PROXIMITY
1166 		    && query->flags != FTS_PHRASE) {
1167 			fts_cache_find_wildcard(query, index_cache, token);
1168 		} else {
1169 			nodes = fts_cache_find_word(index_cache, token);
1170 
1171 			for (i = 0; nodes && i < ib_vector_size(nodes)
1172 			     && query->error == DB_SUCCESS; ++i) {
1173 				const fts_node_t*	node;
1174 
1175 				node = static_cast<const fts_node_t*>(
1176 					ib_vector_get_const(nodes, i));
1177 
1178 				fts_query_check_node(query, token, node);
1179 			}
1180 		}
1181 
1182 		rw_lock_x_unlock(&cache->lock);
1183 
1184 		/* error is passed by 'query->error' */
1185 		if (query->error != DB_SUCCESS) {
1186 			ut_ad(query->error == DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
1187 			return(query->error);
1188 		}
1189 
1190 		/* Setup the callback args for filtering and
1191 		consolidating the ilist. */
1192 		fetch.read_arg = query;
1193 		fetch.read_record = fts_query_index_fetch_nodes;
1194 
1195 		error = fts_index_fetch_nodes(
1196 			trx, &graph, &query->fts_index_table, token, &fetch);
1197 
1198 		/* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */
1199 		ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS));
1200 		if (error != DB_SUCCESS) {
1201 			query->error = error;
1202 		}
1203 
1204 		fts_que_graph_free(graph);
1205 	}
1206 
1207 	/* The size can't increase. */
1208 	ut_a(rbt_size(query->doc_ids) <= n_doc_ids);
1209 
1210 	return(query->error);
1211 }
1212 
1213 /*****************************************************************//**
1214 Intersect the token doc ids with the current set.
1215 @return DB_SUCCESS if all goes well */
1216 static MY_ATTRIBUTE((nonnull, warn_unused_result))
1217 dberr_t
1218 fts_query_intersect(
1219 /*================*/
1220 	fts_query_t*		query,	/*!< in: query instance */
1221 	const fts_string_t*	token)	/*!< in: the token to search */
1222 {
1223 	trx_t*			trx = query->trx;
1224 	dict_table_t*		table = query->index->table;
1225 
1226 	ut_a(query->oper == FTS_EXIST);
1227 
1228 #ifdef FTS_INTERNAL_DIAG_PRINT
1229 	{
1230 		ib::info	out;
1231 		out << "INTERSECT: Searching: '";
1232 		out.write(token->f_str, token->f_len);
1233 		out << "'";
1234 	}
1235 #endif
1236 
1237 	/* If the current doc id set is already empty and multi_exist is true,
1238 	we know in advance that the intersection will be empty. */
1239 	if (!(rbt_empty(query->doc_ids) && query->multi_exist)) {
1240 		ulint                   n_doc_ids = 0;
1241 		ulint			i;
1242 		fts_fetch_t		fetch;
1243 		const ib_vector_t*	nodes;
1244 		const fts_index_cache_t*index_cache;
1245 		que_t*			graph = NULL;
1246 		fts_cache_t*		cache = table->fts->cache;
1247 		dberr_t			error;
1248 
1249 		ut_a(!query->intersection);
1250 
1251 		n_doc_ids = rbt_size(query->doc_ids);
1252 
1253 		/* Create the rb tree that will hold the doc ids of
1254 		the intersection. */
1255 		query->intersection = rbt_create(
1256 			sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
1257 
1258 		query->total_size += SIZEOF_RBT_CREATE;
1259 
1260 		/* This is to avoid decompressing the ilist if the
1261 		node's ilist doc ids are out of range. */
1262 		if (!rbt_empty(query->doc_ids) && query->multi_exist) {
1263 			const ib_rbt_node_t*	node;
1264 			doc_id_t*		doc_id;
1265 
1266 			node = rbt_first(query->doc_ids);
1267 			doc_id = rbt_value(doc_id_t, node);
1268 			query->lower_doc_id = *doc_id;
1269 
1270 			node = rbt_last(query->doc_ids);
1271 			doc_id = rbt_value(doc_id_t, node);
1272 			query->upper_doc_id = *doc_id;
1273 
1274 		} else {
1275 			query->lower_doc_id = 0;
1276 			query->upper_doc_id = 0;
1277 		}
1278 
1279 		/* Search the cache for a matching word first. */
1280 
1281 		rw_lock_x_lock(&cache->lock);
1282 
1283 		/* Search for the index specific cache. */
1284 		index_cache = fts_find_index_cache(cache, query->index);
1285 
1286 		/* Must find the index cache. */
1287 		ut_a(index_cache != NULL);
1288 
1289 		if (query->cur_node->term.wildcard) {
1290 			/* Wildcard search the index cache */
1291 			fts_cache_find_wildcard(query, index_cache, token);
1292 		} else {
1293 			nodes = fts_cache_find_word(index_cache, token);
1294 
1295 			for (i = 0; nodes && i < ib_vector_size(nodes)
1296 			     && query->error == DB_SUCCESS; ++i) {
1297 				const fts_node_t*	node;
1298 
1299 				node = static_cast<const fts_node_t*>(
1300 					ib_vector_get_const(nodes, i));
1301 
1302 				fts_query_check_node(query, token, node);
1303 			}
1304 		}
1305 
1306 		rw_lock_x_unlock(&cache->lock);
1307 
1308 		/* error is passed by 'query->error' */
1309 		if (query->error != DB_SUCCESS) {
1310 			ut_ad(query->error == DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
1311 			return(query->error);
1312 		}
1313 
1314 		/* Setup the callback args for filtering and
1315 		consolidating the ilist. */
1316 		fetch.read_arg = query;
1317 		fetch.read_record = fts_query_index_fetch_nodes;
1318 
1319 		error = fts_index_fetch_nodes(
1320 			trx, &graph, &query->fts_index_table, token, &fetch);
1321 
1322 		/* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */
1323 		ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS));
1324 		if (error != DB_SUCCESS) {
1325 			query->error = error;
1326 		}
1327 
1328 		fts_que_graph_free(graph);
1329 
1330 		if (query->error == DB_SUCCESS) {
1331 			/* Make the intersection (rb tree) the current doc id
1332 			set and free the old set. */
1333 			fts_query_free_doc_ids(query, query->doc_ids);
1334 			query->doc_ids = query->intersection;
1335 			query->intersection = NULL;
1336 
1337 			ut_a(!query->multi_exist || (query->multi_exist
1338 			     && rbt_size(query->doc_ids) <= n_doc_ids));
1339 		}
1340 	}
1341 
1342 	return(query->error);
1343 }
1344 
1345 /*****************************************************************//**
1346 Query index cache.
1347 @return DB_SUCCESS if all goes well */
1348 static
1349 dberr_t
1350 fts_query_cache(
1351 /*============*/
1352 	fts_query_t*		query,	/*!< in/out: query instance */
1353 	const fts_string_t*	token)	/*!< in: token to search */
1354 {
1355 	const fts_index_cache_t*index_cache;
1356 	dict_table_t*		table = query->index->table;
1357 	fts_cache_t*		cache = table->fts->cache;
1358 
1359 	/* Search the cache for a matching word first. */
1360 	rw_lock_x_lock(&cache->lock);
1361 
1362 	/* Search for the index specific cache. */
1363 	index_cache = fts_find_index_cache(cache, query->index);
1364 
1365 	/* Must find the index cache. */
1366 	ut_a(index_cache != NULL);
1367 
1368 	if (query->cur_node->term.wildcard
1369 	    && query->flags != FTS_PROXIMITY
1370 	    && query->flags != FTS_PHRASE) {
1371 		/* Wildcard search the index cache */
1372 		fts_cache_find_wildcard(query, index_cache, token);
1373 	} else {
1374 		const ib_vector_t*      nodes;
1375 		ulint			i;
1376 
1377 		nodes = fts_cache_find_word(index_cache, token);
1378 
1379 		for (i = 0; nodes && i < ib_vector_size(nodes)
1380 		     && query->error == DB_SUCCESS; ++i) {
1381 			const fts_node_t*	node;
1382 
1383 			node = static_cast<const fts_node_t*>(
1384 				ib_vector_get_const(nodes, i));
1385 
1386 			fts_query_check_node(query, token, node);
1387 		}
1388 	}
1389 
1390 	rw_lock_x_unlock(&cache->lock);
1391 
1392 	return(query->error);
1393 }
1394 
1395 /*****************************************************************//**
1396 Set union.
1397 @return DB_SUCCESS if all goes well */
1398 static MY_ATTRIBUTE((nonnull, warn_unused_result))
1399 dberr_t
1400 fts_query_union(
1401 /*============*/
1402 	fts_query_t*		query,	/*!< in: query instance */
1403 	fts_string_t*		token)	/*!< in: token to search */
1404 {
1405 	fts_fetch_t		fetch;
1406 	ulint			n_doc_ids = 0;
1407 	trx_t*			trx = query->trx;
1408 	que_t*			graph = NULL;
1409 	dberr_t			error;
1410 
1411 	ut_a(query->oper == FTS_NONE || query->oper == FTS_DECR_RATING ||
1412 	     query->oper == FTS_NEGATE || query->oper == FTS_INCR_RATING);
1413 
1414 #ifdef FTS_INTERNAL_DIAG_PRINT
1415 	{
1416 		ib::info	out;
1417 		out << "UNION: Searching: '";
1418 		out.write(token->f_str, token->f_len);
1419 		out << "'";
1420 	}
1421 #endif
1422 
1423 	if (query->doc_ids) {
1424 		n_doc_ids = rbt_size(query->doc_ids);
1425 	}
1426 
1427 	if (token->f_len == 0) {
1428 		return(query->error);
1429 	}
1430 
1431 	fts_query_cache(query, token);
1432 
1433 	/* Setup the callback args for filtering and
1434 	consolidating the ilist. */
1435 	fetch.read_arg = query;
1436 	fetch.read_record = fts_query_index_fetch_nodes;
1437 
1438 	/* Read the nodes from disk. */
1439 	error = fts_index_fetch_nodes(
1440 		trx, &graph, &query->fts_index_table, token, &fetch);
1441 
1442 	/* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */
1443 	ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS));
1444 	if (error != DB_SUCCESS) {
1445 		query->error = error;
1446 	}
1447 
1448 	fts_que_graph_free(graph);
1449 
1450 	if (query->error == DB_SUCCESS) {
1451 
1452 		/* The size can't decrease. */
1453 		ut_a(rbt_size(query->doc_ids) >= n_doc_ids);
1454 
1455 		/* Calculate the number of doc ids that were added to
1456 		the current doc id set. */
1457 		if (query->doc_ids) {
1458 			n_doc_ids = rbt_size(query->doc_ids) - n_doc_ids;
1459 		}
1460 	}
1461 
1462 	return(query->error);
1463 }
1464 
1465 /*****************************************************************//**
1466 Process the doc id depending upon the current query operator.
1467 @return DB_SUCCESS if all goes well,
1468 or DB_FTS_EXCEED_RESULT_CACHE_LIMIT if the result cache limit is exceeded */
1469 static
1470 dberr_t
1471 fts_query_process_doc_id(
1472 /*=====================*/
1473 	fts_query_t*	query,		/*!< in: query instance */
1474 	doc_id_t	doc_id,		/*!< in: doc id to process */
1475 	fts_rank_t	rank)		/*!< in: if non-zero, it is the
1476 					rank associated with the doc_id */
1477 {
1478 	if (query->flags == FTS_OPT_RANKING) {
1479 		return(DB_SUCCESS);
1480 	}
1481 
1482 	switch (query->oper) {
1483 	case FTS_NONE:
1484 		fts_query_union_doc_id(query, doc_id, rank);
1485 		break;
1486 
1487 	case FTS_EXIST:
1488 		fts_query_intersect_doc_id(query, doc_id, rank);
1489 		break;
1490 
1491 	case FTS_IGNORE:
1492 		fts_query_remove_doc_id(query, doc_id);
1493 		break;
1494 
1495 	case FTS_NEGATE:
1496 		fts_query_change_ranking(query, doc_id, TRUE);
1497 		break;
1498 
1499 	case FTS_DECR_RATING:
1500 		fts_query_union_doc_id(query, doc_id, rank);
1501 		fts_query_change_ranking(query, doc_id, TRUE);
1502 		break;
1503 
1504 	case FTS_INCR_RATING:
1505 		fts_query_union_doc_id(query, doc_id, rank);
1506 		fts_query_change_ranking(query, doc_id, FALSE);
1507 		break;
1508 
1509 	default:
1510 		ut_error;
1511 	}
1512 
1513 	if (query->total_size > fts_result_cache_limit) {
1514 		return(DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
1515 	} else {
1516 		return(DB_SUCCESS);
1517 	}
1518 }
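
/* For example, for the boolean query '+apple -banana cherry' this function
is, roughly speaking, reached with oper == FTS_EXIST for "apple"
(fts_query_intersect_doc_id()), oper == FTS_IGNORE for "banana"
(fts_query_remove_doc_id()) and oper == FTS_NONE for "cherry"
(fts_query_union_doc_id()). */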
1519 
1520 /*****************************************************************//**
1521 Merge two result sets. */
1522 static
1523 dberr_t
1524 fts_merge_doc_ids(
1525 /*==============*/
1526 	fts_query_t*	query,		/*!< in,out: query instance */
1527 	const ib_rbt_t*	doc_ids)	/*!< in: result set to merge */
1528 {
1529 	const ib_rbt_node_t*	node;
1530 
1531 	DBUG_ENTER("fts_merge_doc_ids");
1532 
1533 	ut_a(!query->intersection);
1534 
1535 	/* To process FTS_EXIST operation (intersection), we need
1536 	to create a new result set for fts_query_intersect(). */
1537 	if (query->oper == FTS_EXIST) {
1538 
1539 		query->intersection = rbt_create(
1540 			sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
1541 
1542 		query->total_size += SIZEOF_RBT_CREATE;
1543 	}
1544 
1545 	/* Merge the elements to the result set. */
1546 	for (node = rbt_first(doc_ids); node; node = rbt_next(doc_ids, node)) {
1547 		fts_ranking_t*		ranking;
1548 		ulint			pos = 0;
1549 		fts_string_t		word;
1550 
1551 		ranking = rbt_value(fts_ranking_t, node);
1552 
1553 		query->error = fts_query_process_doc_id(
1554 				query, ranking->doc_id, ranking->rank);
1555 
1556 		if (query->error != DB_SUCCESS) {
1557 			DBUG_RETURN(query->error);
1558 		}
1559 
1560 		/* Merge words. Don't need to take operator into account. */
1561 		ut_a(ranking->words);
1562 		while (fts_ranking_words_get_next(query, ranking, &pos, &word)) {
1563 			fts_query_add_word_to_document(query, ranking->doc_id,
1564 						       &word);
1565 		}
1566 	}
1567 
1568 	/* If it is an intersection operation, reset query->doc_ids
1569 	to query->intersection and free the old result list. */
1570 	if (query->oper == FTS_EXIST && query->intersection != NULL) {
1571 		fts_query_free_doc_ids(query, query->doc_ids);
1572 		query->doc_ids = query->intersection;
1573 		query->intersection = NULL;
1574 	}
1575 
1576 	DBUG_RETURN(DB_SUCCESS);
1577 }
1578 
1579 /*****************************************************************//**
1580 Skip non-whitespace in a string. Move ptr to the next word boundary.
1581 @return pointer to first whitespace character or end */
1582 UNIV_INLINE
1583 byte*
1584 fts_query_skip_word(
1585 /*================*/
1586 	byte*		ptr,		/*!< in: start of scan */
1587 	const byte*	end)		/*!< in: pointer to end of string */
1588 {
1589 	/* TODO: Does this have to be UTF-8 too ? */
1590 	while (ptr < end && !(ispunct(*ptr) || isspace(*ptr))) {
1591 		++ptr;
1592 	}
1593 
1594 	return(ptr);
1595 }
1596 
1597 /*****************************************************************//**
1598 Check whether the remaining terms in the phrase match the text.
1599 @return TRUE if matched else FALSE */
1600 static
1601 ibool
1602 fts_query_match_phrase_terms(
1603 /*=========================*/
1604 	fts_phrase_t*	phrase,		/*!< in: phrase to match */
1605 	byte**		start,		/*!< in/out: text to search, we can't
1606 					make this const because we need to
1607 					first convert the string to
1608 					lowercase */
1609 	const byte*	end,		/*!< in: pointer to the end of
1610 					the string to search */
1611 	mem_heap_t*	heap)		/*!< in: heap */
1612 {
1613 	ulint			i;
1614 	byte*			ptr = *start;
1615 	const ib_vector_t*	tokens = phrase->tokens;
1616 	ulint			distance = phrase->distance;
1617 
1618 	/* We check only from the second term onwards, since the first
1619 	must have matched otherwise we wouldn't be here. */
1620 	for (i = 1; ptr < end && i < ib_vector_size(tokens); /* No op */) {
1621 		fts_string_t		match;
1622 		fts_string_t		cmp_str;
1623 		const fts_string_t*	token;
1624 		int			result;
1625 		ulint			ret;
1626 
1627 		ret = innobase_mysql_fts_get_token(
1628 			phrase->charset, ptr,
1629 			const_cast<byte*>(end), &match);
1630 
1631 		if (match.f_len > 0) {
1632 			/* Get next token to match. */
1633 			token = static_cast<const fts_string_t*>(
1634 				ib_vector_get_const(tokens, i));
1635 
1636 			fts_string_dup(&cmp_str, &match, heap);
1637 
1638 			result = innobase_fts_text_case_cmp(
1639 				phrase->charset, token, &cmp_str);
1640 
1641 			/* Skip the rest of the tokens if this one doesn't
1642 			match and the proximity distance is exceeded. */
1643 			if (result
1644 			    && (distance == ULINT_UNDEFINED
1645 				|| distance == 0)) {
1646 
1647 				break;
1648 			}
1649 
1650 			/* This token matched; move to the next token. */
1651 			if (result == 0) {
1652 				/* Advance the text to search by the length
1653 				of the last token. */
1654 				ptr += ret;
1655 
1656 				/* Advance to the next token. */
1657 				++i;
1658 			} else {
1659 
1660 				ut_a(distance != ULINT_UNDEFINED);
1661 
1662 				ptr = fts_query_skip_word(ptr, end);
1663 			}
1664 
1665 			/* Distance can be 0 for exact matches. */
1666 			if (distance != ULINT_UNDEFINED && distance > 0) {
1667 				--distance;
1668 			}
1669 		} else {
1670 			ptr += ret;
1671 		}
1672 	}
1673 
1674 	*start = ptr;
1675 
1676 	/* Can't be greater than the number of elements. */
1677 	ut_a(i <= ib_vector_size(tokens));
1678 
1679 	/* This is the case for multiple words. */
1680 	if (i == ib_vector_size(tokens)) {
1681 		phrase->found = TRUE;
1682 	}
1683 
1684 	return(phrase->found);
1685 }
1686 
1687 /*****************************************************************//**
1688 Callback function to count the number of words in position ranges,
1689 and check whether the word count is within the specified "phrase->distance".
1690 @return true if the number of words is within the "distance" */
1691 static
1692 bool
1693 fts_proximity_is_word_in_range(
1694 /*===========================*/
1695 	const fts_phrase_t*
1696 			phrase,		/*!< in: phrase with the search info */
1697 	byte*		start,		/*!< in: text to search */
1698 	ulint		total_len)	/*!< in: length of text */
1699 {
1700 	fts_proximity_t*	proximity_pos = phrase->proximity_pos;
1701 
1702 	ut_ad(proximity_pos->n_pos == proximity_pos->min_pos.size());
1703 	ut_ad(proximity_pos->n_pos == proximity_pos->max_pos.size());
1704 
1705 	/* Search each matched position pair (with min and max positions)
1706 	and count the number of words in the range */
1707 	for (ulint i = 0; i < proximity_pos->n_pos; i++) {
1708 		ulint		cur_pos = proximity_pos->min_pos[i];
1709 		ulint		n_word = 0;
1710 
1711 		ut_ad(proximity_pos->max_pos[i] <= total_len);
1712 
1713 		/* Walk through words in the range and count them */
1714 		while (cur_pos <= proximity_pos->max_pos[i]) {
1715 			ulint		len;
1716 			fts_string_t	str;
1717 
1718 			len = innobase_mysql_fts_get_token(
1719 				phrase->charset,
1720 				start + cur_pos,
1721 				start + total_len, &str);
1722 
1723 			if (len == 0) {
1724 				break;
1725 			}
1726 
1727 			/* Advance the position by "len" bytes */
1728 			cur_pos += len;
1729 
1730 			/* Record the number of words */
1731 			if (str.f_n_char > 0) {
1732 				n_word++;
1733 			}
1734 
1735 			if (n_word > phrase->distance) {
1736 				break;
1737 			}
1738 		}
1739 
1740 		/* Check if the number of words is within the specified
1741 		"distance" */
1742 		if (n_word && n_word <= phrase->distance) {
1743 			return(true);
1744 		}
1745 	}
1746 
1747 	return(false);
1748 }
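
/* Added illustrative numbers (hypothetical text, not from the original
source): if one matched pair has min_pos at the start of "red" and max_pos
at the start of "green" in the text "red apple and a green orange", the
inner loop above tokenizes red, apple, and, a, green, giving n_word == 5;
the range then qualifies when phrase->distance >= 5 and is rejected for any
smaller distance. */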
1749 
1750 /*****************************************************************//**
1751 FTS plugin parser 'mysql_add_word' callback function for phrase match.
1752 Refer to 'st_mysql_ftparser_param' for more detail.
1753 @return 0 if match, or return non-zero */
1754 static
1755 int
1756 fts_query_match_phrase_add_word_for_parser(
1757 /*=======================================*/
1758 	MYSQL_FTPARSER_PARAM*	param,		/*!< in: parser param */
1759 	char*			word,		/*!< in: token */
1760 	int			word_len,	/*!< in: token length */
1761 	MYSQL_FTPARSER_BOOLEAN_INFO* info)	/*!< in: token info */
1762 {
1763 	fts_phrase_param_t*	phrase_param;
1764 	fts_phrase_t*		phrase;
1765 	const ib_vector_t*	tokens;
1766 	fts_string_t		match;
1767 	fts_string_t		cmp_str;
1768 	const fts_string_t*	token;
1769 	int			result;
1770 	mem_heap_t*		heap;
1771 
1772 	phrase_param = static_cast<fts_phrase_param_t*>(param->mysql_ftparam);
1773 	heap = phrase_param->heap;
1774 	phrase = phrase_param->phrase;
1775 	tokens = phrase->tokens;
1776 
1777 	/* In case the plugin parser doesn't check the return value */
1778 	if (phrase_param->token_index == ib_vector_size(tokens)) {
1779 		return(1);
1780 	}
1781 
1782 	match.f_str = reinterpret_cast<byte*>(word);
1783 	match.f_len = word_len;
1784 	match.f_n_char = fts_get_token_size(phrase->charset, word, word_len);
1785 
1786 	if (match.f_len > 0) {
1787 		/* Get next token to match. */
1788 		ut_a(phrase_param->token_index < ib_vector_size(tokens));
1789 		token = static_cast<const fts_string_t*>(
1790 			ib_vector_get_const(tokens, phrase_param->token_index));
1791 
1792 		fts_string_dup(&cmp_str, &match, heap);
1793 
1794 		result = innobase_fts_text_case_cmp(
1795 			phrase->charset, token, &cmp_str);
1796 
1797 		if (result == 0) {
1798 			phrase_param->token_index++;
1799 		} else {
1800 			return(1);
1801 		}
1802 	}
1803 
1804 	/* Can't be greater than the number of elements. */
1805 	ut_a(phrase_param->token_index <= ib_vector_size(tokens));
1806 
1807 	/* This is the case for multiple words. */
1808 	if (phrase_param->token_index == ib_vector_size(tokens)) {
1809 		phrase->found = TRUE;
1810 	}
1811 
1812 	return(static_cast<int>(phrase->found));
1813 }
1814 
1815 /*****************************************************************//**
1816 Check whether the terms in the phrase match the text.
1817 @return TRUE if matched else FALSE */
1818 static
1819 ibool
1820 fts_query_match_phrase_terms_by_parser(
1821 /*===================================*/
1822 	fts_phrase_param_t*	phrase_param,	/*!< in/out: phrase param */
1823 	st_mysql_ftparser*	parser,		/*!< in: plugin fts parser */
1824 	byte*			text,		/*!< in: text to check */
1825 	ulint			len)		/*!< in: text length */
1826 {
1827 	MYSQL_FTPARSER_PARAM	param;
1828 
1829 	ut_a(parser);
1830 
1831 	/* Set the parameters for param */
1832 	param.mysql_parse = fts_tokenize_document_internal;
1833 	param.mysql_add_word = fts_query_match_phrase_add_word_for_parser;
1834 	param.mysql_ftparam = phrase_param;
1835 	param.cs = phrase_param->phrase->charset;
1836 	param.doc = reinterpret_cast<char*>(text);
1837 	param.length = static_cast<int>(len);
1838 	param.mode= MYSQL_FTPARSER_WITH_STOPWORDS;
1839 
1840 	PARSER_INIT(parser, &param);
1841 	parser->parse(&param);
1842 	PARSER_DEINIT(parser, &param);
1843 
1844 	return(phrase_param->phrase->found);
1845 }
1846 
1847 /*****************************************************************//**
1848 Callback function to fetch and search the document.
1849 @return TRUE if matched else FALSE */
1850 static
1851 ibool
1852 fts_query_match_phrase(
1853 /*===================*/
1854 	fts_phrase_t*	phrase,		/*!< in: phrase to match */
1855 	byte*		start,		/*!< in: text to search, we can't make
1856 					this const because we need to first
1857 					convert the string to lowercase */
1858 	ulint		cur_len,	/*!< in: length of text */
1859 	ulint		prev_len,	/*!< in: total length for searched
1860 					doc fields*/
1861 	mem_heap_t*	heap)		/*!< in: heap */
1862 {
1863 	ulint			i;
1864 	const fts_string_t*	first;
1865 	const byte*		end = start + cur_len;
1866 	const ib_vector_t*	tokens = phrase->tokens;
1867 	const ib_vector_t*	positions = phrase->match->positions;
1868 
1869 	ut_a(!phrase->found);
1870 	ut_a(phrase->match->doc_id > 0);
1871 	ut_a(ib_vector_size(tokens) > 0);
1872 	ut_a(ib_vector_size(positions) > 0);
1873 
1874 	first = static_cast<const fts_string_t*>(
1875 		ib_vector_get_const(tokens, 0));
1876 
1877 	ut_a(phrase->match->start < ib_vector_size(positions));
1878 
1879 	for (i = phrase->match->start; i < ib_vector_size(positions); ++i) {
1880 		ulint		pos;
1881 		byte*		ptr = start;
1882 
1883 		pos = *(ulint*) ib_vector_get_const(positions, i);
1884 
1885 		if (pos == ULINT_UNDEFINED) {
1886 			break;
1887 		}
1888 
1889 		if (pos < prev_len) {
1890 			continue;
1891 		}
1892 
1893 		/* Document positions are calculated from the beginning
1894 		of the first field, need to save the length for each
1895 		searched field to adjust the doc position when searching
1896 		for phrases. */
1897 		pos -= prev_len;
1898 		ptr = start + pos;
1899 
1900 		/* Within limits ? */
1901 		if (ptr >= end) {
1902 			break;
1903 		}
1904 
1905 		if (phrase->parser) {
1906 			fts_phrase_param_t	phrase_param;
1907 
1908 			phrase_param.phrase = phrase;
1909 			phrase_param.token_index = 0;
1910 			phrase_param.heap = heap;
1911 
1912 			if (fts_query_match_phrase_terms_by_parser(
1913 				&phrase_param,
1914 				phrase->parser,
1915 				ptr,
1916 				(end - ptr))) {
1917 				break;
1918 			}
1919 		} else {
1920 			fts_string_t	match;
1921 			fts_string_t	cmp_str;
1922 			ulint		ret;
1923 
1924 			match.f_str = ptr;
1925 			ret = innobase_mysql_fts_get_token(
1926 				phrase->charset, start + pos,
1927 				const_cast<byte*>(end), &match);
1928 
1929 			if (match.f_len == 0) {
1930 				break;
1931 			}
1932 
1933 			fts_string_dup(&cmp_str, &match, heap);
1934 
1935 			if (innobase_fts_text_case_cmp(
1936 				phrase->charset, first, &cmp_str) == 0) {
1937 
1938 				/* This is the case for the single word
1939 				in the phrase. */
1940 				if (ib_vector_size(phrase->tokens) == 1) {
1941 					phrase->found = TRUE;
1942 					break;
1943 				}
1944 
1945 				ptr += ret;
1946 
1947 				/* Match the remaining terms in the phrase. */
1948 				if (fts_query_match_phrase_terms(phrase, &ptr,
1949 								 end, heap)) {
1950 					break;
1951 				}
1952 			}
1953 		}
1954 	}
1955 
1956 	return(phrase->found);
1957 }
1958 
1959 /*****************************************************************//**
1960 Callback function to fetch and search the document.
1961 @return whether the phrase is found */
1962 static
1963 ibool
1964 fts_query_fetch_document(
1965 /*=====================*/
1966 	void*		row,		/*!< in:  sel_node_t* */
1967 	void*		user_arg)	/*!< in:  fts_doc_t* */
1968 {
1969 
1970 	que_node_t*	exp;
1971 	sel_node_t*	node = static_cast<sel_node_t*>(row);
1972 	fts_phrase_t*	phrase = static_cast<fts_phrase_t*>(user_arg);
1973 	ulint		prev_len = 0;
1974 	ulint		total_len = 0;
1975 	byte*		document_text = NULL;
1976 
1977 	exp = node->select_list;
1978 
1979 	phrase->found = FALSE;
1980 
1981 	/* For proximity search, we will need to get the whole document
1982 	from all fields, so first count the total length of the document
1983 	from all the fields */
1984 	if (phrase->proximity_pos) {
1985 		 while (exp) {
1986 			ulint		field_len;
1987 			dfield_t*	dfield = que_node_get_val(exp);
1988 			byte*		data = static_cast<byte*>(
1989 						dfield_get_data(dfield));
1990 
1991 			if (dfield_is_ext(dfield)) {
1992 				ulint	local_len = dfield_get_len(dfield);
1993 
1994 				local_len -= BTR_EXTERN_FIELD_REF_SIZE;
1995 
1996 				field_len = mach_read_from_4(
1997 					data + local_len + BTR_EXTERN_LEN + 4);
1998 			} else {
1999 				field_len = dfield_get_len(dfield);
2000 			}
2001 
2002 			if (field_len != UNIV_SQL_NULL) {
2003 				total_len += field_len + 1;
2004 			}
2005 
2006 			exp = que_node_get_next(exp);
2007 		}
2008 
2009 		document_text = static_cast<byte*>(mem_heap_zalloc(
2010 					phrase->heap, total_len));
2011 
2012 		if (!document_text) {
2013 			return(FALSE);
2014 		}
2015 	}
2016 
2017 	exp = node->select_list;
2018 
2019 	while (exp) {
2020 		dfield_t*	dfield = que_node_get_val(exp);
2021 		byte*		data = static_cast<byte*>(
2022 					dfield_get_data(dfield));
2023 		ulint		cur_len;
2024 
2025 		if (dfield_is_ext(dfield)) {
2026 			data = btr_copy_externally_stored_field(
2027 				&cur_len, data, phrase->page_size,
2028 				dfield_get_len(dfield), phrase->heap);
2029 		} else {
2030 			cur_len = dfield_get_len(dfield);
2031 		}
2032 
2033 		if (cur_len != UNIV_SQL_NULL && cur_len != 0) {
2034 			if (phrase->proximity_pos) {
2035 				ut_ad(prev_len + cur_len <= total_len);
2036 				memcpy(document_text + prev_len, data, cur_len);
2037 			} else {
2038 				/* For phrase search */
2039 				phrase->found =
2040 					fts_query_match_phrase(
2041 						phrase,
2042 						static_cast<byte*>(data),
2043 						cur_len, prev_len,
2044 						phrase->heap);
2045 			}
2046 
2047 			/* Document positions are calculated from the beginning
2048 			of the first field, need to save the length for each
2049 			searched field to adjust the doc position when searching
2050 			for phrases. */
2051 			prev_len += cur_len + 1;
2052 		}
2053 
2054 		if (phrase->found) {
2055 			break;
2056 		}
2057 
2058 		exp = que_node_get_next(exp);
2059 	}
2060 
2061 	if (phrase->proximity_pos) {
2062 		ut_ad(prev_len <= total_len);
2063 
2064 		phrase->found = fts_proximity_is_word_in_range(
2065 			phrase, document_text, total_len);
2066 	}
2067 
2068 	return(phrase->found);
2069 }
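
/* Added illustrative note (hypothetical row, not from the original
source): word positions stored in the FTS index are byte offsets from the
start of the first indexed column, with one separator byte between columns
(hence the "+ 1" above). For a row with title = "apple pie" (9 bytes) and
body = "green apple", the second "apple" would be recorded at position
9 + 1 + 6 = 16; while the body column is being scanned, prev_len is 10, so
fts_query_match_phrase() looks at start + (16 - 10), i.e. the "apple"
inside the body text. */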
2070 
2071 #if 0
2072 /********************************************************************
2073 Callback function to check whether a record was found or not. */
2074 static
2075 ibool
2076 fts_query_select(
2077 /*=============*/
2078 	void*		row,		/*!< in:  sel_node_t* */
2079 	void*		user_arg)	/*!< in:  fts_doc_t* */
2080 {
2081 	int		i;
2082 	que_node_t*	exp;
2083 	sel_node_t*	node = row;
2084 	fts_select_t*	select = user_arg;
2085 
2086 	ut_a(select->word_freq);
2087 	ut_a(select->word_freq->doc_freqs);
2088 
2089 	exp = node->select_list;
2090 
2091 	for (i = 0; exp && !select->found; ++i) {
2092 		dfield_t*	dfield = que_node_get_val(exp);
2093 		void*		data = dfield_get_data(dfield);
2094 		ulint		len = dfield_get_len(dfield);
2095 
2096 		switch (i) {
2097 		case 0: /* DOC_COUNT */
2098 			if (len != UNIV_SQL_NULL && len != 0) {
2099 
2100 				select->word_freq->doc_count +=
2101 					mach_read_from_4(data);
2102 			}
2103 			break;
2104 
2105 		case 1: /* ILIST */
2106 			if (len != UNIV_SQL_NULL && len != 0) {
2107 
2108 				fts_query_find_doc_id(select, data, len);
2109 			}
2110 			break;
2111 
2112 		default:
2113 			ut_error;
2114 		}
2115 
2116 		exp = que_node_get_next(exp);
2117 	}
2118 
2119 	return(FALSE);
2120 }
2121 
2122 /********************************************************************
2123 Read the rows from the FTS index, that match word and where the
2124 doc id is between first and last doc id.
2125 @return DB_SUCCESS if all go well else error code */
2126 static MY_ATTRIBUTE((nonnull, warn_unused_result))
2127 dberr_t
2128 fts_query_find_term(
2129 /*================*/
2130 	fts_query_t*		query,	/*!< in: FTS query state */
2131 	que_t**			graph,	/*!< in: prepared statement */
2132 	const fts_string_t*	word,	/*!< in: the word to fetch */
2133 	doc_id_t		doc_id,	/*!< in: doc id to match */
2134 	ulint*			min_pos,/*!< in/out: pos found must be
2135 					 greater than this minimum value. */
2136 	ibool*			found)	/*!< out: TRUE if found else FALSE */
2137 {
2138 	pars_info_t*		info;
2139 	dberr_t			error;
2140 	fts_select_t		select;
2141 	doc_id_t		match_doc_id;
2142 	trx_t*			trx = query->trx;
2143 	char			table_name[MAX_FULL_NAME_LEN];
2144 
2145 	trx->op_info = "fetching FTS index matching nodes";
2146 
2147 	if (*graph) {
2148 		info = (*graph)->info;
2149 	} else {
2150 		ulint	selected;
2151 
2152 		info = pars_info_create();
2153 
2154 		selected = fts_select_index(*word->f_str);
2155 		query->fts_index_table.suffix = fts_get_suffix(selected);
2156 
2157 		fts_get_table_name(&query->fts_index_table, table_name);
2158 		pars_info_bind_id(info, true, "index_table_name", table_name);
2159 	}
2160 
2161 	select.found = FALSE;
2162 	select.doc_id = doc_id;
2163 	select.min_pos = *min_pos;
2164 	select.word_freq = fts_query_add_word_freq(query, word->f_str);
2165 
2166 	pars_info_bind_function(info, "my_func", fts_query_select, &select);
2167 	pars_info_bind_varchar_literal(info, "word", word->f_str, word->f_len);
2168 
2169 	/* Convert to "storage" byte order. */
2170 	fts_write_doc_id((byte*) &match_doc_id, doc_id);
2171 
2172 	fts_bind_doc_id(info, "min_doc_id", &match_doc_id);
2173 
2174 	fts_bind_doc_id(info, "max_doc_id", &match_doc_id);
2175 
2176 	if (!*graph) {
2177 
2178 		*graph = fts_parse_sql(
2179 			&query->fts_index_table,
2180 			info,
2181 			"DECLARE FUNCTION my_func;\n"
2182 			"DECLARE CURSOR c IS"
2183 			" SELECT doc_count, ilist\n"
2184 			" FROM $index_table_name\n"
2185 			" WHERE word LIKE :word AND"
2186 			" first_doc_id <= :min_doc_id AND"
2187 			" last_doc_id >= :max_doc_id\n"
2188 			" ORDER BY first_doc_id;\n"
2189 			"BEGIN\n"
2190 			"\n"
2191 			"OPEN c;\n"
2192 			"WHILE 1 = 1 LOOP\n"
2193 			"  FETCH c INTO my_func();\n"
2194 			"  IF c % NOTFOUND THEN\n"
2195 			"    EXIT;\n"
2196 			"  END IF;\n"
2197 			"END LOOP;\n"
2198 			"CLOSE c;");
2199 	}
2200 
2201 	for (;;) {
2202 		error = fts_eval_sql(trx, *graph);
2203 
2204 		if (error == DB_SUCCESS) {
2205 
2206 			break;				/* Exit the loop. */
2207 		} else {
2208 
2209 			if (error == DB_LOCK_WAIT_TIMEOUT) {
2210 				ib::warn() << "lock wait timeout reading FTS"
2211 					" index. Retrying!";
2212 
2213 				trx->error_state = DB_SUCCESS;
2214 			} else {
2215 				ib::error() << error
2216 					<< " while reading FTS index.";
2217 
2218 				break;			/* Exit the loop. */
2219 			}
2220 		}
2221 	}
2222 
2223 	/* Value to return */
2224 	*found = select.found;
2225 
2226 	if (*found) {
2227 		*min_pos = select.min_pos;
2228 	}
2229 
2230 	return(error);
2231 }
2232 
2233 /********************************************************************
2234 Callback aggregator for int columns. */
2235 static
2236 ibool
2237 fts_query_sum(
2238 /*==========*/
2239 					/*!< out: always returns TRUE */
2240 	void*		row,		/*!< in:  sel_node_t* */
2241 	void*		user_arg)	/*!< in:  ulint* */
2242 {
2243 
2244 	que_node_t*	exp;
2245 	sel_node_t*	node = row;
2246 	ulint*		total = user_arg;
2247 
2248 	exp = node->select_list;
2249 
2250 	while (exp) {
2251 		dfield_t*	dfield = que_node_get_val(exp);
2252 		void*		data = dfield_get_data(dfield);
2253 		ulint		len = dfield_get_len(dfield);
2254 
2255 		if (len != UNIV_SQL_NULL && len != 0) {
2256 			*total += mach_read_from_4(data);
2257 		}
2258 
2259 		exp = que_node_get_next(exp);
2260 	}
2261 
2262 	return(TRUE);
2263 }
2264 
2265 /********************************************************************
2266 Calculate the total documents that contain a particular word (term).
2267 @return DB_SUCCESS if all go well else error code */
2268 static MY_ATTRIBUTE((nonnull, warn_unused_result))
2269 dberr_t
2270 fts_query_total_docs_containing_term(
2271 /*=================================*/
2272 	fts_query_t*		query,	/*!< in: FTS query state */
2273 	const fts_string_t*	word,	/*!< in: the word to check */
2274 	ulint*			total)	/*!< out: documents containing word */
2275 {
2276 	pars_info_t*		info;
2277 	dberr_t			error;
2278 	que_t*			graph;
2279 	ulint			selected;
2280 	trx_t*			trx = query->trx;
2281 	char			table_name[MAX_FULL_NAME_LEN];
2282 
2283 	trx->op_info = "fetching FTS index document count";
2284 
2285 	*total = 0;
2286 
2287 	info = pars_info_create();
2288 
2289 	pars_info_bind_function(info, "my_func", fts_query_sum, total);
2290 	pars_info_bind_varchar_literal(info, "word", word->f_str, word->f_len);
2291 
2292 	selected = fts_select_index(*word->f_str);
2293 
2294 	query->fts_index_table.suffix = fts_get_suffix(selected);
2295 
2296 	fts_get_table_name(&query->fts_index_table, table_name);
2297 
2298 	pars_info_bind_id(info, true, "index_table_name", table_name);
2299 
2300 	graph = fts_parse_sql(
2301 		&query->fts_index_table,
2302 		info,
2303 		"DECLARE FUNCTION my_func;\n"
2304 		"DECLARE CURSOR c IS"
2305 		" SELECT doc_count\n"
2306 		" FROM $index_table_name\n"
2307 		" WHERE word = :word"
2308 		" ORDER BY first_doc_id;\n"
2309 		"BEGIN\n"
2310 		"\n"
2311 		"OPEN c;\n"
2312 		"WHILE 1 = 1 LOOP\n"
2313 		"  FETCH c INTO my_func();\n"
2314 		"  IF c % NOTFOUND THEN\n"
2315 		"    EXIT;\n"
2316 		"  END IF;\n"
2317 		"END LOOP;\n"
2318 		"CLOSE c;");
2319 
2320 	for (;;) {
2321 		error = fts_eval_sql(trx, graph);
2322 
2323 		if (error == DB_SUCCESS) {
2324 
2325 			break;				/* Exit the loop. */
2326 		} else {
2327 
2328 			if (error == DB_LOCK_WAIT_TIMEOUT) {
2329 				ib::warn() << "lock wait timeout reading FTS"
2330 					" index. Retrying!";
2331 
2332 				trx->error_state = DB_SUCCESS;
2333 			} else {
2334 				ib::error() << error
2335 					<< " while reading FTS index.";
2336 
2337 				break;			/* Exit the loop. */
2338 			}
2339 		}
2340 	}
2341 
2342 	fts_que_graph_free(graph);
2343 
2344 	return(error);
2345 }
2346 
2347 /********************************************************************
2348 Get the total number of words in a document.
2349 @return DB_SUCCESS if all go well else error code */
2350 static MY_ATTRIBUTE((nonnull, warn_unused_result))
2351 dberr_t
2352 fts_query_terms_in_document(
2353 /*========================*/
2354 	fts_query_t*	query,		/*!< in: FTS query state */
2355 	doc_id_t	doc_id,		/*!< in: the word to check */
2356 	ulint*		total)		/*!< out: total words in document */
2357 {
2358 	pars_info_t*	info;
2359 	dberr_t		error;
2360 	que_t*		graph;
2361 	doc_id_t	read_doc_id;
2362 	trx_t*		trx = query->trx;
2363 	char		table_name[MAX_FULL_NAME_LEN];
2364 
2365 	trx->op_info = "fetching FTS document term count";
2366 
2367 	*total = 0;
2368 
2369 	info = pars_info_create();
2370 
2371 	pars_info_bind_function(info, "my_func", fts_query_sum, total);
2372 
2373 	/* Convert to "storage" byte order. */
2374 	fts_write_doc_id((byte*) &read_doc_id, doc_id);
2375 	fts_bind_doc_id(info, "doc_id", &read_doc_id);
2376 
2377 	query->fts_index_table.suffix = "DOC_ID";
2378 
2379 	fts_get_table_name(&query->fts_index_table, table_name);
2380 
2381 	pars_info_bind_id(info, true, "index_table_name", table_name);
2382 
2383 	graph = fts_parse_sql(
2384 		&query->fts_index_table,
2385 		info,
2386 		"DECLARE FUNCTION my_func;\n"
2387 		"DECLARE CURSOR c IS"
2388 		" SELECT count\n"
2389 		" FROM $index_table_name\n"
2390 		" WHERE doc_id = :doc_id;\n"
2391 		"BEGIN\n"
2392 		"\n"
2393 		"OPEN c;\n"
2394 		"WHILE 1 = 1 LOOP\n"
2395 		"  FETCH c INTO my_func();\n"
2396 		"  IF c % NOTFOUND THEN\n"
2397 		"    EXIT;\n"
2398 		"  END IF;\n"
2399 		"END LOOP;\n"
2400 		"CLOSE c;");
2401 
2402 	for (;;) {
2403 		error = fts_eval_sql(trx, graph);
2404 
2405 		if (error == DB_SUCCESS) {
2406 
2407 			break;				/* Exit the loop. */
2408 		} else {
2409 
2410 			if (error == DB_LOCK_WAIT_TIMEOUT) {
2411 				ib::warn() << "lock wait timeout reading FTS"
2412 					" doc id table. Retrying!";
2413 
2414 				trx->error_state = DB_SUCCESS;
2415 			} else {
2416 				ib::error() << error << " while reading FTS"
2417 					" doc id table.";
2418 
2419 				break;			/* Exit the loop. */
2420 			}
2421 		}
2422 	}
2423 
2424 	fts_que_graph_free(graph);
2425 
2426 	return(error);
2427 }
2428 #endif
2429 
2430 /*****************************************************************//**
2431 Retrieve the document and match the phrase tokens.
2432 @return DB_SUCCESS or error code */
2433 static MY_ATTRIBUTE((nonnull, warn_unused_result))
2434 dberr_t
2435 fts_query_match_document(
2436 /*=====================*/
2437 	ib_vector_t*	tokens,		/*!< in: phrase tokens */
2438 	fts_get_doc_t*	get_doc,	/*!< in: table and prepared statements */
2439 	fts_match_t*	match,		/*!< in: doc id and positions */
2440 	ulint		distance,	/*!< in: proximity distance */
2441 	st_mysql_ftparser* parser,	/*!< in: fts plugin parser */
2442 	ibool*		found)		/*!< out: TRUE if phrase found */
2443 {
2444 	dberr_t		error;
2445 	fts_phrase_t	phrase(get_doc->index_cache->index->table);
2446 
2447 	phrase.match = match;		/* Positions to match */
2448 	phrase.tokens = tokens;		/* Tokens to match */
2449 	phrase.distance = distance;
2450 	phrase.charset = get_doc->index_cache->charset;
2451 	phrase.heap = mem_heap_create(512);
2452 	phrase.parser = parser;
2453 
2454 	*found = phrase.found = FALSE;
2455 
2456 	error = fts_doc_fetch_by_doc_id(
2457 		get_doc, match->doc_id, NULL, FTS_FETCH_DOC_BY_ID_EQUAL,
2458 		fts_query_fetch_document, &phrase);
2459 
2460 	if (error != DB_SUCCESS) {
2461 		ib::error() << "(" << ut_strerr(error)
2462 			<< ") matching document.";
2463 	} else {
2464 		*found = phrase.found;
2465 	}
2466 
2467 	mem_heap_free(phrase.heap);
2468 
2469 	return(error);
2470 }
2471 
2472 /*****************************************************************//**
2473 This function fetches the original document and counts the words
2474 between the matching words to check whether they are within the
2475 specified distance. @return true if the words are within the distance */
2476 static MY_ATTRIBUTE((nonnull, warn_unused_result))
2477 bool
2478 fts_query_is_in_proximity_range(
2479 /*============================*/
2480 	const fts_query_t*	query,		/*!< in:  query instance */
2481 	fts_match_t**		match,		/*!< in: matching doc ids and positions */
2482 	fts_proximity_t*	qualified_pos)	/*!< in: position info for
2483 						qualified ranges */
2484 {
2485 	fts_get_doc_t	get_doc;
2486 	fts_cache_t*	cache = query->index->table->fts->cache;
2487 	dberr_t		err;
2488 
2489 	memset(&get_doc, 0x0, sizeof(get_doc));
2490 
2491 	rw_lock_x_lock(&cache->lock);
2492 	get_doc.index_cache = fts_find_index_cache(cache, query->index);
2493 	rw_lock_x_unlock(&cache->lock);
2494 	ut_a(get_doc.index_cache != NULL);
2495 
2496 	fts_phrase_t	phrase(get_doc.index_cache->index->table);
2497 
2498 	phrase.distance = query->distance;
2499 	phrase.charset = get_doc.index_cache->charset;
2500 	phrase.heap = mem_heap_create(512);
2501 	phrase.proximity_pos = qualified_pos;
2502 	phrase.found = FALSE;
2503 
2504 	err = fts_doc_fetch_by_doc_id(
2505 		&get_doc, match[0]->doc_id, NULL, FTS_FETCH_DOC_BY_ID_EQUAL,
2506 		fts_query_fetch_document, &phrase);
2507 
2508 	if (err != DB_SUCCESS) {
2509 		ib::error() << "(" << ut_strerr(err) << ") in verification"
2510 			" phase of proximity search";
2511 	}
2512 
2513 	/* Free the prepared statement. */
2514 	if (get_doc.get_document_graph) {
2515 		fts_que_graph_free(get_doc.get_document_graph);
2516 		get_doc.get_document_graph = NULL;
2517 	}
2518 
2519 	mem_heap_free(phrase.heap);
2520 
2521 	return(err == DB_SUCCESS && phrase.found);
2522 }
2523 
2524 /*****************************************************************//**
2525 Iterate over the matched document ids and search for the
2526 actual phrase in the text.
2527 @return DB_SUCCESS if all OK */
2528 static MY_ATTRIBUTE((nonnull, warn_unused_result))
2529 dberr_t
2530 fts_query_search_phrase(
2531 /*====================*/
2532 	fts_query_t*		query,		/*!< in: query instance */
2533 	ib_vector_t*		orig_tokens,	/*!< in: tokens to search,
2534 						with any stopwords in the
2535 						original phrase */
2536 	ib_vector_t*		tokens)		/*!< in: tokens that do
2537 						not include stopwords and
2538 						can be used to calculate
2539 						ranking */
2540 {
2541 	ulint			i;
2542 	fts_get_doc_t		get_doc;
2543 	ulint			n_matched;
2544 	fts_cache_t*		cache = query->index->table->fts->cache;
2545 
2546 	n_matched = ib_vector_size(query->matched);
2547 
2548 	/* Setup the doc retrieval infrastructure. */
2549 	memset(&get_doc, 0x0, sizeof(get_doc));
2550 
2551 	rw_lock_x_lock(&cache->lock);
2552 
2553 	get_doc.index_cache = fts_find_index_cache(cache, query->index);
2554 
2555 	/* Must find the index cache */
2556 	ut_a(get_doc.index_cache != NULL);
2557 
2558 	rw_lock_x_unlock(&cache->lock);
2559 
2560 #ifdef FTS_INTERNAL_DIAG_PRINT
2561 	ib::info() << "Start phrase search";
2562 #endif
2563 
2564 	/* Read the document from disk and do the actual
2565 	match, matching documents will be added to the current
2566 	doc id set. */
2567 	for (i = 0; i < n_matched && query->error == DB_SUCCESS; ++i) {
2568 		fts_match_t*	match;
2569 		ibool		found = FALSE;
2570 
2571 		match = static_cast<fts_match_t*>(
2572 			ib_vector_get(query->matched, i));
2573 
2574 		/* Skip the document ids that were filtered out by
2575 		an earlier pass. */
2576 		if (match->doc_id != 0) {
2577 
2578 			query->error = fts_query_match_document(
2579 				orig_tokens, &get_doc, match,
2580 				query->distance, query->parser, &found);
2581 
2582 			if (query->error == DB_SUCCESS && found) {
2583 				ulint	z;
2584 
2585 				query->error = fts_query_process_doc_id(query,
2586 							 match->doc_id, 0);
2587 				if (query->error != DB_SUCCESS) {
2588 					goto func_exit;
2589 				}
2590 
2591 				for (z = 0; z < ib_vector_size(tokens); z++) {
2592 					fts_string_t*   token;
2593 					token = static_cast<fts_string_t*>(
2594 						ib_vector_get(tokens, z));
2595 					fts_query_add_word_to_document(
2596 						query, match->doc_id, token);
2597 				}
2598 			}
2599 		}
2600 	}
2601 
2602 func_exit:
2603 	/* Free the prepared statement. */
2604 	if (get_doc.get_document_graph) {
2605 		fts_que_graph_free(get_doc.get_document_graph);
2606 		get_doc.get_document_graph = NULL;
2607 	}
2608 
2609 	return(query->error);
2610 }
2611 
2612 /** Split the phrase into tokens
2613 @param[in,out]	query		query instance
2614 @param[in]	node		query node to search
2615 @param[in,out]	tokens		token vector
2616 @param[in,out]	orig_tokens	original node tokens include stopword
2617 @param[in,out]	heap	mem heap */
2618 static
2619 void
2620 fts_query_phrase_split(
2621 	fts_query_t*		query,
2622 	const fts_ast_node_t*	node,
2623 	ib_vector_t*		tokens,
2624 	ib_vector_t*		orig_tokens,
2625 	mem_heap_t*		heap)
2626 {
2627 	fts_string_t		phrase;
2628 	ulint			len = 0;
2629 	ulint			cur_pos = 0;
2630 	fts_ast_node_t*		term_node = NULL;
2631 
2632 	if (node->type == FTS_AST_TEXT) {
2633 		phrase.f_str = node->text.ptr->str;
2634 		phrase.f_len = node->text.ptr->len;
2635 		len = phrase.f_len;
2636 	} else {
2637 		ut_ad(node->type == FTS_AST_PARSER_PHRASE_LIST);
2638 		phrase.f_str = NULL;
2639 		phrase.f_len = 0;
2640 		term_node = node->list.head;
2641 	}
2642 
2643 	while (true) {
2644 		fts_cache_t*	cache = query->index->table->fts->cache;
2645 		ulint		cur_len;
2646 		fts_string_t	result_str;
2647 
2648 		if (node->type == FTS_AST_TEXT) {
2649 			if (cur_pos >= len) {
2650 				break;
2651 			}
2652 
2653 			cur_len = innobase_mysql_fts_get_token(
2654 				query->fts_index_table.charset,
2655 				reinterpret_cast<const byte*>(phrase.f_str)
2656 				+ cur_pos,
2657 				reinterpret_cast<const byte*>(phrase.f_str)
2658 				+ len,
2659 				&result_str);
2660 
2661 			if (cur_len == 0) {
2662 				break;
2663 			}
2664 
2665 			cur_pos += cur_len;
2666 		} else {
2667 			ut_ad(node->type == FTS_AST_PARSER_PHRASE_LIST);
2668 			/* Term node in parser phrase list */
2669 			if (term_node == NULL) {
2670 				break;
2671 			}
2672 
2673 			ut_a(term_node->type == FTS_AST_TERM);
2674 			result_str.f_str = term_node->term.ptr->str;
2675 			result_str.f_len = term_node->term.ptr->len;
2676 			result_str.f_n_char = fts_get_token_size(
2677 				query->fts_index_table.charset,
2678 				reinterpret_cast<char*>(result_str.f_str),
2679 				result_str.f_len);
2680 
2681 			term_node = term_node->next;
2682 		}
2683 
2684 		if (result_str.f_n_char == 0) {
2685 			continue;
2686 		}
2687 
2688 		fts_string_t*	token = static_cast<fts_string_t*>(
2689 			ib_vector_push(tokens, NULL));
2690 		fts_string_dup(token, &result_str, heap);
2691 
2692 		if (fts_check_token(
2693 			   &result_str,
2694 			   cache->stopword_info.cached_stopword,
2695 			   query->index->is_ngram,
2696 			   query->fts_index_table.charset)) {
2697 			/* Add the word to the RB tree so that we can
2698 			calculate its frequency within a document. */
2699 			fts_query_add_word_freq(query, token);
2700 		} else {
2701 			ib_vector_pop(tokens);
2702 		}
2703 
2704 		/* We store all words, including stopwords, in the
2705 		"orig_tokens" vector, but skip any leading words
2706 		that are stopwords */
2707 		if (!ib_vector_is_empty(tokens)) {
2708 			fts_string_t*	orig_token = static_cast<fts_string_t*>(
2709 				ib_vector_push(orig_tokens, NULL));
2710 
2711 			orig_token->f_str = token->f_str;
2712 			orig_token->f_len = token->f_len;
2713 		}
2714 	}
2715 }
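
/* Added illustrative example (the stopword list is an assumption, not from
the original source): suppose "the" is in cached_stopword. Splitting the
phrase "the quick brown" pushes each word into "tokens", pops "the" again
because fts_check_token() rejects it, and, since "tokens" is still empty at
that point, never records it in "orig_tokens"; the result is
tokens = {quick, brown} and orig_tokens = {quick, brown}. For
"quick the brown", the embedded "the" is popped from "tokens" but still
copied into "orig_tokens", giving tokens = {quick, brown} and
orig_tokens = {quick, the, brown}, so the stopword still takes part in the
exact phrase matching later. */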
2716 
2717 /*****************************************************************//**
2718 Text/Phrase search.
2719 @return DB_SUCCESS or error code */
2720 static MY_ATTRIBUTE((warn_unused_result))
2721 dberr_t
2722 fts_query_phrase_search(
2723 /*====================*/
2724 	fts_query_t*		query,	/*!< in: query instance */
2725 	const fts_ast_node_t*	node)	/*!< in: node to search */
2726 {
2727 	ib_vector_t*		tokens;
2728 	ib_vector_t*		orig_tokens;
2729 	mem_heap_t*		heap = mem_heap_create(sizeof(fts_string_t));
2730 	ib_alloc_t*		heap_alloc;
2731 	ulint			num_token;
2732 
2733 	heap_alloc = ib_heap_allocator_create(heap);
2734 
2735 	tokens = ib_vector_create(heap_alloc, sizeof(fts_string_t), 4);
2736 	orig_tokens = ib_vector_create(heap_alloc, sizeof(fts_string_t), 4);
2737 
2738 	if (query->distance != ULINT_UNDEFINED && query->distance > 0) {
2739 		query->flags = FTS_PROXIMITY;
2740 	} else {
2741 		query->flags = FTS_PHRASE;
2742 	}
2743 
2744 	/* Split the phrase into tokens. */
2745 	fts_query_phrase_split(query, node, tokens, orig_tokens, heap);
2746 
2747 	num_token = ib_vector_size(tokens);
2748 	if (num_token > MAX_PROXIMITY_ITEM) {
2749 		query->error = DB_FTS_TOO_MANY_WORDS_IN_PHRASE;
2750 		goto func_exit;
2751 	}
2752 
2753 	ut_ad(ib_vector_size(orig_tokens) >= num_token);
2754 
2755 	/* Ignore empty strings. */
2756 	if (num_token > 0) {
2757 		fts_string_t*	token;
2758 		fts_fetch_t	fetch;
2759 		trx_t*		trx = query->trx;
2760 		fts_ast_oper_t	oper = query->oper;
2761 		que_t*		graph = NULL;
2762 		ulint		i;
2763 		dberr_t		error;
2764 
2765 		/* Create the vector for storing matching document ids
2766 		and the positions of the first token of the phrase. */
2767 		if (!query->matched) {
2768 			ib_alloc_t*	heap_alloc;
2769 
2770 			heap_alloc = ib_heap_allocator_create(heap);
2771 
2772 			if (!(query->flags & FTS_PROXIMITY)
2773 			    && !(query->flags & FTS_PHRASE)) {
2774 				query->matched = ib_vector_create(
2775 					heap_alloc, sizeof(fts_match_t),
2776 					64);
2777 			} else {
2778 				ut_a(num_token <= MAX_PROXIMITY_ITEM);
2779 				query->match_array =
2780 					(ib_vector_t**) mem_heap_alloc(
2781 						heap,
2782 						num_token *
2783 						sizeof(query->matched));
2784 
2785 				for (i = 0; i < num_token; i++) {
2786 					query->match_array[i] =
2787 					ib_vector_create(
2788 						heap_alloc, sizeof(fts_match_t),
2789 						64);
2790 				}
2791 
2792 				query->matched = query->match_array[0];
2793 			}
2794 		}
2795 
2796 		/* Setup the callback args for filtering and consolidating
2797 		the ilist. */
2798 		fetch.read_arg = query;
2799 		fetch.read_record = fts_query_index_fetch_nodes;
2800 
2801 		for (i = 0; i < num_token; i++) {
2802 			/* Search for the first word from the phrase. */
2803 			token = static_cast<fts_string_t*>(
2804 				ib_vector_get(tokens, i));
2805 
2806 			if (query->flags & FTS_PROXIMITY
2807 			    || query->flags & FTS_PHRASE) {
2808 				query->matched = query->match_array[i];
2809 			}
2810 
2811 			error = fts_index_fetch_nodes(
2812 				trx, &graph, &query->fts_index_table,
2813 				token, &fetch);
2814 
2815 			/* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */
2816 			ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS));
2817 			if (error != DB_SUCCESS) {
2818 				query->error = error;
2819 			}
2820 
2821 			fts_que_graph_free(graph);
2822 			graph = NULL;
2823 
2824 			fts_query_cache(query, token);
2825 
2826 			if (!(query->flags & FTS_PHRASE)
2827 			    && !(query->flags & FTS_PROXIMITY)) {
2828 				break;
2829 			}
2830 
2831 			/* If any of the tokens can't be found,
2832 			there is no need to continue the match */
2833 			if (ib_vector_is_empty(query->match_array[i])
2834 			    || query->error != DB_SUCCESS) {
2835 				goto func_exit;
2836 			}
2837 		}
2838 
2839 		/* Just a single word, no need to fetch the original
2840 		documents to do phrase matching */
2841 		if (ib_vector_size(orig_tokens) == 1
2842 		    && !ib_vector_is_empty(query->match_array[0])) {
2843 			fts_match_t*    match;
2844 			ulint		n_matched;
2845 
2846 			n_matched = ib_vector_size(query->match_array[0]);
2847 
2848 			for (i = 0; i < n_matched; i++) {
2849 				match = static_cast<fts_match_t*>(
2850 					ib_vector_get(
2851 						query->match_array[0], i));
2852 
2853 				query->error = fts_query_process_doc_id(
2854 						query, match->doc_id, 0);
2855 				if (query->error != DB_SUCCESS) {
2856 					goto func_exit;
2857 				}
2858 
2859 				fts_query_add_word_to_document(
2860 					query, match->doc_id, token);
2861 			}
2862 			query->oper = oper;
2863 			goto func_exit;
2864 		}
2865 
2866 		/* If we are doing a proximity search, verify the distance
2867 		between all words and check that they are within the specified distance. */
2868 		if (query->flags & FTS_PROXIMITY) {
2869 			fts_phrase_or_proximity_search(query, tokens);
2870 		} else {
2871 			ibool	matched;
2872 
2873 			/* Phrase Search case:
2874 			We filter out the doc ids that don't contain
2875 			all the tokens in the phrase. It's cheaper to
2876 			search the ilist than to bring the documents in
2877 			and then search through the text. Isolated
2878 			testing shows this also helps in mitigating disruption
2879 			of the buffer cache. */
2880 			matched = fts_phrase_or_proximity_search(query, tokens);
2881 			query->matched = query->match_array[0];
2882 
2883 			/* Read the actual text in and search for the phrase. */
2884 			if (matched) {
2885 				ut_ad(query->error == DB_SUCCESS);
2886 				query->error = fts_query_search_phrase(
2887 					query, orig_tokens, tokens);
2888 			}
2889 		}
2890 
2891 		/* Restore original operation. */
2892 		query->oper = oper;
2893 
2894 		if (query->error != DB_SUCCESS) {
2895 			goto func_exit;
2896 		}
2897 	}
2898 
2899 func_exit:
2900 	mem_heap_free(heap);
2901 
2902 	/* Don't need it anymore. */
2903 	query->matched = NULL;
2904 
2905 	return(query->error);
2906 }
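
/* Added walk-through (hypothetical doc ids, not from the original source):
for the phrase query '"green apple"', match_array[0] is filled with the doc
ids and positions for "green" (say docs 3, 7, 9) and match_array[1] with
those for "apple" (say 7, 9, 12). fts_phrase_or_proximity_search() keeps
only the doc ids present in every array, consulting the recorded positions
as well, and fts_query_search_phrase() then fetches those candidate
documents and verifies the exact phrase in the text before the doc ids are
added to the result set. */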
2907 
2908 /*****************************************************************//**
2909 Find the word and evaluate.
2910 @return DB_SUCCESS if all go well */
2911 static MY_ATTRIBUTE((nonnull, warn_unused_result))
2912 dberr_t
2913 fts_query_execute(
2914 /*==============*/
2915 	fts_query_t*		query,	/*!< in: query instance */
2916 	fts_string_t*		token)	/*!< in: token to search */
2917 {
2918 	switch (query->oper) {
2919 	case FTS_NONE:
2920 	case FTS_NEGATE:
2921 	case FTS_INCR_RATING:
2922 	case FTS_DECR_RATING:
2923 		query->error = fts_query_union(query, token);
2924 		break;
2925 
2926 	case FTS_EXIST:
2927 		query->error = fts_query_intersect(query, token);
2928 		break;
2929 
2930 	case FTS_IGNORE:
2931 		query->error = fts_query_difference(query, token);
2932 		break;
2933 
2934 	default:
2935 		ut_error;
2936 	}
2937 
2938 	return(query->error);
2939 }
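
/* Added illustrative mapping (an assumption about the boolean query
syntax, not stated in this file): for a query such as
'+apple -orange banana', the "+" term reaches this switch with FTS_EXIST
and is intersected with the current result set, the "-" term arrives with
FTS_IGNORE and is subtracted from it via fts_query_difference(), and the
unmarked term arrives with FTS_NONE and is merged in through
fts_query_union(). */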
2940 
2941 /*****************************************************************//**
2942 Create a wildcard string. It's the responsibility of the caller to
2943 free the byte* pointer. It's allocated using ut_malloc_nokey().
2944 @return ptr to allocated memory */
2945 static
2946 byte*
2947 fts_query_get_token(
2948 /*================*/
2949 	fts_ast_node_t*	node,		/*!< in: the current sub tree */
2950 	fts_string_t*	token)		/*!< in: token to create */
2951 {
2952 	ulint		str_len;
2953 	byte*		new_ptr = NULL;
2954 
2955 	str_len = node->term.ptr->len;
2956 
2957 	ut_a(node->type == FTS_AST_TERM);
2958 
2959 	token->f_len = str_len;
2960 	token->f_str = node->term.ptr->str;
2961 
2962 	if (node->term.wildcard) {
2963 
2964 		token->f_str = static_cast<byte*>(ut_malloc_nokey(str_len + 2));
2965 		token->f_len = str_len + 1;
2966 
2967 		memcpy(token->f_str, node->term.ptr->str, str_len);
2968 
2969 		token->f_str[str_len] = '%';
2970 		token->f_str[token->f_len] = 0;
2971 
2972 		new_ptr = token->f_str;
2973 	}
2974 
2975 	return(new_ptr);
2976 }
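
/* Added illustrative example (hypothetical query term, not from the
original source): for the term 'for*', node->term.wildcard is set, so
str_len + 2 = 5 bytes are allocated and the token becomes the
NUL-terminated string "for%" with f_len = 4; the returned pointer is what
the caller must later release with ut_free(). The trailing '%' is
presumably what lets the token match with a LIKE pattern, as in the
"word LIKE :word" predicate of the disabled fts_query_find_term() above. */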
2977 
2978 /*****************************************************************//**
2979 Visit every node of the AST. */
2980 static
2981 dberr_t
2982 fts_query_visitor(
2983 /*==============*/
2984 	fts_ast_oper_t	oper,		/*!< in: current operator */
2985 	fts_ast_node_t*	node,		/*!< in: the root of the current subtree */
2986 	void*		arg)		/*!< in: callback arg*/
2987 {
2988 	byte*		ptr;
2989 	fts_string_t	token;
2990 	fts_query_t*	query = static_cast<fts_query_t*>(arg);
2991 
2992 	ut_a(node);
2993 	DBUG_ENTER("fts_query_visitor");
2994 	DBUG_PRINT("fts", ("nodetype: %s", fts_ast_node_type_get(node->type)));
2995 
2996 	token.f_n_char = 0;
2997 	query->oper = oper;
2998 	query->cur_node = node;
2999 
3000 	switch (node->type) {
3001 	case FTS_AST_TEXT:
3002 	case FTS_AST_PARSER_PHRASE_LIST:
3003 
3004 		if (query->oper == FTS_EXIST) {
3005 			ut_ad(query->intersection == NULL);
3006 			query->intersection = rbt_create(
3007 				sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
3008 
3009 			query->total_size += SIZEOF_RBT_CREATE;
3010 		}
3011 
3012 		/* Set the current proximity distance. */
3013 		query->distance = node->text.distance;
3014 
3015 		/* Force collection of doc ids and the positions. */
3016 		query->collect_positions = TRUE;
3017 
3018 		query->error = fts_query_phrase_search(query, node);
3019 
3020 		query->collect_positions = FALSE;
3021 
3022 		if (query->oper == FTS_EXIST) {
3023 			fts_query_free_doc_ids(query, query->doc_ids);
3024 			query->doc_ids = query->intersection;
3025 			query->intersection = NULL;
3026 		}
3027 
3028 		break;
3029 
3030 	case FTS_AST_TERM:
3031 		token.f_str = node->term.ptr->str;
3032 		token.f_len = node->term.ptr->len;
3033 
3034 		/* Collect wildcard words for QUERY EXPANSION. */
3035 		if (node->term.wildcard && query->wildcard_words != NULL) {
3036 			ib_rbt_bound_t          parent;
3037 
3038 			if (rbt_search(query->wildcard_words, &parent, &token)
3039 			     != 0) {
3040 				fts_string_t	word;
3041 
3042 				fts_string_dup(&word, &token, query->heap);
3043 				rbt_add_node(query->wildcard_words, &parent,
3044 					     &word);
3045 			}
3046 		}
3047 
3048 		/* Add the word to our RB tree that will be used to
3049 		calculate this term's per-document frequency. */
3050 		fts_query_add_word_freq(query, &token);
3051 
3052 		ptr = fts_query_get_token(node, &token);
3053 		query->error = fts_query_execute(query, &token);
3054 
3055 		if (ptr) {
3056 			ut_free(ptr);
3057 		}
3058 
3059 		break;
3060 
3061 	case FTS_AST_SUBEXP_LIST:
3062 		query->error = fts_ast_visit_sub_exp(node, fts_query_visitor, arg);
3063 		break;
3064 
3065 	default:
3066 		ut_error;
3067 	}
3068 
3069 	if (query->oper == FTS_EXIST) {
3070 		query->multi_exist = true;
3071 	}
3072 
3073 	DBUG_RETURN(query->error);
3074 }
3075 
3076 /*****************************************************************//**
3077 Process (nested) sub-expression, create a new result set to store the
3078 sub-expression result by processing nodes under current sub-expression
3079 list. Merge the sub-expression result with that of parent expression list.
3080 @return DB_SUCCESS if all go well */
3081 dberr_t
3082 fts_ast_visit_sub_exp(
3083 /*==================*/
3084 	fts_ast_node_t*		node,		/*!< in,out: current root node */
3085 	fts_ast_callback	visitor,	/*!< in: callback function */
3086 	void*			arg)		/*!< in,out: arg for callback */
3087 {
3088 	fts_ast_oper_t		cur_oper;
3089 	fts_query_t*		query = static_cast<fts_query_t*>(arg);
3090 	ib_rbt_t*		parent_doc_ids;
3091 	ib_rbt_t*		subexpr_doc_ids;
3092 	dberr_t			error = DB_SUCCESS;
3093 	bool			will_be_ignored = false;
3094 	bool			multi_exist;
3095 
3096 	DBUG_ENTER("fts_ast_visit_sub_exp");
3097 
3098 	/* A sub-expression list may contain sub-expressions.
3099 	So we increase the sub-expression depth counter.
3100 	If this counter reaches the threshold then
3101 	we abort the search operation and report an error. */
3102 	if (query->nested_exp_count > FTS_MAX_NESTED_EXP) {
3103 		query->error = DB_FTS_TOO_MANY_NESTED_EXP;
3104 		DBUG_RETURN(query->error);
3105 	}
3106 	query->nested_exp_count++;
3107 
3108 
3109 	ut_a(node->type == FTS_AST_SUBEXP_LIST);
3110 
3111 	cur_oper = query->oper;
3112 
3113 	/* Save current result set */
3114 	parent_doc_ids = query->doc_ids;
3115 
3116 	/* Create new result set to store the sub-expression result. We
3117 	will merge this result set with the parent after processing. */
3118 	query->doc_ids = rbt_create(sizeof(fts_ranking_t),
3119 				    fts_ranking_doc_id_cmp);
3120 
3121 	query->total_size += SIZEOF_RBT_CREATE;
3122 
3123 	multi_exist = query->multi_exist;
3124 	query->multi_exist = false;
3125 	/* Process nodes in current sub-expression and store its
3126 	result set in query->doc_ids we created above. */
3127 	error = fts_ast_visit(FTS_NONE, node, visitor,
3128 			      arg, &will_be_ignored);
3129 
3130 	/* Reinstate parent node state */
3131 	query->multi_exist = multi_exist;
3132 	query->oper = cur_oper;
3133 
3134 	/* Merge the sub-expression result with the parent result set. */
3135 	subexpr_doc_ids = query->doc_ids;
3136 	query->doc_ids = parent_doc_ids;
3137 	if (error == DB_SUCCESS) {
3138 		error = fts_merge_doc_ids(query, subexpr_doc_ids);
3139 	}
3140 
3141 	/* Free current result set. Result already merged into parent. */
3142 	fts_query_free_doc_ids(query, subexpr_doc_ids);
3143 
3144 	DBUG_RETURN(error);
3145 }
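
/* Added walk-through (hypothetical query, not from the original source):
for 'apple +(pie juice)', the doc ids already collected for "apple" are
parked in parent_doc_ids while the sub-expression "(pie juice)" is
evaluated into a freshly created rb tree; afterwards the saved operator
(FTS_EXIST here) is restored and fts_merge_doc_ids() folds the
sub-expression result back into the parent set, after which the temporary
tree is freed. */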
3146 
3147 #if 0
3148 /*****************************************************************//***
3149 Check if the doc id exists in the ilist.
3150 @return TRUE if doc id found */
3151 static
3152 ulint
3153 fts_query_find_doc_id(
3154 /*==================*/
3155 	fts_select_t*	select,		/*!< in/out: contains the doc id to
3156 					find, we update the word freq if
3157 					document found */
3158 	void*		data,		/*!< in: doc id ilist */
3159 	ulint		len)		/*!< in: doc id ilist size */
3160 {
3161 	byte*		ptr = data;
3162 	doc_id_t	doc_id = 0;
3163 	ulint		decoded = 0;
3164 
3165 	/* Decode the ilist and search for selected doc_id. We also
3166 	calculate the frequency of the word in the document if found. */
3167 	while (decoded < len && !select->found) {
3168 		ulint		freq = 0;
3169 		ulint		min_pos = 0;
3170 		ulint		last_pos = 0;
3171 		ulint		pos = fts_decode_vlc(&ptr);
3172 
3173 		/* Add the delta. */
3174 		doc_id += pos;
3175 
3176 		while (*ptr) {
3177 			++freq;
3178 			last_pos += fts_decode_vlc(&ptr);
3179 
3180 			/* Only if min_pos is not set and the current
3181 			term exists in a position greater than the
3182 			min_pos of the previous term. */
3183 			if (min_pos == 0 && last_pos > select->min_pos) {
3184 				min_pos = last_pos;
3185 			}
3186 		}
3187 
3188 		/* Skip the end of word position marker. */
3189 		++ptr;
3190 
3191 		/* Bytes decoded so far. */
3192 		decoded = ptr - (byte*) data;
3193 
3194 		/* A word may exist in the document but we only consider a
3195 		match if it exists in a position that is greater than the
3196 		position of the previous term. */
3197 		if (doc_id == select->doc_id && min_pos > 0) {
3198 			fts_doc_freq_t*	doc_freq;
3199 
3200 			/* Add the doc id to the doc freq rb tree, if
3201 			the doc id doesn't exist it will be created. */
3202 			doc_freq = fts_query_add_doc_freq(
3203 				select->word_freq->doc_freqs, doc_id);
3204 
3205 			/* Avoid duplicating the frequency tally */
3206 			if (doc_freq->freq == 0) {
3207 				doc_freq->freq = freq;
3208 			}
3209 
3210 			select->found = TRUE;
3211 			select->min_pos = min_pos;
3212 		}
3213 	}
3214 
3215 	return(select->found);
3216 }
3217 #endif
3218 
3219 /*****************************************************************//**
3220 Read and filter nodes.
3221 @return DB_SUCCESS if all go well,
3222 or return DB_FTS_EXCEED_RESULT_CACHE_LIMIT */
3223 static
3224 dberr_t
3225 fts_query_filter_doc_ids(
3226 /*=====================*/
3227 	fts_query_t*		query,		/*!< in: query instance */
3228 	const fts_string_t*	word,		/*!< in: the current word */
3229 	fts_word_freq_t*	word_freq,	/*!< in/out: word frequency */
3230 	const fts_node_t*	node,		/*!< in: current FTS node */
3231 	void*			data,		/*!< in: doc id ilist */
3232 	ulint			len,		/*!< in: doc id ilist size */
3233 	ibool			calc_doc_count)	/*!< in: whether to remember doc count */
3234 {
3235 	byte*		ptr = static_cast<byte*>(data);
3236 	doc_id_t	doc_id = 0;
3237 	ulint		decoded = 0;
3238 	ib_rbt_t*	doc_freqs = word_freq->doc_freqs;
3239 
3240 	if (query->limit != ULONG_UNDEFINED
3241 	    && query->n_docs >= query->limit) {
3242 		return(DB_SUCCESS);
3243 	}
3244 
3245 	/* Decode the ilist and add the doc ids to the query doc_id set. */
3246 	while (decoded < len) {
3247 		ulint		freq = 0;
3248 		fts_doc_freq_t*	doc_freq;
3249 		fts_match_t*	match = NULL;
3250 		ulint		last_pos = 0;
3251 		ulint		pos = fts_decode_vlc(&ptr);
3252 
3253 		/* Some sanity checks. */
3254 		if (doc_id == 0) {
3255 			ut_a(pos == node->first_doc_id);
3256 		}
3257 
3258 		/* Add the delta. */
3259 		doc_id += pos;
3260 
3261 		if (calc_doc_count) {
3262 			word_freq->doc_count++;
3263 		}
3264 
3265 		/* We simply collect the matching instances here. */
3266 		if (query->collect_positions) {
3267 			ib_alloc_t*	heap_alloc;
3268 
3269 			/* Create a new fts_match_t instance. */
3270 			match = static_cast<fts_match_t*>(
3271 				ib_vector_push(query->matched, NULL));
3272 
3273 			match->start = 0;
3274 			match->doc_id = doc_id;
3275 			heap_alloc = ib_vector_allocator(query->matched);
3276 
3277 			/* Allocate from the same heap as the
3278 			parent container. */
3279 			match->positions = ib_vector_create(
3280 				heap_alloc, sizeof(ulint), 64);
3281 
3282 			query->total_size += sizeof(fts_match_t)
3283 				+ sizeof(ib_vector_t)
3284 				+ sizeof(ulint) * 64;
3285 		}
3286 
3287 		/* Unpack the positions within the document. */
3288 		while (*ptr) {
3289 			last_pos += fts_decode_vlc(&ptr);
3290 
3291 			/* Collect the matching word positions, for phrase
3292 			matching later. */
3293 			if (query->collect_positions) {
3294 				ib_vector_push(match->positions, &last_pos);
3295 			}
3296 
3297 			++freq;
3298 		}
3299 
3300 		/* End of list marker. */
3301 		last_pos = (ulint) -1;
3302 
3303 		if (query->collect_positions) {
3304 			ut_a(match != NULL);
3305 			ib_vector_push(match->positions, &last_pos);
3306 		}
3307 
3308 		/* Add the doc id to the doc freq rb tree, if the doc id
3309 		doesn't exist it will be created. */
3310 		doc_freq = fts_query_add_doc_freq(query, doc_freqs, doc_id);
3311 
3312 		/* Avoid duplicating frequency tally. */
3313 		if (doc_freq->freq == 0) {
3314 			doc_freq->freq = freq;
3315 		}
3316 
3317 		/* Skip the end of word position marker. */
3318 		++ptr;
3319 
3320 		/* Bytes decoded so far */
3321 		decoded = ptr - (byte*) data;
3322 
3323 		/* We simply collect the matching documents and the
3324 		positions here and match later. */
3325 		if (!query->collect_positions) {
3326 			/* We ignore error here and will check it later */
3327 			fts_query_process_doc_id(query, doc_id, 0);
3328 
3329 			/* Add the word to the document's matched RB tree. */
3330 			fts_query_add_word_to_document(query, doc_id, word);
3331 		}
3332 
3333 		if (query->limit != ULONG_UNDEFINED
3334 		    && query->limit <= ++query->n_docs) {
3335 			goto func_exit;
3336 		}
3337 	}
3338 
3339 	/* Some sanity checks. */
3340 	ut_a(doc_id == node->last_doc_id);
3341 
3342 func_exit:
3343 	if (query->total_size > fts_result_cache_limit) {
3344 		return(DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
3345 	} else {
3346 		query->n_docs = 0;
3347 		return(DB_SUCCESS);
3348 	}
3349 }
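
/* Added illustration, not part of the original source: a self-contained
sketch (kept out of the build) of the ilist layout decoded above, using
plain integers instead of the VLC-compressed bytes. The values and the
helper are hypothetical; only the delta/terminator structure mirrors the
format handled by fts_query_filter_doc_ids(). */
#if 0
#include <cstdio>

/* Decoded ilist for a node with first_doc_id = 10, last_doc_id = 14:
doc id delta, position deltas, 0 terminator, repeated per document. */
static const unsigned ilist_example[] = {10, 3, 17, 0, 4, 7, 0};

static void
ilist_example_decode()
{
	unsigned	doc_id = 0;
	size_t		i = 0;

	while (i < sizeof(ilist_example) / sizeof(ilist_example[0])) {
		doc_id += ilist_example[i++];	/* delta to previous doc id */

		unsigned	pos = 0;

		while (ilist_example[i] != 0) {	/* position deltas */
			pos += ilist_example[i++];
			printf("doc %u: word at position %u\n", doc_id, pos);
		}

		++i;				/* skip the 0 terminator */
	}
	/* Prints positions 3 and 20 for doc 10, and position 7 for doc 14. */
}
#endif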
3350 
3351 /*****************************************************************//**
3352 Read the FTS INDEX row.
3353 @return DB_SUCCESS if all go well. */
3354 static
3355 dberr_t
3356 fts_query_read_node(
3357 /*================*/
3358 	fts_query_t*		query,	/*!< in: query instance */
3359 	const fts_string_t*	word,	/*!< in: current word */
3360 	que_node_t*		exp)	/*!< in: query graph node */
3361 {
3362 	int			i;
3363 	int			ret;
3364 	fts_node_t		node;
3365 	ib_rbt_bound_t		parent;
3366 	fts_word_freq_t*	word_freq;
3367 	ibool			skip = FALSE;
3368 	fts_string_t		term;
3369 	byte			buf[FTS_MAX_WORD_LEN + 1];
3370 	dberr_t			error = DB_SUCCESS;
3371 
3372 	ut_a(query->cur_node->type == FTS_AST_TERM
3373 	     || query->cur_node->type == FTS_AST_TEXT
3374 	     || query->cur_node->type == FTS_AST_PARSER_PHRASE_LIST);
3375 
3376 	memset(&node, 0, sizeof(node));
3377 	term.f_str = buf;
3378 
3379 	/* Need to consider the wildcard search case: the word frequency
3380 	is created for the search string, not the actual word. So we need
3381 	to attribute the frequency to the search string instead. */
3382 	if (query->cur_node->type == FTS_AST_TERM
3383 	    && query->cur_node->term.wildcard) {
3384 
3385 		term.f_len = query->cur_node->term.ptr->len;
3386 		ut_ad(FTS_MAX_WORD_LEN >= term.f_len);
3387 		memcpy(term.f_str, query->cur_node->term.ptr->str, term.f_len);
3388 	} else {
3389 		term.f_len = word->f_len;
3390 		ut_ad(FTS_MAX_WORD_LEN >= word->f_len);
3391 		memcpy(term.f_str, word->f_str, word->f_len);
3392 	}
3393 
3394 	/* Lookup the word in our rb tree, it must exist. */
3395 	ret = rbt_search(query->word_freqs, &parent, &term);
3396 
3397 	ut_a(ret == 0);
3398 
3399 	word_freq = rbt_value(fts_word_freq_t, parent.last);
3400 
3401 	/* Start from 1 since the first column has been read by the caller.
3402 	Also, we rely on the order of the projected columns to filter
3403 	out ilists that are out of range, and we always want to read
3404 	the doc_count irrespective of the suitability of the row. */
3405 
3406 	for (i = 1; exp && !skip; exp = que_node_get_next(exp), ++i) {
3407 
3408 		dfield_t*	dfield = que_node_get_val(exp);
3409 		byte*		data = static_cast<byte*>(
3410 			dfield_get_data(dfield));
3411 		ulint		len = dfield_get_len(dfield);
3412 
3413 		ut_a(len != UNIV_SQL_NULL);
3414 
3415 		/* Note: The column numbers below must match the SELECT. */
3416 
3417 		switch (i) {
3418 		case 1: /* DOC_COUNT */
3419 			word_freq->doc_count += mach_read_from_4(data);
3420 			break;
3421 
3422 		case 2: /* FIRST_DOC_ID */
3423 			node.first_doc_id = fts_read_doc_id(data);
3424 
3425 			/* Skip nodes whose doc ids are out of range. */
3426 			if (query->oper == FTS_EXIST
3427 			    && query->upper_doc_id > 0
3428 			    && node.first_doc_id > query->upper_doc_id) {
3429 				skip = TRUE;
3430 			}
3431 			break;
3432 
3433 		case 3: /* LAST_DOC_ID */
3434 			node.last_doc_id = fts_read_doc_id(data);
3435 
3436 			/* Skip nodes whose doc ids are out of range. */
3437 			if (query->oper == FTS_EXIST
3438 			    && query->lower_doc_id > 0
3439 			    && node.last_doc_id < query->lower_doc_id) {
3440 				skip = TRUE;
3441 			}
3442 			break;
3443 
3444 		case 4: /* ILIST */
3445 
3446 			error = fts_query_filter_doc_ids(
3447 					query, &word_freq->word, word_freq,
3448 					&node, data, len, FALSE);
3449 
3450 			break;
3451 
3452 		default:
3453 			ut_error;
3454 		}
3455 	}
3456 
3457 	if (!skip) {
3458 		/* Make sure all columns were read. */
3459 
3460 		ut_a(i == 5);
3461 	}
3462 
3463 	return error;
3464 }
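
/* Added note (derived from the callback below, not a verbatim quote of the
SQL): fts_query_index_fetch_nodes() consumes the first select-list column
(the word) itself and hands the remaining columns to this function, so the
switch above expects doc_count, first_doc_id, last_doc_id and ilist in that
order; the loop therefore starts at i == 1 and the assertion i == 5 checks
that all four remaining columns were read whenever no doc id range check
skipped the row. */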
3465 
3466 /*****************************************************************//**
3467 Callback function to fetch the rows in an FTS INDEX record.
3468 @return always returns TRUE */
3469 static
3470 ibool
3471 fts_query_index_fetch_nodes(
3472 /*========================*/
3473 	void*		row,		/*!< in: sel_node_t* */
3474 	void*		user_arg)	/*!< in: pointer to fts_fetch_t */
3475 {
3476 	fts_string_t	key;
3477 	sel_node_t*	sel_node = static_cast<sel_node_t*>(row);
3478 	fts_fetch_t*	fetch = static_cast<fts_fetch_t*>(user_arg);
3479 	fts_query_t*	query = static_cast<fts_query_t*>(fetch->read_arg);
3480 	que_node_t*	exp = sel_node->select_list;
3481 	dfield_t*	dfield = que_node_get_val(exp);
3482 	void*		data = dfield_get_data(dfield);
3483 	ulint		dfield_len = dfield_get_len(dfield);
3484 
3485 	key.f_str = static_cast<byte*>(data);
3486 	key.f_len = dfield_len;
3487 
3488 	ut_a(dfield_len <= FTS_MAX_WORD_LEN);
3489 
3490 	/* Note: we pass error out by 'query->error' */
3491 	query->error = fts_query_read_node(query, &key, que_node_get_next(exp));
3492 
3493 	if (query->error != DB_SUCCESS) {
3494 		ut_ad(query->error == DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
3495 		return(FALSE);
3496 	} else {
3497 		return(TRUE);
3498 	}
3499 }
3500 
3501 /*****************************************************************//**
3502 Calculate the inverse document frequency (IDF) for all the terms. */
3503 static
3504 void
3505 fts_query_calculate_idf(
3506 /*====================*/
3507 	fts_query_t*	query)	/*!< in: Query state */
3508 {
3509 	const ib_rbt_node_t*	node;
3510 	ib_uint64_t		total_docs = query->total_docs;
3511 
3512 	/* Calculate the inverse document frequency (IDF) for each
3513 	word that appears in the query. */
3514 	for (node = rbt_first(query->word_freqs);
3515 	     node;
3516 	     node = rbt_next(query->word_freqs, node)) {
3517 
3518 		fts_word_freq_t*	word_freq;
3519 
3520 		word_freq = rbt_value(fts_word_freq_t, node);
3521 
3522 		if (word_freq->doc_count > 0) {
3523 			if (total_docs == word_freq->doc_count) {
3524 				/* The QP assumes ranking > 0 if we find
3525 				a match. Since log10(1) = 0, we cannot
3526 				let IDF be zero when a word is found in
3527 				all documents. So let's make it an
3528 				arbitrarily small positive number. */
3529 				word_freq->idf = log10(1.0001);
3530 			} else {
3531 				word_freq->idf = log10(
3532 					total_docs
3533 					/ (double) word_freq->doc_count);
3534 			}
3535 		}
3536 
3537 		if (fts_enable_diag_print) {
3538 			ib::info() << "'" << word_freq->word.f_str << "' -> "
3539 				<< query->total_docs << "/"
3540 				<< word_freq->doc_count << " "
3541 				<< std::setw(6) << std::setprecision(5)
3542 				<< word_freq->idf;
3543 		}
3544 	}
3545 }
3546 
3547 /*****************************************************************//**
3548 Calculate the ranking of the document. */
3549 static
3550 void
3551 fts_query_calculate_ranking(
3552 /*========================*/
3553 	const fts_query_t*	query,		/*!< in: query state */
3554 	fts_ranking_t*		ranking)	/*!< in: Document to rank */
3555 {
3556 	ulint	pos = 0;
3557 	fts_string_t	word;
3558 
3559 	/* At this stage, ranking->rank should not exceed the 1.0
3560 	bound */
3561 	ut_ad(ranking->rank <= 1.0 && ranking->rank >= -1.0);
3562 	ut_ad(rbt_size(query->word_map) == query->word_vector->size());
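	/* Each matched word contributes doc_freq->freq * idf * idf to the
	document's rank below, i.e. a TF-IDF style weight with the IDF
	applied twice; the same scaling is used for the optimized
	single-term path in fts_query_prepare_result(). */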
3563 
3564 	while (fts_ranking_words_get_next(query, ranking, &pos, &word)) {
3565 		int			ret;
3566 		ib_rbt_bound_t		parent;
3567 		double			weight;
3568 		fts_doc_freq_t*		doc_freq;
3569 		fts_word_freq_t*	word_freq;
3570 
3571 		ret = rbt_search(query->word_freqs, &parent, &word);
3572 
3573 		/* It must exist. */
3574 		ut_a(ret == 0);
3575 
3576 		word_freq = rbt_value(fts_word_freq_t, parent.last);
3577 
3578 		ret = rbt_search(
3579 			word_freq->doc_freqs, &parent, &ranking->doc_id);
3580 
3581 		/* It must exist. */
3582 		ut_a(ret == 0);
3583 
3584 		doc_freq = rbt_value(fts_doc_freq_t, parent.last);
3585 
3586 		weight = (double) doc_freq->freq * word_freq->idf;
3587 
3588 		ranking->rank += (fts_rank_t) (weight * word_freq->idf);
3589 	}
3590 }
3591 
3592 /*****************************************************************//**
3593 Add ranking to the result set. */
3594 static
3595 void
3596 fts_query_add_ranking(
3597 /*==================*/
3598 	fts_query_t*		query,		/*!< in: query state */
3599 	ib_rbt_t*		ranking_tree,	/*!< in: ranking tree */
3600 	const fts_ranking_t*	new_ranking)	/*!< in: ranking of a document */
3601 {
3602 	ib_rbt_bound_t		parent;
3603 
3604 	/* Lookup the ranking in our rb tree and add if it doesn't exist. */
3605 	if (rbt_search(ranking_tree, &parent, new_ranking) == 0) {
3606 		fts_ranking_t*	ranking;
3607 
3608 		ranking = rbt_value(fts_ranking_t, parent.last);
3609 
3610 		ranking->rank += new_ranking->rank;
3611 
3612 		ut_a(ranking->words == NULL);
3613 	} else {
3614 		rbt_add_node(ranking_tree, &parent, new_ranking);
3615 
3616 		query->total_size += SIZEOF_RBT_NODE_ADD
3617 			+ sizeof(fts_ranking_t);
3618 	}
3619 }
3620 
3621 /*****************************************************************//**
3622 Retrieve the FTS Relevance Ranking result for doc with doc_id
3623 @return the relevance ranking value, 0 if no ranking value
3624 present. */
3625 float
3626 fts_retrieve_ranking(
3627 /*=================*/
3628 	fts_result_t*	result,	/*!< in: FTS result structure */
3629 	doc_id_t	doc_id)	/*!< in: doc_id of the item to retrieve */
3630 {
3631 	ib_rbt_bound_t		parent;
3632 	fts_ranking_t		new_ranking;
3633 
3634 	DBUG_ENTER("fts_retrieve_ranking");
3635 
3636 	if (!result || !result->rankings_by_id) {
3637 		DBUG_RETURN(0);
3638 	}
3639 
3640 	new_ranking.doc_id = doc_id;
3641 
3642 	/* Lookup the ranking in our rb tree */
3643 	if (rbt_search(result->rankings_by_id, &parent, &new_ranking) == 0) {
3644 		fts_ranking_t*  ranking;
3645 
3646 		ranking = rbt_value(fts_ranking_t, parent.last);
3647 
3648 		DBUG_RETURN(ranking->rank);
3649 	}
3650 
3651 	DBUG_RETURN(0);
3652 }
3653 
3654 /*****************************************************************//**
3655 Create the result and copy the data to it. */
3656 static
3657 fts_result_t*
3658 fts_query_prepare_result(
3659 /*=====================*/
3660 	fts_query_t*	query,	/*!< in: Query state */
3661 	fts_result_t*	result)	/*!< in: result; this can contain
3662 				data from a previous search on
3663 				another FTS index */
3664 {
3665 	const ib_rbt_node_t*	node;
3666 	bool			result_is_null = false;
3667 
3668 	DBUG_ENTER("fts_query_prepare_result");
3669 
3670 	if (result == NULL) {
3671 		result = static_cast<fts_result_t*>(
3672 			ut_zalloc_nokey(sizeof(*result)));
3673 
3674 		result->rankings_by_id = rbt_create(
3675 			sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
3676 
3677 		query->total_size += sizeof(fts_result_t) + SIZEOF_RBT_CREATE;
3678 		result_is_null = true;
3679 	}
3680 
3681 	if (query->flags == FTS_OPT_RANKING) {
3682 		fts_word_freq_t*	word_freq;
3683 		ulint		size = ib_vector_size(query->deleted->doc_ids);
3684 		fts_update_t*	array =
3685 			(fts_update_t*) query->deleted->doc_ids->data;
3686 
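		/* Optimized single-term path: only the first (and only
		relevant) entry of query->word_freqs is used; its
		per-document frequencies are copied straight into the
		result, skipping any doc id found in the sorted array of
		deleted doc ids. */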
3687 		node = rbt_first(query->word_freqs);
3688 		ut_ad(node);
3689 		word_freq = rbt_value(fts_word_freq_t, node);
3690 
3691 		for (node = rbt_first(word_freq->doc_freqs);
3692 		     node;
3693 		     node = rbt_next(word_freq->doc_freqs, node)) {
3694 			fts_doc_freq_t* doc_freq;
3695 			fts_ranking_t	ranking;
3696 
3697 			doc_freq = rbt_value(fts_doc_freq_t, node);
3698 
3699 			/* Don't put deleted docs into result */
3700 			if (fts_bsearch(array, 0, static_cast<int>(size),
3701 					doc_freq->doc_id) >= 0) {
3702 				/* one less matching doc count */
3703 				--word_freq->doc_count;
3704 				continue;
3705 			}
3706 
3707 			ranking.doc_id = doc_freq->doc_id;
3708 			ranking.rank = static_cast<fts_rank_t>(doc_freq->freq);
3709 			ranking.words = NULL;
3710 
3711 			fts_query_add_ranking(query, result->rankings_by_id,
3712 					      &ranking);
3713 
3714 			if (query->total_size > fts_result_cache_limit) {
3715 				query->error = DB_FTS_EXCEED_RESULT_CACHE_LIMIT;
3716 				fts_query_free_result(result);
3717 				DBUG_RETURN(NULL);
3718 			}
3719 		}
3720 
3721 		/* Calculate IDF only after we exclude the deleted items */
3722 		fts_query_calculate_idf(query);
3723 
3724 		node = rbt_first(query->word_freqs);
3725 		word_freq = rbt_value(fts_word_freq_t, node);
3726 
3727 		/* Calculate the ranking for each doc */
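		/* At this point ranking->rank holds the raw term frequency
		(set from doc_freq->freq above); scale it by idf * idf so
		that the score matches fts_query_calculate_ranking(). */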
3728 		for (node = rbt_first(result->rankings_by_id);
3729 		     node != NULL;
3730 		     node = rbt_next(result->rankings_by_id, node)) {
3731 
3732 			fts_ranking_t*  ranking;
3733 
3734 			ranking = rbt_value(fts_ranking_t, node);
3735 
3736 			ranking->rank = static_cast<fts_rank_t>(
3737 				ranking->rank * word_freq->idf * word_freq->idf);
3738 		}
3739 
3740 		DBUG_RETURN(result);
3741 	}
3742 
3743 	ut_a(rbt_size(query->doc_ids) > 0);
3744 
3745 	for (node = rbt_first(query->doc_ids);
3746 	     node;
3747 	     node = rbt_next(query->doc_ids, node)) {
3748 
3749 		fts_ranking_t*	ranking;
3750 
3751 		ranking = rbt_value(fts_ranking_t, node);
3752 		fts_query_calculate_ranking(query, ranking);
3753 
3754 		// FIXME: I think we may require this information to improve the
3755 		// ranking of doc ids which have more word matches from
3756 		// different FTS indexes.
3757 
3758 		/* We don't need these anymore; free the resources. */
3759 		ranking->words = NULL;
3760 
3761 		if (!result_is_null) {
3762 			fts_query_add_ranking(query, result->rankings_by_id, ranking);
3763 
3764 			if (query->total_size > fts_result_cache_limit) {
3765 				query->error = DB_FTS_EXCEED_RESULT_CACHE_LIMIT;
3766 				fts_query_free_result(result);
3767 				DBUG_RETURN(NULL);
3768 			}
3769 		}
3770 	}
3771 
3772 	if (result_is_null) {
3773 		/* Use doc_ids directly */
3774 		rbt_free(result->rankings_by_id);
3775 		result->rankings_by_id = query->doc_ids;
3776 		query->doc_ids = NULL;
3777 	}
3778 
3779 	DBUG_RETURN(result);
3780 }
3781 
3782 /*****************************************************************//**
3783 Get the result of the query. Calculate the similarity coefficient. */
3784 static
3785 fts_result_t*
3786 fts_query_get_result(
3787 /*=================*/
3788 	fts_query_t*		query,	/*!< in: query instance */
3789 	fts_result_t*		result)	/*!< in: result */
3790 {
3791 	DBUG_ENTER("fts_query_get_result");
3792 
3793 	if (rbt_size(query->doc_ids) > 0 || query->flags == FTS_OPT_RANKING) {
3794 		/* Copy the doc ids to the result. */
3795 		result = fts_query_prepare_result(query, result);
3796 	} else {
3797 		/* Create an empty result instance. */
3798 		result = static_cast<fts_result_t*>(
3799 			ut_zalloc_nokey(sizeof(*result)));
3800 	}
3801 
3802 	DBUG_RETURN(result);
3803 }
3804 
3805 /*****************************************************************//**
3806 FTS Query free resources and reset. */
3807 static
3808 void
3809 fts_query_free(
3810 /*===========*/
3811 	fts_query_t*	query)		/*!< in: query instance to free*/
3812 {
3813 
3814 	if (query->read_nodes_graph) {
3815 		fts_que_graph_free(query->read_nodes_graph);
3816 	}
3817 
3818 	if (query->root) {
3819 		fts_ast_free_node(query->root);
3820 	}
3821 
3822 	if (query->deleted) {
3823 		fts_doc_ids_free(query->deleted);
3824 	}
3825 
3826 	if (query->intersection) {
3827 		fts_query_free_doc_ids(query, query->intersection);
3828 		query->intersection = NULL;
3829 	}
3830 
3831 	if (query->doc_ids) {
3832 		fts_query_free_doc_ids(query, query->doc_ids);
3833 	}
3834 
3835 	if (query->word_freqs) {
3836 		const ib_rbt_node_t*	node;
3837 
3838 		/* We need to free any instances of fts_doc_freq_t that we
3839 		may have allocated. */
3840 		for (node = rbt_first(query->word_freqs);
3841 		     node;
3842 		     node = rbt_next(query->word_freqs, node)) {
3843 
3844 			fts_word_freq_t*	word_freq;
3845 
3846 			word_freq = rbt_value(fts_word_freq_t, node);
3847 
3848 			/* We need to cast away the const. */
3849 			rbt_free(word_freq->doc_freqs);
3850 		}
3851 
3852 		rbt_free(query->word_freqs);
3853 	}
3854 
3855 	if (query->wildcard_words != NULL) {
3856 		rbt_free(query->wildcard_words);
3857 	}
3858 
3859 	ut_a(!query->intersection);
3860 
3861 	if (query->word_map) {
3862 		rbt_free(query->word_map);
3863 	}
3864 
3865 	if (query->word_vector != NULL) {
3866 		UT_DELETE(query->word_vector);
3867 	}
3868 
3869 	if (query->heap) {
3870 		mem_heap_free(query->heap);
3871 	}
3872 
3873 	memset(query, 0, sizeof(*query));
3874 }
3875 
3876 /*****************************************************************//**
3877 Parse the query using flex/bison or plugin parser.
3878 @return parse tree node. */
3879 static
3880 fts_ast_node_t*
3881 fts_query_parse(
3882 /*============*/
3883 	fts_query_t*	query,		/*!< in: query instance */
3884 	byte*		query_str,	/*!< in: query string */
3885 	ulint		query_len)	/*!< in: query string length */
3886 {
3887 	int		error;
3888 	fts_ast_state_t state;
3889 	bool		mode = query->boolean_mode;
3890 	DBUG_ENTER("fts_query_parse");
3891 
3892 	memset(&state, 0x0, sizeof(state));
3893 
3894 	state.charset = query->fts_index_table.charset;
3895 
3896 	DBUG_EXECUTE_IF("fts_instrument_query_disable_parser",
3897 		query->parser = NULL;);
3898 
3899 	if (query->parser) {
3900 		state.root = state.cur_node =
3901 			fts_ast_create_node_list(&state, NULL);
3902 		error = fts_parse_by_parser(mode, query_str, query_len,
3903 					    query->parser, &state);
3904 	} else {
3905 		/* Set up the scanner to use; this depends on the mode flag. */
3906 		state.lexer = fts_lexer_create(mode, query_str, query_len);
3907 		state.charset = query->fts_index_table.charset;
3908 		error = fts_parse(&state);
3909 		fts_lexer_free(state.lexer);
3910 		state.lexer = NULL;
3911 	}
3912 
3913 	/* Error during parsing ? */
3914 	if (error) {
3915 		/* Free the nodes that were allocated during parsing. */
3916 		fts_ast_state_free(&state);
3917 	} else {
3918 		query->root = state.root;
3919 
3920 		if (fts_enable_diag_print && query->root != NULL) {
3921 			fts_ast_node_print(query->root);
3922 		}
3923 	}
3924 
3925 	DBUG_RETURN(state.root);
3926 }
3927 
3928 /*******************************************************************//**
3929 FTS Query optimization
3930 Set FTS_OPT_RANKING if it is a simple term query */
3931 static
3932 void
3933 fts_query_can_optimize(
3934 /*===================*/
3935 	fts_query_t*	query,		/*!< in/out: query instance */
3936 	uint		flags)		/*!< In: FTS search mode */
3937 {
3938 	fts_ast_node_t*	node = query->root;
3939 
3940 	if (flags & FTS_EXPAND) {
3941 		return;
3942 	}
3943 
3944 	/* Check whether the query has only a single term and no operator */
3945 	ut_ad(node->type == FTS_AST_LIST);
3946 	node = node->list.head;
3947 	if (node != NULL && node->type == FTS_AST_TERM && node->next == NULL) {
3948 		query->flags = FTS_OPT_RANKING;
3949 	}
3950 }
3951 
3952 
3953 
3954 /** FTS Query entry point.
3955 @param[in]	trx		transaction
3956 @param[in]	index		fts index to search
3957 @param[in]	flags		FTS search mode
3958 @param[in]	query_str	FTS query
3959 @param[in]	query_len	FTS query string len in bytes
3960 @param[in,out]	result		result doc ids
3961 @param[in]	limit		limit value
3962 @return DB_SUCCESS if successful otherwise error code */
3963 dberr_t
3964 fts_query(
3965 	trx_t*		trx,
3966 	dict_index_t*	index,
3967 	uint		flags,
3968 	const byte*	query_str,
3969 	ulint		query_len,
3970 	fts_result_t**	result,
3971 	ulonglong	limit)
3972 {
3973 	fts_query_t	query;
3974 	dberr_t		error = DB_SUCCESS;
3975 	byte*		lc_query_str;
3976 	ulint		lc_query_str_len;
3977 	ulint		result_len;
3978 	bool		boolean_mode;
3979 	trx_t*		query_trx;
3980 	CHARSET_INFO*	charset;
3981 	ib_time_monotonic_ms_t		start_time_ms;
3982 	bool		will_be_ignored = false;
3983 
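	/* Overall flow: set up the query state, collect the deleted doc
	ids used for filtering, lower-case and parse the query string into
	an AST, execute the AST with fts_ast_visit(), optionally expand
	the query (FTS_EXPAND), calculate the IDF of the terms, and
	finally copy the qualifying doc ids and ranks into *result. */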
3984 	boolean_mode = flags & FTS_BOOL;
3985 
3986 	*result = NULL;
3987 	memset(&query, 0x0, sizeof(query));
3988 	query_trx = trx_allocate_for_background();
3989 	query_trx->op_info = "FTS query";
3990 
3991 	start_time_ms = ut_time_monotonic_ms();
3992 
3993 	query.trx = query_trx;
3994 	query.index = index;
3995 	query.boolean_mode = boolean_mode;
3996 	query.deleted = fts_doc_ids_create();
3997 	query.cur_node = NULL;
3998 
3999 	query.fts_common_table.type = FTS_COMMON_TABLE;
4000 	query.fts_common_table.table_id = index->table->id;
4001 	query.fts_common_table.parent = index->table->name.m_name;
4002 	query.fts_common_table.table = index->table;
4003 
4004 	charset = fts_index_get_charset(index);
4005 
4006 	query.fts_index_table.type = FTS_INDEX_TABLE;
4007 	query.fts_index_table.index_id = index->id;
4008 	query.fts_index_table.table_id = index->table->id;
4009 	query.fts_index_table.parent = index->table->name.m_name;
4010 	query.fts_index_table.charset = charset;
4011 	query.fts_index_table.table = index->table;
4012 
4013 	query.word_map = rbt_create_arg_cmp(
4014 		sizeof(fts_string_t), innobase_fts_text_cmp, charset);
4015 	query.word_vector = UT_NEW_NOKEY(word_vector_t());
4016 	query.error = DB_SUCCESS;
4017 
4018 	/* Setup the RB tree that will be used to collect per term
4019 	statistics. */
4020 	query.word_freqs = rbt_create_arg_cmp(
4021 		sizeof(fts_word_freq_t), innobase_fts_text_cmp, charset);
4022 
4023 	if (flags & FTS_EXPAND) {
4024 		query.wildcard_words = rbt_create_arg_cmp(
4025 			sizeof(fts_string_t), innobase_fts_text_cmp, charset);
4026 	}
4027 
4028 	query.total_size += SIZEOF_RBT_CREATE;
4029 
4030 	query.total_docs = dict_table_get_n_rows(index->table);
4031 
4032 	query.limit = limit;
4033 
4034 	query.n_docs = 0;
4035 #ifdef FTS_DOC_STATS_DEBUG
4036 	if (fts_enable_diag_print) {
4037 		error = fts_get_total_word_count(
4038 			trx, query.index, &query.total_words);
4039 
4040 		if (error != DB_SUCCESS) {
4041 			goto func_exit;
4042 		}
4043 
4044 		ib::info() << "Total docs: " << query.total_docs
4045 			<< " Total words: " << query.total_words;
4046 	}
4047 #endif /* FTS_DOC_STATS_DEBUG */
4048 
4049 	query.fts_common_table.suffix = "DELETED";
4050 
4051 	/* Read the deleted doc_ids, we need these for filtering. */
4052 	error = fts_table_fetch_doc_ids(
4053 		NULL, &query.fts_common_table, query.deleted);
4054 
4055 	if (error != DB_SUCCESS) {
4056 		goto func_exit;
4057 	}
4058 
4059 	query.fts_common_table.suffix = "DELETED_CACHE";
4060 
4061 	error = fts_table_fetch_doc_ids(
4062 		NULL, &query.fts_common_table, query.deleted);
4063 
4064 	if (error != DB_SUCCESS) {
4065 		goto func_exit;
4066 	}
4067 
4068 	/* Get the deleted doc ids that are in the cache. */
4069 	fts_cache_append_deleted_doc_ids(
4070 		index->table->fts->cache, query.deleted->doc_ids);
4071 	DEBUG_SYNC_C("fts_deleted_doc_ids_append");
4072 
4073 	/* Sort the vector so that we can do a binary search over the ids. */
4074 	ib_vector_sort(query.deleted->doc_ids, fts_update_doc_id_cmp);
4075 
4076 	/* Convert the query string to lower case before parsing. We own
4077 	the ut_malloc'ed result and so remember to free it before return. */
4078 
4079 	lc_query_str_len = query_len * charset->casedn_multiply + 1;
4080 	lc_query_str = static_cast<byte*>(ut_malloc_nokey(lc_query_str_len));
4081 
4082 	/* For binary collations, a case sensitive search is
4083 	performed. Hence don't convert to lower case. */
4084 	if (my_binary_compare(charset)) {
4085 		memcpy(lc_query_str, query_str, query_len);
4086 		lc_query_str[query_len] = 0;
4087 		result_len = query_len;
4088 	} else {
4089 		result_len = innobase_fts_casedn_str(
4090 			charset, (char*) query_str, query_len,
4091 			(char*) lc_query_str, lc_query_str_len);
4092 	}
4093 
4094 	ut_ad(result_len < lc_query_str_len);
4095 
4096 	lc_query_str[result_len] = 0;
4097 
4098 	query.heap = mem_heap_create(128);
4099 
4100 	/* Create the rb tree for the doc id (current) set. */
4101 	query.doc_ids = rbt_create(
4102 		sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
4103 	query.parser = index->parser;
4104 
4105 	query.total_size += SIZEOF_RBT_CREATE;
4106 
4107 	/* Parse the input query string. */
4108 	if (fts_query_parse(&query, lc_query_str, result_len)) {
4109 		fts_ast_node_t*	ast = query.root;
4110 		ast->trx = trx;
4111 
4112 		/* Optimize query to check if it's a single term */
4113 		fts_query_can_optimize(&query, flags);
4114 
4115 		DBUG_EXECUTE_IF("fts_instrument_result_cache_limit",
4116 			        fts_result_cache_limit = 2048;
4117 		);
4118 
4119 		/* Optimisation is allowed for limit value
4120 		when
4121 		i)  No ranking involved
4122 		ii) Only FTS Union operations involved. */
4123 		if (query.limit != ULONG_UNDEFINED
4124 		    && !fts_ast_node_check_union(ast)) {
4125 			query.limit = ULONG_UNDEFINED;
4126 		}
4127 
4128 		DBUG_EXECUTE_IF("fts_union_limit_off",
4129 			query.limit = ULONG_UNDEFINED;
4130 		);
4131 
4132 		/* Traverse the Abstract Syntax Tree (AST) and execute
4133 		the query. */
4134 		query.error = fts_ast_visit(
4135 			FTS_NONE, ast, fts_query_visitor,
4136 			&query, &will_be_ignored);
4137 		if (query.error == DB_INTERRUPTED) {
4138 			error = DB_INTERRUPTED;
4139 			ut_free(lc_query_str);
4140 			goto func_exit;
4141 		}
4142 
4143 		/* If query expansion is requested, extend the search
4144 		with first search pass result */
4145 		if (query.error == DB_SUCCESS && (flags & FTS_EXPAND)) {
4146 			query.error = fts_expand_query(index, &query);
4147 		}
4148 
4149 		/* Calculate the inverse document frequency of the terms. */
4150 		if (query.error == DB_SUCCESS
4151 		    && query.flags != FTS_OPT_RANKING) {
4152 			fts_query_calculate_idf(&query);
4153 		}
4154 
4155 		/* Copy the result from the query state, so that we can
4156 		return it to the caller. */
4157 		if (query.error == DB_SUCCESS) {
4158 			*result = fts_query_get_result(&query, *result);
4159 		}
4160 
4161 		error = query.error;
4162 	} else {
4163 		/* still return an empty result set */
4164 		*result = static_cast<fts_result_t*>(
4165 			ut_zalloc_nokey(sizeof(**result)));
4166 	}
4167 
4168 	if (trx_is_interrupted(trx)) {
4169 		error = DB_INTERRUPTED;
4170 		ut_free(lc_query_str);
4171 		if (result != NULL) {
4172 			fts_query_free_result(*result);
4173 		}
4174 		goto func_exit;
4175 	}
4176 
4177 	ut_free(lc_query_str);
4178 
4179 	if (fts_enable_diag_print && (*result)) {
4180 		uint64_t diff_time = ut_time_monotonic_ms() - start_time_ms;
4181 
4182 		ib::info() << "FTS Search Processing time: "
4183 			<< diff_time / 1000 << " secs: " << diff_time % 1000
4184 			<< " millisec: row(s) "
4185 			<< ((*result)->rankings_by_id
4186 			    ? rbt_size((*result)->rankings_by_id)
4187 			    : -1);
4188 
4189 		/* Log memory consumption & result size */
4190 		ib::info() << "Full Search Memory: " << query.total_size
4191 			<< " (bytes),  Row: "
4192 			<< ((*result)->rankings_by_id
4193 			    ? rbt_size((*result)->rankings_by_id)
4194 			    : 0)
4195 			<< ".";
4196 	}
4197 
4198 func_exit:
4199 	fts_query_free(&query);
4200 
4201 	trx_free_for_background(query_trx);
4202 
4203 	return(error);
4204 }
4205 
4206 /*****************************************************************//**
4207 FTS Query free result, returned by fts_query(). */
4208 void
4209 fts_query_free_result(
4210 /*==================*/
4211 	fts_result_t*	result)		/*!< in: result instance to free.*/
4212 {
4213 	if (result) {
4214 		if (result->rankings_by_id != NULL) {
4215 			rbt_free(result->rankings_by_id);
4216 			result->rankings_by_id = NULL;
4217 		}
4218 		if (result->rankings_by_rank != NULL) {
4219 			rbt_free(result->rankings_by_rank);
4220 			result->rankings_by_rank = NULL;
4221 		}
4222 
4223 		ut_free(result);
4224 		result = NULL;
4225 	}
4226 }
4227 
4228 /*****************************************************************//**
4229 FTS Query sort result, returned by fts_query() on fts_ranking_t::rank. */
4230 void
4231 fts_query_sort_result_on_rank(
4232 /*==========================*/
4233 	fts_result_t*	result)		/*!< out: result instance to sort.*/
4234 {
4235 	const ib_rbt_node_t*	node;
4236 	ib_rbt_t*		ranked;
4237 
4238 	ut_a(result->rankings_by_id != NULL);
4239 	if (result->rankings_by_rank) {
4240 		rbt_free(result->rankings_by_rank);
4241 	}
4242 
4243 	ranked = rbt_create(sizeof(fts_ranking_t), fts_query_compare_rank);
4244 
4245 	/* Copy each ranking into a new tree that is ordered
4246 	by rank instead of by doc id. */
4247 	for (node = rbt_first(result->rankings_by_id);
4248 	     node;
4249 	     node = rbt_next(result->rankings_by_id, node)) {
4250 
4251 		fts_ranking_t*	ranking;
4252 
4253 		ranking = rbt_value(fts_ranking_t, node);
4254 
4255 		ut_a(ranking->words == NULL);
4256 
4257 		rbt_insert(ranked, ranking, ranking);
4258 	}
4259 
4260 	/* Reset the current node too. */
4261 	result->current = NULL;
4262 	result->rankings_by_rank = ranked;
4263 }
4264 
4265 /*******************************************************************//**
4266 A debug function to print result doc_id set. */
4267 static
4268 void
4269 fts_print_doc_id(
4270 /*=============*/
4271 	fts_query_t*	query)	/*!< in : tree that stores doc_ids.*/
4272 {
4273 	const ib_rbt_node_t*	node;
4274 
4275 	/* Iterate each member of the doc_id set */
4276 	for (node = rbt_first(query->doc_ids);
4277 	     node;
4278 	     node = rbt_next(query->doc_ids, node)) {
4279 		fts_ranking_t*	ranking;
4280 		ranking = rbt_value(fts_ranking_t, node);
4281 
4282 		ib::info() << "doc_ids info, doc_id: " << ranking->doc_id;
4283 
4284 		ulint		pos = 0;
4285 		fts_string_t	word;
4286 
4287 		while (fts_ranking_words_get_next(query, ranking, &pos, &word)) {
4288 			ib::info() << "doc_ids info, value: " << word.f_str;
4289 		}
4290 	}
4291 }
4292 
4293 /*************************************************************//**
4294 This function implements a simple "blind" query expansion search:
4295 words in documents found in the first search pass will be used as
4296 search arguments to search the document again, thus "expand"
4297 the search result set.
4298 @return DB_SUCCESS if success, otherwise the error code */
4299 static MY_ATTRIBUTE((nonnull, warn_unused_result))
4300 dberr_t
4301 fts_expand_query(
4302 /*=============*/
4303 	dict_index_t*	index,		/*!< in: FTS index to search */
4304 	fts_query_t*	query)		/*!< in: FTS query instance */
4305 {
4306 	const ib_rbt_node_t*	node;
4307 	const ib_rbt_node_t*	token_node;
4308 	fts_doc_t		result_doc;
4309 	dberr_t			error = DB_SUCCESS;
4310 	const fts_index_cache_t*index_cache;
4311 
4312 	/* If no doc is found in first search pass, return */
4313 	if (!rbt_size(query->doc_ids)) {
4314 		return(error);
4315 	}
4316 
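	/* The expansion proceeds in three steps: (1) fetch and tokenize
	the documents found in the first pass into result_doc, (2) remove
	the tokens that were already searched (including words covered by
	a wildcard prefix), and (3) run a union query for each remaining
	token. */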
4317 	/* Init "result_doc", to hold words from the first search pass */
4318 	fts_doc_init(&result_doc);
4319 
4320 	rw_lock_x_lock(&index->table->fts->cache->lock);
4321 	index_cache = fts_find_index_cache(index->table->fts->cache, index);
4322 	rw_lock_x_unlock(&index->table->fts->cache->lock);
4323 
4324 	ut_a(index_cache);
4325 
4326 	result_doc.tokens = rbt_create_arg_cmp(
4327 		sizeof(fts_token_t), innobase_fts_text_cmp,
4328 		index_cache->charset);
4329 
4330 	result_doc.charset = index_cache->charset;
4331 	result_doc.parser = index_cache->index->parser;
4332 	result_doc.is_ngram = index_cache->index->is_ngram;
4333 
4334 	query->total_size += SIZEOF_RBT_CREATE;
4335 
4336 	if (fts_enable_diag_print) {
4337 		fts_print_doc_id(query);
4338 	}
4339 
4340 	for (node = rbt_first(query->doc_ids);
4341 	     node;
4342 	     node = rbt_next(query->doc_ids, node)) {
4343 
4344 		fts_ranking_t*	ranking;
4345 		ulint		prev_token_size;
4346 		ulint		estimate_size;
4347 
4348 		prev_token_size = rbt_size(result_doc.tokens);
4349 
4350 		ranking = rbt_value(fts_ranking_t, node);
4351 
4352 		/* Fetch the documents with this doc_id from the
4353 		result of the first search pass. Since we do not
4354 		store a document-to-word mapping, we need to
4355 		fetch the original documents and parse them.
4356 		A future optimization could be done here if we
4357 		support some form of document-to-word mapping. */
4358 		fts_doc_fetch_by_doc_id(NULL, ranking->doc_id, index,
4359 					FTS_FETCH_DOC_BY_ID_EQUAL,
4360 					fts_query_expansion_fetch_doc,
4361 					&result_doc);
4362 
4363 		/* Estimate memory used, see fts_process_token and fts_token_t.
4364 		   We ignore token size here. */
4365 		estimate_size = (rbt_size(result_doc.tokens) - prev_token_size)
4366 			* (SIZEOF_RBT_NODE_ADD + sizeof(fts_token_t)
4367 			+ sizeof(ib_vector_t) + sizeof(ulint) * 32);
4368 		query->total_size += estimate_size;
4369 
4370 		if (query->total_size > fts_result_cache_limit) {
4371 			error = DB_FTS_EXCEED_RESULT_CACHE_LIMIT;
4372 			goto func_exit;
4373 		}
4374 	}
4375 
4376 	/* Remove words that have already been searched in the first pass */
4377 	for (ulint i = 0; i < query->word_vector->size(); i++) {
4378 		fts_string_t	word = query->word_vector->at(i);
4379 		ib_rbt_bound_t	parent;
4380 
4381 		if (query->wildcard_words
4382 		    && rbt_search(query->wildcard_words, &parent, &word) == 0) {
4383 			/* If it's a wildcard word, remove words having
4384 			it as prefix. */
4385 			while (rbt_search_cmp(result_doc.tokens,
4386 					      &parent, &word, NULL,
4387 					      innobase_fts_text_cmp_prefix)
4388 			       == 0) {
4389 				ut_free(rbt_remove_node(result_doc.tokens,
4390 							parent.last));
4391 			}
4392 		} else {
4393 			/* We don't check the return value, because the word
4394 			may already have been removed by a preceding wildcard
4395 			word that is its prefix, e.g. 'g*' removing 'good'. */
4396 			rbt_delete(result_doc.tokens, &word);
4397 		}
4398 	}
4399 
4400 	/* Search the table the second time with expanded search list */
4401 	for (token_node = rbt_first(result_doc.tokens);
4402 	     token_node;
4403 	     token_node = rbt_next(result_doc.tokens, token_node)) {
4404 		fts_token_t*	mytoken;
4405 		mytoken = rbt_value(fts_token_t, token_node);
4406 
4407 		/* A '%' at the end is treated as a prefix search;
4408 		it can cause an assertion failure, so we skip it. */
4409 		if (mytoken->text.f_str[mytoken->text.f_len - 1] == '%') {
4410 			continue;
4411 		}
4412 
4413 		ut_ad(mytoken->text.f_str[mytoken->text.f_len] == 0);
4414 		fts_query_add_word_freq(query, &mytoken->text);
4415 		error = fts_query_union(query, &mytoken->text);
4416 
4417 		if (error != DB_SUCCESS) {
4418 			break;
4419 		}
4420 	}
4421 
4422 func_exit:
4423 	fts_doc_free(&result_doc);
4424 
4425 	return(error);
4426 }
4427 /*************************************************************//**
4428 This function finds documents that contain all words in a
4429 phrase or proximity search. For a proximity search, it also verifies
4430 that the words are close enough to each other, within the specified
4431 distance. This function is called for phrase and proximity search.
4432 @return TRUE if documents are found, FALSE if otherwise */
4433 static
4434 ibool
4435 fts_phrase_or_proximity_search(
4436 /*===========================*/
4437 	fts_query_t*	query,		/*!< in/out:  query instance.
4438 					query->doc_ids might be instantiated
4439 					with qualified doc IDs */
4440 	ib_vector_t*	tokens)		/*!< in: Tokens contain words */
4441 {
4442 	ulint		n_matched;
4443 	ulint		i;
4444 	ibool		matched = FALSE;
4445 	ulint		num_token = ib_vector_size(tokens);
4446 	fts_match_t*	match[MAX_PROXIMITY_ITEM];
4447 	ibool		end_list = FALSE;
4448 
4449 	/* Number of matched documents for the first token */
4450 	n_matched = ib_vector_size(query->match_array[0]);
4451 
4452 	/* We have a match list for each word; we shall
4453 	walk through the lists and find the common documents that
4454 	contain all the matching words. */
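	/* The first word's match list drives the scan: for each doc id in
	match_array[0] we advance a cursor through every other word's list
	until we either find the same doc id (a candidate match) or pass
	it (no match for this doc id). */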
4455 	for (i = 0; i < n_matched; i++) {
4456 		ulint		j;
4457 		ulint		k = 0;
4458 		fts_proximity_t	qualified_pos;
4459 
4460 		match[0] = static_cast<fts_match_t*>(
4461 			ib_vector_get(query->match_array[0], i));
4462 
4463 		/* For the remaining match lists of the tokens (words), we
4464 		try to see whether there is a document with the same
4465 		doc id */
4466 		for (j = 1; j < num_token; j++) {
4467 			match[j] = static_cast<fts_match_t*>(
4468 				ib_vector_get(query->match_array[j], k));
4469 
4470 			while (match[j]->doc_id < match[0]->doc_id
4471 			       && k < ib_vector_size(query->match_array[j])) {
4472 				 match[j] = static_cast<fts_match_t*>(
4473 					ib_vector_get(
4474 						query->match_array[j], k));
4475 				k++;
4476 			}
4477 
4478 			if (match[j]->doc_id > match[0]->doc_id) {
4479 				/* no match */
4480 				if (query->flags & FTS_PHRASE) {
4481 					match[0]->doc_id = 0;
4482 				}
4483 				break;
4484 			}
4485 
4486 			if (k == ib_vector_size(query->match_array[j])) {
4487 				end_list = TRUE;
4488 
4489 				if (query->flags & FTS_PHRASE) {
4490 					ulint	s;
4491 					/* Since i is the last doc id in the
4492 					match_array[j], remove all doc ids > i
4493 					from the match_array[0]. */
4494 					fts_match_t*    match_temp;
4495 					for (s = i + 1; s < n_matched; s++) {
4496 						match_temp = static_cast<
4497 						fts_match_t*>(ib_vector_get(
4498 						query->match_array[0], s));
4499 						match_temp->doc_id = 0;
4500 					}
4501 
4502 					if (match[j]->doc_id !=
4503 						match[0]->doc_id) {
4504 						/* no match */
4505 						match[0]->doc_id = 0;
4506 					}
4507 				}
4508 
4509 				if (match[j]->doc_id != match[0]->doc_id) {
4510 					goto func_exit;
4511 				}
4512 			}
4513 
4514 			/* FIXME: A better solution would be a counter array
4515 			remembering each run's last position, so we don't
4516 			reset it here every time. */
4517 			k = 0;
4518 		}
4519 
4520 		if (j != num_token) {
4521 			continue;
4522 		}
4523 
4524 		/* For this matching doc, we need to further
4525 		verify whether the words in the doc are close
4526 		to each other, and within the distance specified
4527 		in the proximity search */
4528 		if (query->flags & FTS_PHRASE) {
4529 			matched = TRUE;
4530 		} else if (fts_proximity_get_positions(
4531 			match, num_token, ULINT_MAX, &qualified_pos)) {
4532 
4533 			/* Fetch the original documents and count the
4534 			words in between the matching words to see whether
4535 			they are within the specified distance */
4536 			if (fts_query_is_in_proximity_range(
4537 				query, match, &qualified_pos)) {
4538 				/* If so, mark we find a matching doc */
4539 				query->error = fts_query_process_doc_id(
4540 					query, match[0]->doc_id, 0);
4541 				if (query->error != DB_SUCCESS) {
4542 					matched = FALSE;
4543 					goto func_exit;
4544 				}
4545 
4546 				matched = TRUE;
4547 				for (ulint z = 0; z < num_token; z++) {
4548 					fts_string_t*	token;
4549 					token = static_cast<fts_string_t*>(
4550 						ib_vector_get(tokens, z));
4551 					fts_query_add_word_to_document(
4552 						query, match[0]->doc_id, token);
4553 				}
4554 			}
4555 		}
4556 
4557 		if (end_list) {
4558 			break;
4559 		}
4560 	}
4561 
4562 func_exit:
4563 	return(matched);
4564 }
4565 
4566 /*************************************************************//**
4567 This function checks whether words in result documents are close to
4568 each other (within the proximity range specified by "distance").
4569 If "distance" is ULINT_MAX, then it will find all combinations of
4570 positions of matching words and store the min and max positions
4571 in "qualified_pos" for later verification.
4572 @return true if words are close to each other, false if otherwise */
4573 static
4574 bool
4575 fts_proximity_get_positions(
4576 /*========================*/
4577 	fts_match_t**		match,		/*!< in: query instance */
4578 	ulint			num_match,	/*!< in: number of matching
4579 						items */
4580 	ulint			distance,	/*!< in: distance value
4581 						for proximity search */
4582 	fts_proximity_t*	qualified_pos)	/*!< out: the position info
4583 						records ranges containing
4584 						all matching words. */
4585 {
4586 	ulint	i;
4587 	ulint	idx[MAX_PROXIMITY_ITEM];
4588 	ulint	num_pos[MAX_PROXIMITY_ITEM];
4589 	ulint	min_idx;
4590 
4591 	qualified_pos->n_pos = 0;
4592 
4593 	ut_a(num_match <= MAX_PROXIMITY_ITEM);
4594 
4595 	/* Each word could appear multiple times in a doc. So
4596 	we need to walk through each word's position list and find
4597 	the closest distance between different words to see whether
4598 	they are within the proximity distance. */
4599 
4600 	/* Assuming each word's position list is sorted, we
4601 	just do a walk through all the words' lists,
4602 	similar to the merge phase of a merge sort. */
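	/* For illustration: with positions {3, 40} for word A, {5, 43}
	for word B and distance 3, the first pass sees min_pos = 3 and
	max_pos = 5 (difference 2 <= 3), so the pair (3, 5) is recorded;
	A's cursor is then advanced, and the pair (40, 43) qualifies on a
	later pass. */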
4603 	for (i = 0; i < num_match; i++) {
4604 		/* idx is the current position we are checking
4605 		for a particular word */
4606 		idx[i] = 0;
4607 
4608 		/* Number of positions for this word */
4609 		num_pos[i] = ib_vector_size(match[i]->positions);
4610 	}
4611 
4612 	/* Start with the first word */
4613 	min_idx = 0;
4614 
4615 	while (idx[min_idx] < num_pos[min_idx]) {
4616 		ulint	position[MAX_PROXIMITY_ITEM];
4617 		ulint	min_pos = ULINT_MAX;
4618 		ulint	max_pos = 0;
4619 
4620 		/* Check positions in each word position list, and
4621 		record the max/min position */
4622 		for (i = 0; i < num_match; i++) {
4623 			position[i] = *(ulint*) ib_vector_get_const(
4624 				match[i]->positions, idx[i]);
4625 
4626 			if (position[i] == ULINT_UNDEFINED) {
4627 				break;
4628 			}
4629 
4630 			if (position[i] < min_pos) {
4631 				min_pos = position[i];
4632 				min_idx = i;
4633 			}
4634 
4635 			if (position[i] > max_pos) {
4636 				max_pos = position[i];
4637 			}
4638 		}
4639 
4640 		/* If max and min position are within range, we
4641 		find a good match */
4642 		if (max_pos - min_pos <= distance
4643 		    && (i >= num_match || position[i] != ULINT_UNDEFINED)) {
4644 			/* The charset may use a variable-length character
4645 			encoding; record min_pos and max_pos, since we
4646 			will need to verify the actual number of
4647 			characters later. */
4648 			qualified_pos->min_pos.push_back(min_pos);
4649 			qualified_pos->max_pos.push_back(max_pos);
4650 			qualified_pos->n_pos++;
4651 		}
4652 
4653 		/* Otherwise, move to the next position in the
4654 		list for the word with the smallest position */
4655 		idx[min_idx]++;
4656 	}
4657 
4658 	return(qualified_pos->n_pos != 0);
4659 }
4660