1 /*****************************************************************************
2 
3 Copyright (c) 2007, 2020, Oracle and/or its affiliates. All Rights Reserved.
4 Copyright (c) 2017, 2021, MariaDB Corporation.
5 
6 This program is free software; you can redistribute it and/or modify it under
7 the terms of the GNU General Public License as published by the Free Software
8 Foundation; version 2 of the License.
9 
10 This program is distributed in the hope that it will be useful, but WITHOUT
11 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13 
14 You should have received a copy of the GNU General Public License along with
15 this program; if not, write to the Free Software Foundation, Inc.,
16 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
17 
18 *****************************************************************************/
19 
20 /**************************************************//**
21 @file fts/fts0que.cc
22 Full Text Search functionality.
23 
24 Created 2007/03/27 Sunny Bains
25 Completed 2011/7/10 Sunny and Jimmy Yang
26 *******************************************************/
27 
28 #include "dict0dict.h"
29 #include "ut0rbt.h"
30 #include "row0sel.h"
31 #include "fts0fts.h"
32 #include "fts0priv.h"
33 #include "fts0ast.h"
34 #include "fts0pars.h"
35 #include "fts0types.h"
36 #include "fts0plugin.h"
37 #include "fts0vlc.h"
38 
39 #include <iomanip>
40 #include <vector>
41 
42 #define FTS_ELEM(t, n, i, j) (t[(i) * n + (j)])
43 
44 #define RANK_DOWNGRADE		(-1.0F)
45 #define RANK_UPGRADE		(1.0F)
46 
47 /* Maximum number of words supported in a phrase or proximity search. */
48 #define MAX_PROXIMITY_ITEM	128
49 
50 /* Memory used by rbt itself for create and node add */
51 #define SIZEOF_RBT_CREATE	sizeof(ib_rbt_t) + sizeof(ib_rbt_node_t) * 2
52 #define SIZEOF_RBT_NODE_ADD	sizeof(ib_rbt_node_t)
53 
54 /*Initial byte length for 'words' in fts_ranking_t */
55 #define RANKING_WORDS_INIT_LEN	4
56 
57 // FIXME: Need to have a generic iterator that traverses the ilist.
58 
59 typedef std::vector<fts_string_t, ut_allocator<fts_string_t> >	word_vector_t;
60 
61 struct fts_word_freq_t;
62 
63 /** State of an FTS query. */
64 struct fts_query_t {
65 	mem_heap_t*	heap;		/*!< Heap to use for allocations */
66 
67 	trx_t*		trx;		/*!< The query transaction */
68 
69 	dict_index_t*	index;		/*!< The FTS index to search */
70 					/*!< FTS auxiliary common table def */
71 
72 	fts_table_t	fts_common_table;
73 
74 	fts_table_t	fts_index_table;/*!< FTS auxiliary index table def */
75 
76 	size_t		total_size;	/*!< total memory size used by query */
77 
78 	fts_doc_ids_t*	deleted;	/*!< Deleted doc ids that need to be
79 					filtered from the output */
80 
81 	fts_ast_node_t*	root;		/*!< Abstract syntax tree */
82 
83 	fts_ast_node_t* cur_node;	/*!< Current tree node */
84 
85 	ib_rbt_t*	word_map;	/*!< Matched word map for
86 					searching by word*/
87 
88 	word_vector_t*	word_vector;	/*!< Matched word vector for
89 					searching by index */
90 
91 	ib_rbt_t*       doc_ids;	/*!< The current set of matching
92 					doc ids, elements are of
93 					type fts_ranking_t */
94 
95 	ib_rbt_t*	intersection;	/*!< The doc ids that were found in
96 					doc_ids, this tree will become
97 					the new doc_ids, elements are of type
98 					fts_ranking_t */
99 
100 					/*!< Prepared statement to read the
101 					nodes from the FTS INDEX */
102 	que_t*		read_nodes_graph;
103 
104 	fts_ast_oper_t	oper;		/*!< Current boolean mode operator */
105 
106 					/*!< TRUE if we want to collect the
107 					word positions within the document */
108 	ibool		collect_positions;
109 
110 	ulint		flags;		/*!< Specify the full text search type,
111 					such as  boolean search, phrase
112 					search, proximity search etc. */
113 
114 	ulint		distance;	/*!< The proximity distance of a
115 					phrase search. */
116 
117 					/*!< These doc ids are used as a
118 					boundary condition when searching the
119 					FTS index rows */
120 
121 	doc_id_t	lower_doc_id;	/*!< Lowest doc id in doc_ids */
122 
123 	doc_id_t	upper_doc_id;	/*!< Highest doc id in doc_ids */
124 
125 	bool		boolean_mode;	/*!< TRUE if boolean mode query */
126 
127 	ib_vector_t*	matched;	/*!< Array of matching documents
128 					(fts_match_t) to search for a phrase */
129 
130 	ib_vector_t**	match_array;	/*!< Used for proximity search, contains
131 					position info for each matched word
132 					in the word list */
133 
134 	ib_uint64_t	total_docs;	/*!< The total number of documents */
135 
136 	ulint		total_words;	/*!< The total number of words */
137 
138 	dberr_t		error;		/*!< Error code if any, that is
139 					encountered during query processing */
140 
141 	ib_rbt_t*	word_freqs;	/*!< RB tree of word frequencies per
142 					document, its elements are of type
143 					fts_word_freq_t */
144 
145 	ib_rbt_t*	wildcard_words;	/*!< words with wildcard */
146 
147 	bool		multi_exist;	/*!< multiple FTS_EXIST oper */
148 	byte		visiting_sub_exp; /*!< count of nested
149 					fts_ast_visit_sub_exp() */
150 
151 	st_mysql_ftparser*	parser;	/*!< fts plugin parser */
152 };
153 
154 /** For phrase matching, first we collect the documents and the positions
155 then we match. */
156 struct fts_match_t {
157 	doc_id_t	doc_id;		/*!< Document id */
158 
159 	ulint		start;		/*!< Start the phrase match from
160 					this offset within the positions
161 					vector. */
162 
163 	ib_vector_t*	positions;	/*!< Offsets of a word in a
164 					document */
165 };
166 
167 /** For matching tokens in a phrase search. We use this data structure in
168 the callback that determines whether a document should be accepted or
169 rejected for a phrase search. */
170 struct fts_select_t {
171 	doc_id_t	doc_id;		/*!< The document id to match */
172 
173 	ulint		min_pos;	/*!< For found to be TRUE at least
174 					one position must be greater than
175 					min_pos. */
176 
177 	ibool		found;		/*!< TRUE if found */
178 
179 	fts_word_freq_t*
180 			word_freq;	/*!< Word frequency instance of the
181 					current word being looked up in
182 					the FTS index */
183 };
184 
185 typedef std::vector<ulint, ut_allocator<ulint> >       pos_vector_t;
186 
187 /** structure defines a set of ranges for original documents, each of which
188 has a minimum position and maximum position. Text in such range should
189 contain all words in the proximity search. We will need to count the
190 words in such range to make sure it is less than the specified distance
191 of the proximity search */
192 struct fts_proximity_t {
193 	ulint		n_pos;		/*!< number of position set, defines
194 					a range (min to max) containing all
195 					matching words */
196 	pos_vector_t	min_pos;	/*!< the minimum position (in bytes)
197 					of the range */
198 	pos_vector_t	max_pos;	/*!< the maximum position (in bytes)
199 					of the range */
200 };
201 
202 /** The match positions and tokesn to match */
203 struct fts_phrase_t {
fts_phrase_tfts_phrase_t204 	fts_phrase_t(const dict_table_t* table)
205 		:
206 		found(false),
207 		match(NULL),
208 		tokens(NULL),
209 		distance(0),
210 		charset(NULL),
211 		heap(NULL),
212 		zip_size(table->space->zip_size()),
213 		proximity_pos(NULL),
214 		parser(NULL)
215 	{
216 	}
217 
218 	/** Match result */
219 	ibool			found;
220 
221 	/** Positions within text */
222 	const fts_match_t*	match;
223 
224 	/** Tokens to match */
225 	const ib_vector_t*	tokens;
226 
227 	/** For matching on proximity distance. Can be 0 for exact match */
228 	ulint			distance;
229 
230 	/** Phrase match charset */
231 	CHARSET_INFO*		charset;
232 
233 	/** Heap for word processing */
234 	mem_heap_t*		heap;
235 
236 	/** ROW_FORMAT=COMPRESSED page size, or 0 */
237 	const ulint		zip_size;
238 
239 	/** Position info for proximity search verification. Records the
240 	min and max position of words matched */
241 	fts_proximity_t*	proximity_pos;
242 
243 	/** FTS plugin parser */
244 	st_mysql_ftparser*	parser;
245 };
246 
247 /** Paramter passed to fts phrase match by parser */
248 struct fts_phrase_param_t {
249 	fts_phrase_t*	phrase;		/*!< Match phrase instance */
250 	ulint		token_index;	/*!< Index of token to match next */
251 	mem_heap_t*	heap;		/*!< Heap for word processing */
252 };
253 
254 /** For storing the frequncy of a word/term in a document */
255 struct fts_doc_freq_t {
256 	doc_id_t	doc_id;		/*!< Document id */
257 	ulint		freq;		/*!< Frequency of a word in a document */
258 };
259 
260 /** To determine the word frequency per document. */
261 struct fts_word_freq_t {
262 	fts_string_t	word;		/*!< Word for which we need the freq,
263 					it's allocated on the query heap */
264 
265 	ib_rbt_t*	doc_freqs;	/*!< RB Tree for storing per document
266 					word frequencies. The elements are
267 					of type fts_doc_freq_t */
268 	ib_uint64_t	doc_count;	/*!< Total number of documents that
269 					contain this word */
270 	double		idf;		/*!< Inverse document frequency */
271 };
272 
273 /********************************************************************
274 Callback function to fetch the rows in an FTS INDEX record.
275 @return always TRUE */
276 static
277 ibool
278 fts_query_index_fetch_nodes(
279 /*========================*/
280 	void*		row,		/*!< in: sel_node_t* */
281 	void*		user_arg);	/*!< in: pointer to ib_vector_t */
282 
283 /********************************************************************
284 Read and filter nodes.
285 @return fts_node_t instance */
286 static
287 dberr_t
288 fts_query_filter_doc_ids(
289 /*=====================*/
290 	fts_query_t*		query,		/*!< in: query instance */
291 	const fts_string_t*	word,		/*!< in: the current word */
292 	fts_word_freq_t*	word_freq,	/*!< in/out: word frequency */
293 	const fts_node_t*	node,		/*!< in: current FTS node */
294 	void*			data,		/*!< in: doc id ilist */
295 	ulint			len,		/*!< in: doc id ilist size */
296 	ibool			calc_doc_count);/*!< in: whether to remember doc
297 						count */
298 
299 /** Process (nested) sub-expression, create a new result set to store the
300 sub-expression result by processing nodes under current sub-expression
301 list. Merge the sub-expression result with that of parent expression list.
302 @param[in,out]	node	current root node
303 @param[in,out]	visitor	callback function
304 @param[in,out]	arg	argument for callback
305 @return DB_SUCCESS if all go well */
306 static
307 dberr_t
308 fts_ast_visit_sub_exp(
309 	fts_ast_node_t*		node,
310 	fts_ast_callback	visitor,
311 	void*			arg);
312 
313 #if 0
314 /*****************************************************************//***
315 Find a doc_id in a word's ilist.
316 @return TRUE if found. */
317 static
318 ibool
319 fts_query_find_doc_id(
320 /*==================*/
321 	fts_select_t*	select,		/*!< in/out: search the doc id selected,
322 					update the frequency if found. */
323 	void*		data,		/*!< in: doc id ilist */
324 	ulint		len);		/*!< in: doc id ilist size */
325 #endif
326 
327 /*************************************************************//**
328 This function implements a simple "blind" query expansion search:
329 words in documents found in the first search pass will be used as
330 search arguments to search the document again, thus "expand"
331 the search result set.
332 @return DB_SUCCESS if success, otherwise the error code */
333 static
334 dberr_t
335 fts_expand_query(
336 /*=============*/
337 	dict_index_t*	index,		/*!< in: FTS index to search */
338 	fts_query_t*	query)		/*!< in: query result, to be freed
339 					by the client */
340 	MY_ATTRIBUTE((nonnull, warn_unused_result));
341 /*************************************************************//**
342 This function finds documents that contain all words in a
343 phrase or proximity search. And if proximity search, verify
344 the words are close enough to each other, as in specified distance.
345 This function is called for phrase and proximity search.
346 @return TRUE if documents are found, FALSE if otherwise */
347 static
348 ibool
349 fts_phrase_or_proximity_search(
350 /*===========================*/
351 	fts_query_t*	query,		/*!< in/out:  query instance
352 					query->doc_ids might be instantiated
353 					with qualified doc IDs */
354 	ib_vector_t*	tokens);	/*!< in: Tokens contain words */
355 /*************************************************************//**
356 This function checks whether words in result documents are close to
357 each other (within proximity range as specified by "distance").
358 If "distance" is MAX_ULINT, then it will find all combinations of
359 positions of matching words and store min and max positions
360 in the "qualified_pos" for later verification.
361 @return true if words are close to each other, false if otherwise */
362 static
363 bool
364 fts_proximity_get_positions(
365 /*========================*/
366 	fts_match_t**		match,		/*!< in: query instance */
367 	ulint			num_match,	/*!< in: number of matching
368 						items */
369 	ulint			distance,	/*!< in: distance value
370 						for proximity search */
371 	fts_proximity_t*	qualified_pos);	/*!< out: the position info
372 						records ranges containing
373 						all matching words. */
374 #if 0
375 /********************************************************************
376 Get the total number of words in a documents. */
377 static
378 ulint
379 fts_query_terms_in_document(
380 /*========================*/
381 					/*!< out: DB_SUCCESS if all go well
382 					else error code */
383 	fts_query_t*	query,		/*!< in: FTS query state */
384 	doc_id_t	doc_id,		/*!< in: the word to check */
385 	ulint*		total);		/*!< out: total words in document */
386 #endif
387 
388 /********************************************************************
389 Compare two fts_doc_freq_t doc_ids.
390 @return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
391 UNIV_INLINE
392 int
fts_freq_doc_id_cmp(const void * p1,const void * p2)393 fts_freq_doc_id_cmp(
394 /*================*/
395 	const void*	p1,			/*!< in: id1 */
396 	const void*	p2)			/*!< in: id2 */
397 {
398 	const fts_doc_freq_t*	fq1 = (const fts_doc_freq_t*) p1;
399 	const fts_doc_freq_t*	fq2 = (const fts_doc_freq_t*) p2;
400 
401 	return((int) (fq1->doc_id - fq2->doc_id));
402 }
403 
404 #if 0
405 /*******************************************************************//**
406 Print the table used for calculating LCS. */
407 static
408 void
409 fts_print_lcs_table(
410 /*================*/
411 	const ulint*	table,		/*!< in: array to print */
412 	ulint		n_rows,		/*!< in: total no. of rows */
413 	ulint		n_cols)		/*!< in: total no. of cols */
414 {
415 	ulint		i;
416 
417 	for (i = 0; i < n_rows; ++i) {
418 		ulint	j;
419 
420 		printf("\n");
421 
422 		for (j = 0; j < n_cols; ++j) {
423 
424 			printf("%2lu ", FTS_ELEM(table, n_cols, i, j));
425 		}
426 	}
427 }
428 
429 /********************************************************************
430 Find the longest common subsequence between the query string and
431 the document. */
432 static
433 ulint
434 fts_query_lcs(
435 /*==========*/
436 					/*!< out: LCS (length) between
437 					two ilists */
438 	const	ulint*	p1,		/*!< in: word positions of query */
439 	ulint	len_p1,			/*!< in: no. of elements in p1 */
440 	const	ulint*	p2,		/*!< in: word positions within document */
441 	ulint	len_p2)			/*!< in: no. of elements in p2 */
442 {
443 	int	i;
444 	ulint	len = 0;
445 	ulint	r = len_p1;
446 	ulint	c = len_p2;
447 	ulint	size = (r + 1) * (c + 1) * sizeof(ulint);
448 	ulint*	table = (ulint*) ut_malloc_nokey(size);
449 
450 	/* Traverse the table backwards, from the last row to the first and
451 	also from the last column to the first. We compute the smaller
452 	common subsequeces first, then use the caluclated values to determine
453 	the longest common subsequence. The result will be in TABLE[0][0]. */
454 	for (i = r; i >= 0; --i) {
455 		int	j;
456 
457 		for (j = c; j >= 0; --j) {
458 
459 			if (p1[i] == (ulint) -1 || p2[j] == (ulint) -1) {
460 
461 				FTS_ELEM(table, c, i, j) = 0;
462 
463 			} else if (p1[i] == p2[j]) {
464 
465 				FTS_ELEM(table, c, i, j) = FTS_ELEM(
466 					table, c, i + 1, j + 1) + 1;
467 
468 			} else {
469 
470 				ulint	value;
471 
472 				value = ut_max(
473 					FTS_ELEM(table, c, i + 1, j),
474 					FTS_ELEM(table, c, i, j + 1));
475 
476 				FTS_ELEM(table, c, i, j) = value;
477 			}
478 		}
479 	}
480 
481 	len = FTS_ELEM(table, c, 0, 0);
482 
483 	fts_print_lcs_table(table, r, c);
484 	printf("\nLen=" ULINTPF "\n", len);
485 
486 	ut_free(table);
487 
488 	return(len);
489 }
490 #endif
491 
492 /*******************************************************************//**
493 Compare two fts_ranking_t instance on their rank value and doc ids in
494 descending order on the rank and ascending order on doc id.
495 @return 0 if p1 == p2, < 0 if p1 < p2, > 0 if p1 > p2 */
496 static
497 int
fts_query_compare_rank(const void * p1,const void * p2)498 fts_query_compare_rank(
499 /*===================*/
500 	const void*	p1,		/*!< in: pointer to elem */
501 	const void*	p2)		/*!< in: pointer to elem */
502 {
503 	const fts_ranking_t*	r1 = (const fts_ranking_t*) p1;
504 	const fts_ranking_t*	r2 = (const fts_ranking_t*) p2;
505 
506 	if (r2->rank < r1->rank) {
507 		return(-1);
508 	} else if (r2->rank == r1->rank) {
509 
510 		if (r1->doc_id < r2->doc_id) {
511 			return(1);
512 		} else if (r1->doc_id > r2->doc_id) {
513 			return(1);
514 		}
515 
516 		return(0);
517 	}
518 
519 	return(1);
520 }
521 
522 /*******************************************************************//**
523 Create words in ranking */
524 static
525 void
fts_ranking_words_create(fts_query_t * query,fts_ranking_t * ranking)526 fts_ranking_words_create(
527 /*=====================*/
528 	fts_query_t*	query,		/*!< in: query instance */
529 	fts_ranking_t*	ranking)	/*!< in: ranking instance */
530 {
531 	ranking->words = static_cast<byte*>(
532 		mem_heap_zalloc(query->heap, RANKING_WORDS_INIT_LEN));
533 	ranking->words_len = RANKING_WORDS_INIT_LEN;
534 }
535 
536 /*
537 The optimization here is using a char array(bitmap) to replace words rb tree
538 in fts_ranking_t.
539 
540 It can save lots of memory except in some cases of QUERY EXPANSION.
541 
542 'word_map' is used as a word dictionary, in which the key is a word, the value
543 is a number. In 'fts_ranking_words_add', we first check if the word is in 'word_map'.
544 if not, we add it into 'word_map', and give it a position(actually a number).
545 then we set the corresponding bit to '1' at the position in the char array 'words'.
546 
547 'word_vector' is a useful backup of 'word_map', and we can get a word by its position,
548 more quickly than searching by value in 'word_map'. we use 'word_vector'
549 in 'fts_query_calculate_ranking' and 'fts_expand_query'. In the two functions, we need
550 to scan the bitmap 'words', and get a word when a bit is '1', then we get word_freq
551 by the word.
552 */
553 
554 /*******************************************************************//**
555 Add a word into ranking */
556 static
557 void
fts_ranking_words_add(fts_query_t * query,fts_ranking_t * ranking,const fts_string_t * word)558 fts_ranking_words_add(
559 /*==================*/
560 	fts_query_t*		query,		/*!< in: query instance */
561 	fts_ranking_t*		ranking,	/*!< in: ranking instance */
562 	const fts_string_t*	word)		/*!< in: term/word to add */
563 {
564 	ulint	pos;
565 	ulint	byte_offset;
566 	ulint	bit_offset;
567 	ib_rbt_bound_t	parent;
568 
569 	/* Note: we suppose the word map and vector are append-only. */
570 	ut_ad(query->word_vector->size() == rbt_size(query->word_map));
571 
572 	/* We use ib_rbt to simulate a map, f_n_char means position. */
573 	if (rbt_search(query->word_map, &parent, word) == 0) {
574 		fts_string_t*	result_word;
575 
576 		result_word = rbt_value(fts_string_t, parent.last);
577 		pos = result_word->f_n_char;
578 		ut_ad(pos < rbt_size(query->word_map));
579 	} else {
580 		/* Add the word to map. */
581 		fts_string_t	new_word;
582 
583 		pos = rbt_size(query->word_map);
584 
585 		fts_string_dup(&new_word, word, query->heap);
586 		new_word.f_n_char = pos;
587 
588 		rbt_add_node(query->word_map, &parent, &new_word);
589 		ut_ad(rbt_validate(query->word_map));
590 		query->word_vector->push_back(new_word);
591 	}
592 
593 	/* Check words len */
594 	byte_offset = pos / CHAR_BIT;
595 	if (byte_offset >= ranking->words_len) {
596 		byte*	words = ranking->words;
597 		ulint	words_len = ranking->words_len;
598 
599 		while (byte_offset >= words_len) {
600 			words_len *= 2;
601 		}
602 
603 		ranking->words = static_cast<byte*>(
604 			mem_heap_zalloc(query->heap, words_len));
605 		ut_memcpy(ranking->words, words, ranking->words_len);
606 		ranking->words_len = words_len;
607 	}
608 
609 	/* Set ranking words */
610 	ut_ad(byte_offset < ranking->words_len);
611 	bit_offset = pos % CHAR_BIT;
612 	ranking->words[byte_offset] |= 1 << bit_offset;
613 }
614 
615 /*******************************************************************//**
616 Get a word from a ranking
617 @return true if it's successful */
618 static
619 bool
fts_ranking_words_get_next(const fts_query_t * query,fts_ranking_t * ranking,ulint * pos,fts_string_t * word)620 fts_ranking_words_get_next(
621 /*=======================*/
622 	const	fts_query_t*	query,	/*!< in: query instance */
623 	fts_ranking_t*		ranking,/*!< in: ranking instance */
624 	ulint*			pos,	/*!< in/out: word start pos */
625 	fts_string_t*		word)	/*!< in/out: term/word to add */
626 {
627 	bool	ret = false;
628 	ulint	max_pos = ranking->words_len * CHAR_BIT;
629 
630 	/* Search for next word */
631 	while (*pos < max_pos) {
632 		ulint	byte_offset = *pos / CHAR_BIT;
633 		ulint	bit_offset = *pos % CHAR_BIT;
634 
635 		if (ranking->words[byte_offset] & (1 << bit_offset)) {
636 			ret = true;
637 			break;
638 		}
639 
640 		*pos += 1;
641 	};
642 
643 	/* Get next word from word vector */
644 	if (ret) {
645 		ut_ad(*pos < query->word_vector->size());
646 		*word = query->word_vector->at((size_t)*pos);
647 		*pos += 1;
648 	}
649 
650 	return ret;
651 }
652 
653 /*******************************************************************//**
654 Add a word if it doesn't exist, to the term freq RB tree. We store
655 a pointer to the word that is passed in as the argument.
656 @return pointer to word */
657 static
658 fts_word_freq_t*
fts_query_add_word_freq(fts_query_t * query,const fts_string_t * word)659 fts_query_add_word_freq(
660 /*====================*/
661 	fts_query_t*		query,		/*!< in: query instance */
662 	const fts_string_t*	word)		/*!< in: term/word to add */
663 {
664 	ib_rbt_bound_t		parent;
665 
666 	/* Lookup the word in our rb tree and add if it doesn't exist. */
667 	if (rbt_search(query->word_freqs, &parent, word) != 0) {
668 		fts_word_freq_t	word_freq;
669 
670 		memset(&word_freq, 0, sizeof(word_freq));
671 
672 		fts_string_dup(&word_freq.word, word, query->heap);
673 
674 		word_freq.doc_count = 0;
675 
676 		word_freq.doc_freqs = rbt_create(
677 			sizeof(fts_doc_freq_t), fts_freq_doc_id_cmp);
678 
679 		parent.last = rbt_add_node(
680 			query->word_freqs, &parent, &word_freq);
681 
682 		query->total_size += word->f_len
683 			+ SIZEOF_RBT_CREATE
684 			+ SIZEOF_RBT_NODE_ADD
685 			+ sizeof(fts_word_freq_t);
686 	}
687 
688 	return(rbt_value(fts_word_freq_t, parent.last));
689 }
690 
691 /*******************************************************************//**
692 Add a doc id if it doesn't exist, to the doc freq RB tree.
693 @return pointer to word */
694 static
695 fts_doc_freq_t*
fts_query_add_doc_freq(fts_query_t * query,ib_rbt_t * doc_freqs,doc_id_t doc_id)696 fts_query_add_doc_freq(
697 /*===================*/
698 	fts_query_t*	query,		/*!< in: query instance	*/
699 	ib_rbt_t*	doc_freqs,	/*!< in: rb tree of fts_doc_freq_t */
700 	doc_id_t	doc_id)		/*!< in: doc id to add */
701 {
702 	ib_rbt_bound_t	parent;
703 
704 	/* Lookup the doc id in our rb tree and add if it doesn't exist. */
705 	if (rbt_search(doc_freqs, &parent, &doc_id) != 0) {
706 		fts_doc_freq_t	doc_freq;
707 
708 		memset(&doc_freq, 0, sizeof(doc_freq));
709 
710 		doc_freq.freq = 0;
711 		doc_freq.doc_id = doc_id;
712 
713 		parent.last = rbt_add_node(doc_freqs, &parent, &doc_freq);
714 
715 		query->total_size += SIZEOF_RBT_NODE_ADD
716 			+ sizeof(fts_doc_freq_t);
717 	}
718 
719 	return(rbt_value(fts_doc_freq_t, parent.last));
720 }
721 
722 /*******************************************************************//**
723 Add the doc id to the query set only if it's not in the
724 deleted array. */
725 static
726 void
fts_query_union_doc_id(fts_query_t * query,doc_id_t doc_id,fts_rank_t rank)727 fts_query_union_doc_id(
728 /*===================*/
729 	fts_query_t*	query,		/*!< in: query instance */
730 	doc_id_t	doc_id,		/*!< in: the doc id to add */
731 	fts_rank_t	rank)		/*!< in: if non-zero, it is the
732 					rank associated with the doc_id */
733 {
734 	ib_rbt_bound_t	parent;
735 	ulint		size = ib_vector_size(query->deleted->doc_ids);
736 	doc_id_t*	updates = (doc_id_t*) query->deleted->doc_ids->data;
737 
738 	/* Check if the doc id is deleted and it's not already in our set. */
739 	if (fts_bsearch(updates, 0, static_cast<int>(size), doc_id) < 0
740 	    && rbt_search(query->doc_ids, &parent, &doc_id) != 0) {
741 
742 		fts_ranking_t	ranking;
743 
744 		ranking.rank = rank;
745 		ranking.doc_id = doc_id;
746 		fts_ranking_words_create(query, &ranking);
747 
748 		rbt_add_node(query->doc_ids, &parent, &ranking);
749 
750 		query->total_size += SIZEOF_RBT_NODE_ADD
751 			+ sizeof(fts_ranking_t) + RANKING_WORDS_INIT_LEN;
752 	}
753 }
754 
755 /*******************************************************************//**
756 Remove the doc id from the query set only if it's not in the
757 deleted set. */
758 static
759 void
fts_query_remove_doc_id(fts_query_t * query,doc_id_t doc_id)760 fts_query_remove_doc_id(
761 /*====================*/
762 	fts_query_t*	query,		/*!< in: query instance */
763 	doc_id_t	doc_id)		/*!< in: the doc id to add */
764 {
765 	ib_rbt_bound_t	parent;
766 	ulint		size = ib_vector_size(query->deleted->doc_ids);
767 	doc_id_t*	updates = (doc_id_t*) query->deleted->doc_ids->data;
768 
769 	/* Check if the doc id is deleted and it's in our set. */
770 	if (fts_bsearch(updates, 0, static_cast<int>(size), doc_id) < 0
771 	    && rbt_search(query->doc_ids, &parent, &doc_id) == 0) {
772 		ut_free(rbt_remove_node(query->doc_ids, parent.last));
773 
774 		ut_ad(query->total_size >=
775 		      SIZEOF_RBT_NODE_ADD + sizeof(fts_ranking_t));
776 		query->total_size -= SIZEOF_RBT_NODE_ADD
777 			+ sizeof(fts_ranking_t);
778 	}
779 }
780 
781 /*******************************************************************//**
782 Find the doc id in the query set but not in the deleted set, artificialy
783 downgrade or upgrade its ranking by a value and make/initialize its ranking
784 under or above its normal range 0 to 1. This is used for Boolean Search
785 operator such as Negation operator, which makes word's contribution to the
786 row's relevance to be negative */
787 static
788 void
fts_query_change_ranking(fts_query_t * query,doc_id_t doc_id,ibool downgrade)789 fts_query_change_ranking(
790 /*====================*/
791 	fts_query_t*	query,		/*!< in: query instance */
792 	doc_id_t	doc_id,		/*!< in: the doc id to add */
793 	ibool		downgrade)	/*!< in: Whether to downgrade ranking */
794 {
795 	ib_rbt_bound_t	parent;
796 	ulint		size = ib_vector_size(query->deleted->doc_ids);
797 	doc_id_t*	updates = (doc_id_t*) query->deleted->doc_ids->data;
798 
799 	/* Check if the doc id is deleted and it's in our set. */
800 	if (fts_bsearch(updates, 0, static_cast<int>(size), doc_id) < 0
801 	    && rbt_search(query->doc_ids, &parent, &doc_id) == 0) {
802 
803 		fts_ranking_t*	ranking;
804 
805 		ranking = rbt_value(fts_ranking_t, parent.last);
806 
807 		ranking->rank += downgrade ? RANK_DOWNGRADE : RANK_UPGRADE;
808 
809 		/* Allow at most 2 adjustment by RANK_DOWNGRADE (-0.5)
810 		and RANK_UPGRADE (0.5) */
811 		if (ranking->rank >= 1.0F) {
812 			ranking->rank = 1.0F;
813 		} else if (ranking->rank <= -1.0F) {
814 			ranking->rank = -1.0F;
815 		}
816 	}
817 }
818 
819 /*******************************************************************//**
820 Check the doc id in the query set only if it's not in the
821 deleted array. The doc ids that were found are stored in
822 another rb tree (fts_query_t::intersect). */
823 static
824 void
fts_query_intersect_doc_id(fts_query_t * query,doc_id_t doc_id,fts_rank_t rank)825 fts_query_intersect_doc_id(
826 /*=======================*/
827 	fts_query_t*	query,		/*!< in: query instance */
828 	doc_id_t	doc_id,		/*!< in: the doc id to add */
829 	fts_rank_t	rank)		/*!< in: if non-zero, it is the
830 					rank associated with the doc_id */
831 {
832 	ib_rbt_bound_t	parent;
833 	ulint		size = ib_vector_size(query->deleted->doc_ids);
834 	doc_id_t*	updates = (doc_id_t*) query->deleted->doc_ids->data;
835 	fts_ranking_t*	ranking= NULL;
836 
837 	/* There are three types of intersect:
838 	   1. '+a': doc_ids is empty, add doc into intersect if it matches 'a'.
839 	   2. 'a +b': docs match 'a' is in doc_ids, add doc into intersect
840 	      if it matches 'b'. if the doc is also in  doc_ids, then change the
841 	      doc's rank, and add 'a' in doc's words.
842 	   3. '+a +b': docs matching '+a' is in doc_ids, add doc into intsersect
843 	      if it matches 'b' and it's in doc_ids.(multi_exist = true). */
844 
845 	/* Check if the doc id is deleted and it's in our set */
846 	if (fts_bsearch(updates, 0, static_cast<int>(size), doc_id) < 0) {
847 		fts_ranking_t	new_ranking;
848 
849 		if (rbt_search(query->doc_ids, &parent, &doc_id) != 0) {
850 			if (query->multi_exist) {
851 				return;
852 			} else {
853 				new_ranking.words = NULL;
854 			}
855 		} else {
856 			ranking = rbt_value(fts_ranking_t, parent.last);
857 
858 			/* We've just checked the doc id before */
859 			if (ranking->words == NULL) {
860 				ut_ad(rbt_search(query->intersection, &parent,
861 					ranking) == 0);
862 				return;
863 			}
864 
865 			/* Merge rank */
866 			rank += ranking->rank;
867 			if (rank >= 1.0F) {
868 				rank = 1.0F;
869 			} else if (rank <= -1.0F) {
870 				rank = -1.0F;
871 			}
872 
873 			/* Take words */
874 			new_ranking.words = ranking->words;
875 			new_ranking.words_len = ranking->words_len;
876 		}
877 
878 		new_ranking.rank = rank;
879 		new_ranking.doc_id = doc_id;
880 
881 		if (rbt_search(query->intersection, &parent,
882 			       &new_ranking) != 0) {
883 			if (new_ranking.words == NULL) {
884 				fts_ranking_words_create(query, &new_ranking);
885 
886 				query->total_size += RANKING_WORDS_INIT_LEN;
887 			} else {
888 				/* Note that the intersection has taken
889 				ownership of the ranking data. */
890 				ranking->words = NULL;
891 			}
892 
893 			rbt_add_node(query->intersection,
894 				     &parent, &new_ranking);
895 
896 			query->total_size += SIZEOF_RBT_NODE_ADD
897 				+ sizeof(fts_ranking_t);
898 		}
899 	}
900 }
901 
902 /*******************************************************************//**
903 Free the document ranking rb tree. */
904 static
905 void
fts_query_free_doc_ids(fts_query_t * query,ib_rbt_t * doc_ids)906 fts_query_free_doc_ids(
907 /*===================*/
908 	fts_query_t*	query,		/*!< in: query instance */
909 	ib_rbt_t*	doc_ids)	/*!< in: rb tree to free */
910 {
911 	const ib_rbt_node_t*	node;
912 
913 	for (node = rbt_first(doc_ids); node; node = rbt_first(doc_ids)) {
914 
915 		fts_ranking_t*	ranking;
916 
917 		ranking = rbt_value(fts_ranking_t, node);
918 
919 		if (ranking->words) {
920 			ranking->words = NULL;
921 		}
922 
923 		ut_free(rbt_remove_node(doc_ids, node));
924 
925 		ut_ad(query->total_size >=
926 		      SIZEOF_RBT_NODE_ADD + sizeof(fts_ranking_t));
927 		query->total_size -= SIZEOF_RBT_NODE_ADD
928 			+ sizeof(fts_ranking_t);
929 	}
930 
931 	rbt_free(doc_ids);
932 
933 	ut_ad(query->total_size >= SIZEOF_RBT_CREATE);
934 	query->total_size -= SIZEOF_RBT_CREATE;
935 }
936 
937 /*******************************************************************//**
938 Add the word to the documents "list" of matching words from
939 the query. We make a copy of the word from the query heap. */
940 static
941 void
fts_query_add_word_to_document(fts_query_t * query,doc_id_t doc_id,const fts_string_t * word)942 fts_query_add_word_to_document(
943 /*===========================*/
944 	fts_query_t*		query,	/*!< in: query to update */
945 	doc_id_t		doc_id,	/*!< in: the document to update */
946 	const fts_string_t*	word)	/*!< in: the token to add */
947 {
948 	ib_rbt_bound_t		parent;
949 	fts_ranking_t*		ranking = NULL;
950 
951 	if (query->flags == FTS_OPT_RANKING) {
952 		return;
953 	}
954 
955 	/* First we search the intersection RB tree as it could have
956 	taken ownership of the words rb tree instance. */
957 	if (query->intersection
958 	    && rbt_search(query->intersection, &parent, &doc_id) == 0) {
959 
960 		ranking = rbt_value(fts_ranking_t, parent.last);
961 	}
962 
963 	if (ranking == NULL
964 	    && rbt_search(query->doc_ids, &parent, &doc_id) == 0) {
965 
966 		ranking = rbt_value(fts_ranking_t, parent.last);
967 	}
968 
969 	if (ranking != NULL) {
970 		fts_ranking_words_add(query, ranking, word);
971 	}
972 }
973 
974 /*******************************************************************//**
975 Check the node ilist. */
976 static
977 void
fts_query_check_node(fts_query_t * query,const fts_string_t * token,const fts_node_t * node)978 fts_query_check_node(
979 /*=================*/
980 	fts_query_t*		query,	/*!< in: query to update */
981 	const fts_string_t*	token,	/*!< in: the token to search */
982 	const fts_node_t*	node)	/*!< in: node to check */
983 {
984 	/* Skip nodes whose doc ids are out range. */
985 	if (query->oper == FTS_EXIST
986 	    && ((query->upper_doc_id > 0
987 		&& node->first_doc_id > query->upper_doc_id)
988 		|| (query->lower_doc_id > 0
989 		    && node->last_doc_id < query->lower_doc_id))) {
990 
991 		/* Ignore */
992 
993 	} else {
994 		int		ret;
995 		ib_rbt_bound_t	parent;
996 		ulint		ilist_size = node->ilist_size;
997 		fts_word_freq_t*word_freqs;
998 
999 		/* The word must exist. */
1000 		ret = rbt_search(query->word_freqs, &parent, token);
1001 		ut_a(ret == 0);
1002 
1003 		word_freqs = rbt_value(fts_word_freq_t, parent.last);
1004 
1005 		query->error = fts_query_filter_doc_ids(
1006 					query, token, word_freqs, node,
1007 					node->ilist, ilist_size, TRUE);
1008 	}
1009 }
1010 
1011 /*****************************************************************//**
1012 Search index cache for word with wildcard match.
1013 @return number of words matched */
1014 static
1015 ulint
fts_cache_find_wildcard(fts_query_t * query,const fts_index_cache_t * index_cache,const fts_string_t * token)1016 fts_cache_find_wildcard(
1017 /*====================*/
1018 	fts_query_t*		query,		/*!< in: query instance */
1019 	const fts_index_cache_t*index_cache,	/*!< in: cache to search */
1020 	const fts_string_t*	token)		/*!< in: token to search */
1021 {
1022 	ib_rbt_bound_t		parent;
1023 	const ib_vector_t*	nodes = NULL;
1024 	fts_string_t		srch_text;
1025 	byte			term[FTS_MAX_WORD_LEN + 1];
1026 	ulint			num_word = 0;
1027 
1028 	srch_text.f_len = (token->f_str[token->f_len - 1] == '%')
1029 			? token->f_len - 1
1030 			: token->f_len;
1031 
1032 	strncpy((char*) term, (char*) token->f_str, srch_text.f_len);
1033 	term[srch_text.f_len] = '\0';
1034 	srch_text.f_str = term;
1035 
1036 	/* Lookup the word in the rb tree */
1037 	if (rbt_search_cmp(index_cache->words, &parent, &srch_text, NULL,
1038 			   innobase_fts_text_cmp_prefix) == 0) {
1039 		const fts_tokenizer_word_t*     word;
1040 		ulint				i;
1041 		const ib_rbt_node_t*		cur_node;
1042 		ibool				forward = FALSE;
1043 
1044 		word = rbt_value(fts_tokenizer_word_t, parent.last);
1045 		cur_node = parent.last;
1046 
1047 		while (innobase_fts_text_cmp_prefix(
1048 			index_cache->charset, &srch_text, &word->text) == 0) {
1049 
1050 			nodes = word->nodes;
1051 
1052 			for (i = 0; nodes && i < ib_vector_size(nodes); ++i) {
1053 				int                     ret;
1054 				const fts_node_t*       node;
1055 				ib_rbt_bound_t          freq_parent;
1056 				fts_word_freq_t*	word_freqs;
1057 
1058 				node = static_cast<const fts_node_t*>(
1059 					ib_vector_get_const(nodes, i));
1060 
1061 				ret = rbt_search(query->word_freqs,
1062 						 &freq_parent,
1063 						 &srch_text);
1064 
1065 				ut_a(ret == 0);
1066 
1067 				word_freqs = rbt_value(
1068 					fts_word_freq_t,
1069 					freq_parent.last);
1070 
1071 				query->error = fts_query_filter_doc_ids(
1072 					query, &srch_text,
1073 					word_freqs, node,
1074 					node->ilist, node->ilist_size, TRUE);
1075 
1076 				if (query->error != DB_SUCCESS) {
1077 					return(0);
1078 				}
1079 			}
1080 
1081 			num_word++;
1082 
1083 			if (!forward) {
1084 				cur_node = rbt_prev(
1085 					index_cache->words, cur_node);
1086 			} else {
1087 cont_search:
1088 				cur_node = rbt_next(
1089 					index_cache->words, cur_node);
1090 			}
1091 
1092 			if (!cur_node) {
1093 				break;
1094 			}
1095 
1096 			word = rbt_value(fts_tokenizer_word_t, cur_node);
1097 		}
1098 
1099 		if (!forward) {
1100 			forward = TRUE;
1101 			cur_node = parent.last;
1102 			goto cont_search;
1103 		}
1104 	}
1105 
1106 	return(num_word);
1107 }
1108 
1109 /*****************************************************************//**
1110 Set difference.
1111 @return DB_SUCCESS if all go well */
1112 static MY_ATTRIBUTE((nonnull, warn_unused_result))
1113 dberr_t
fts_query_difference(fts_query_t * query,const fts_string_t * token)1114 fts_query_difference(
1115 /*=================*/
1116 	fts_query_t*		query,	/*!< in: query instance */
1117 	const fts_string_t*	token)	/*!< in: token to search */
1118 {
1119 	ulint			n_doc_ids= 0;
1120 	trx_t*			trx = query->trx;
1121 	dict_table_t*		table = query->index->table;
1122 
1123 	ut_a(query->oper == FTS_IGNORE);
1124 
1125 #ifdef FTS_INTERNAL_DIAG_PRINT
1126 	{
1127 		ib::info	out;
1128 		out << "DIFFERENCE: Searching: '";
1129 		out.write(token->f_str, token->f_len);
1130 		out << "'";
1131 	}
1132 #endif
1133 
1134 	if (query->doc_ids) {
1135 		n_doc_ids = rbt_size(query->doc_ids);
1136 	}
1137 
1138 	/* There is nothing we can substract from an empty set. */
1139 	if (query->doc_ids && !rbt_empty(query->doc_ids)) {
1140 		ulint			i;
1141 		fts_fetch_t		fetch;
1142 		const ib_vector_t*	nodes;
1143 		const fts_index_cache_t*index_cache;
1144 		que_t*			graph = NULL;
1145 		fts_cache_t*		cache = table->fts->cache;
1146 		dberr_t			error;
1147 
1148 		rw_lock_x_lock(&cache->lock);
1149 
1150 		index_cache = fts_find_index_cache(cache, query->index);
1151 
1152 		/* Must find the index cache */
1153 		ut_a(index_cache != NULL);
1154 
1155 		/* Search the cache for a matching word first. */
1156 		if (query->cur_node->term.wildcard
1157 		    && query->flags != FTS_PROXIMITY
1158 		    && query->flags != FTS_PHRASE) {
1159 			fts_cache_find_wildcard(query, index_cache, token);
1160 		} else {
1161 			nodes = fts_cache_find_word(index_cache, token);
1162 
1163 			for (i = 0; nodes && i < ib_vector_size(nodes)
1164 			     && query->error == DB_SUCCESS; ++i) {
1165 				const fts_node_t*	node;
1166 
1167 				node = static_cast<const fts_node_t*>(
1168 					ib_vector_get_const(nodes, i));
1169 
1170 				fts_query_check_node(query, token, node);
1171 			}
1172 		}
1173 
1174 		rw_lock_x_unlock(&cache->lock);
1175 
1176 		/* error is passed by 'query->error' */
1177 		if (query->error != DB_SUCCESS) {
1178 			ut_ad(query->error == DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
1179 			return(query->error);
1180 		}
1181 
1182 		/* Setup the callback args for filtering and
1183 		consolidating the ilist. */
1184 		fetch.read_arg = query;
1185 		fetch.read_record = fts_query_index_fetch_nodes;
1186 
1187 		error = fts_index_fetch_nodes(
1188 			trx, &graph, &query->fts_index_table, token, &fetch);
1189 
1190 		/* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */
1191 		ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS));
1192 		if (error != DB_SUCCESS) {
1193 			query->error = error;
1194 		}
1195 
1196 		fts_que_graph_free(graph);
1197 	}
1198 
1199 	/* The size can't increase. */
1200 	ut_a(rbt_size(query->doc_ids) <= n_doc_ids);
1201 
1202 	return(query->error);
1203 }
1204 
1205 /*****************************************************************//**
1206 Intersect the token doc ids with the current set.
1207 @return DB_SUCCESS if all go well */
1208 static MY_ATTRIBUTE((nonnull, warn_unused_result))
1209 dberr_t
fts_query_intersect(fts_query_t * query,const fts_string_t * token)1210 fts_query_intersect(
1211 /*================*/
1212 	fts_query_t*		query,	/*!< in: query instance */
1213 	const fts_string_t*	token)	/*!< in: the token to search */
1214 {
1215 	trx_t*			trx = query->trx;
1216 	dict_table_t*		table = query->index->table;
1217 
1218 	ut_a(query->oper == FTS_EXIST);
1219 
1220 #ifdef FTS_INTERNAL_DIAG_PRINT
1221 	{
1222 		ib::info	out;
1223 		out << "INTERSECT: Searching: '";
1224 		out.write(token->f_str, token->f_len);
1225 		out << "'";
1226 	}
1227 #endif
1228 
1229 	/* If the words set is not empty and multi exist is true,
1230 	we know the intersection set is empty in advance. */
1231 	if (!(rbt_empty(query->doc_ids) && query->multi_exist)) {
1232 		ulint                   n_doc_ids = 0;
1233 		ulint			i;
1234 		fts_fetch_t		fetch;
1235 		const ib_vector_t*	nodes;
1236 		const fts_index_cache_t*index_cache;
1237 		que_t*			graph = NULL;
1238 		fts_cache_t*		cache = table->fts->cache;
1239 		dberr_t			error;
1240 
1241 		ut_a(!query->intersection);
1242 
1243 		n_doc_ids = rbt_size(query->doc_ids);
1244 
1245 		/* Create the rb tree that will hold the doc ids of
1246 		the intersection. */
1247 		query->intersection = rbt_create(
1248 			sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
1249 
1250 		query->total_size += SIZEOF_RBT_CREATE;
1251 
1252 		/* This is to avoid decompressing the ilist if the
1253 		node's ilist doc ids are out of range. */
1254 		if (!rbt_empty(query->doc_ids) && query->multi_exist) {
1255 			const ib_rbt_node_t*	node;
1256 			doc_id_t*		doc_id;
1257 
1258 			node = rbt_first(query->doc_ids);
1259 			doc_id = rbt_value(doc_id_t, node);
1260 			query->lower_doc_id = *doc_id;
1261 
1262 			node = rbt_last(query->doc_ids);
1263 			doc_id = rbt_value(doc_id_t, node);
1264 			query->upper_doc_id = *doc_id;
1265 
1266 		} else {
1267 			query->lower_doc_id = 0;
1268 			query->upper_doc_id = 0;
1269 		}
1270 
1271 		/* Search the cache for a matching word first. */
1272 
1273 		rw_lock_x_lock(&cache->lock);
1274 
1275 		/* Search for the index specific cache. */
1276 		index_cache = fts_find_index_cache(cache, query->index);
1277 
1278 		/* Must find the index cache. */
1279 		ut_a(index_cache != NULL);
1280 
1281 		if (query->cur_node->term.wildcard) {
1282 			/* Wildcard search the index cache */
1283 			fts_cache_find_wildcard(query, index_cache, token);
1284 		} else {
1285 			nodes = fts_cache_find_word(index_cache, token);
1286 
1287 			for (i = 0; nodes && i < ib_vector_size(nodes)
1288 			     && query->error == DB_SUCCESS; ++i) {
1289 				const fts_node_t*	node;
1290 
1291 				node = static_cast<const fts_node_t*>(
1292 					ib_vector_get_const(nodes, i));
1293 
1294 				fts_query_check_node(query, token, node);
1295 			}
1296 		}
1297 
1298 		rw_lock_x_unlock(&cache->lock);
1299 
1300 		/* error is passed by 'query->error' */
1301 		if (query->error != DB_SUCCESS) {
1302 			ut_ad(query->error == DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
1303 			return(query->error);
1304 		}
1305 
1306 		/* Setup the callback args for filtering and
1307 		consolidating the ilist. */
1308 		fetch.read_arg = query;
1309 		fetch.read_record = fts_query_index_fetch_nodes;
1310 
1311 		error = fts_index_fetch_nodes(
1312 			trx, &graph, &query->fts_index_table, token, &fetch);
1313 
1314 		/* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */
1315 		ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS));
1316 		if (error != DB_SUCCESS) {
1317 			query->error = error;
1318 		}
1319 
1320 		fts_que_graph_free(graph);
1321 
1322 		if (query->error == DB_SUCCESS) {
1323 			/* Make the intesection (rb tree) the current doc id
1324 			set and free the old set. */
1325 			fts_query_free_doc_ids(query, query->doc_ids);
1326 			query->doc_ids = query->intersection;
1327 			query->intersection = NULL;
1328 
1329 			ut_a(!query->multi_exist || (query->multi_exist
1330 			     && rbt_size(query->doc_ids) <= n_doc_ids));
1331 		}
1332 	}
1333 
1334 	return(query->error);
1335 }
1336 
1337 /*****************************************************************//**
1338 Query index cache.
1339 @return DB_SUCCESS if all go well */
1340 static
1341 dberr_t
fts_query_cache(fts_query_t * query,const fts_string_t * token)1342 fts_query_cache(
1343 /*============*/
1344 	fts_query_t*		query,	/*!< in/out: query instance */
1345 	const fts_string_t*	token)	/*!< in: token to search */
1346 {
1347 	const fts_index_cache_t*index_cache;
1348 	dict_table_t*		table = query->index->table;
1349 	fts_cache_t*		cache = table->fts->cache;
1350 
1351 	/* Search the cache for a matching word first. */
1352 	rw_lock_x_lock(&cache->lock);
1353 
1354 	/* Search for the index specific cache. */
1355 	index_cache = fts_find_index_cache(cache, query->index);
1356 
1357 	/* Must find the index cache. */
1358 	ut_a(index_cache != NULL);
1359 
1360 	if (query->cur_node->term.wildcard
1361 	    && query->flags != FTS_PROXIMITY
1362 	    && query->flags != FTS_PHRASE) {
1363 		/* Wildcard search the index cache */
1364 		fts_cache_find_wildcard(query, index_cache, token);
1365 	} else {
1366 		const ib_vector_t*      nodes;
1367 		ulint			i;
1368 
1369 		nodes = fts_cache_find_word(index_cache, token);
1370 
1371 		for (i = 0; nodes && i < ib_vector_size(nodes)
1372 		     && query->error == DB_SUCCESS; ++i) {
1373 			const fts_node_t*	node;
1374 
1375 			node = static_cast<const fts_node_t*>(
1376 				ib_vector_get_const(nodes, i));
1377 
1378 			fts_query_check_node(query, token, node);
1379 		}
1380 	}
1381 
1382 	rw_lock_x_unlock(&cache->lock);
1383 
1384 	return(query->error);
1385 }
1386 
1387 /*****************************************************************//**
1388 Set union.
1389 @return DB_SUCCESS if all go well */
1390 static MY_ATTRIBUTE((nonnull, warn_unused_result))
1391 dberr_t
fts_query_union(fts_query_t * query,fts_string_t * token)1392 fts_query_union(
1393 /*============*/
1394 	fts_query_t*		query,	/*!< in: query instance */
1395 	fts_string_t*		token)	/*!< in: token to search */
1396 {
1397 	fts_fetch_t		fetch;
1398 	ulint			n_doc_ids = 0;
1399 	trx_t*			trx = query->trx;
1400 	que_t*			graph = NULL;
1401 	dberr_t			error;
1402 
1403 	ut_a(query->oper == FTS_NONE || query->oper == FTS_DECR_RATING ||
1404 	     query->oper == FTS_NEGATE || query->oper == FTS_INCR_RATING);
1405 
1406 #ifdef FTS_INTERNAL_DIAG_PRINT
1407 	{
1408 		ib::info	out;
1409 		out << "UNION: Searching: '";
1410 		out.write(token->f_str, token->f_len);
1411 		out << "'";
1412 	}
1413 #endif
1414 
1415 	if (query->doc_ids) {
1416 		n_doc_ids = rbt_size(query->doc_ids);
1417 	}
1418 
1419 	if (token->f_len == 0) {
1420 		return(query->error);
1421 	}
1422 
1423 	fts_query_cache(query, token);
1424 
1425 	/* Setup the callback args for filtering and
1426 	consolidating the ilist. */
1427 	fetch.read_arg = query;
1428 	fetch.read_record = fts_query_index_fetch_nodes;
1429 
1430 	/* Read the nodes from disk. */
1431 	error = fts_index_fetch_nodes(
1432 		trx, &graph, &query->fts_index_table, token, &fetch);
1433 
1434 	/* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */
1435 	ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS));
1436 	if (error != DB_SUCCESS) {
1437 		query->error = error;
1438 	}
1439 
1440 	fts_que_graph_free(graph);
1441 
1442 	if (query->error == DB_SUCCESS) {
1443 
1444 		/* The size can't decrease. */
1445 		ut_a(rbt_size(query->doc_ids) >= n_doc_ids);
1446 
1447 		/* Calulate the number of doc ids that were added to
1448 		the current doc id set. */
1449 		if (query->doc_ids) {
1450 			n_doc_ids = rbt_size(query->doc_ids) - n_doc_ids;
1451 		}
1452 	}
1453 
1454 	return(query->error);
1455 }
1456 
1457 /*****************************************************************//**
1458 Depending upon the current query operator process the doc id.
1459 return DB_SUCCESS if all go well
1460 or return DB_FTS_EXCEED_RESULT_CACHE_LIMIT */
1461 static
1462 dberr_t
fts_query_process_doc_id(fts_query_t * query,doc_id_t doc_id,fts_rank_t rank)1463 fts_query_process_doc_id(
1464 /*=====================*/
1465 	fts_query_t*	query,		/*!< in: query instance */
1466 	doc_id_t	doc_id,		/*!< in: doc id to process */
1467 	fts_rank_t	rank)		/*!< in: if non-zero, it is the
1468 					rank associated with the doc_id */
1469 {
1470 	if (query->flags == FTS_OPT_RANKING) {
1471 		return(DB_SUCCESS);
1472 	}
1473 
1474 	switch (query->oper) {
1475 	case FTS_NONE:
1476 		fts_query_union_doc_id(query, doc_id, rank);
1477 		break;
1478 
1479 	case FTS_EXIST:
1480 		fts_query_intersect_doc_id(query, doc_id, rank);
1481 		break;
1482 
1483 	case FTS_IGNORE:
1484 		fts_query_remove_doc_id(query, doc_id);
1485 		break;
1486 
1487 	case FTS_NEGATE:
1488 		fts_query_change_ranking(query, doc_id, TRUE);
1489 		break;
1490 
1491 	case FTS_DECR_RATING:
1492 		fts_query_union_doc_id(query, doc_id, rank);
1493 		fts_query_change_ranking(query, doc_id, TRUE);
1494 		break;
1495 
1496 	case FTS_INCR_RATING:
1497 		fts_query_union_doc_id(query, doc_id, rank);
1498 		fts_query_change_ranking(query, doc_id, FALSE);
1499 		break;
1500 
1501 	default:
1502 		ut_error;
1503 	}
1504 
1505 	if (query->total_size > fts_result_cache_limit) {
1506 		return(DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
1507 	} else {
1508 		return(DB_SUCCESS);
1509 	}
1510 }
1511 
1512 /*****************************************************************//**
1513 Merge two result sets. */
1514 static
1515 dberr_t
fts_merge_doc_ids(fts_query_t * query,const ib_rbt_t * doc_ids)1516 fts_merge_doc_ids(
1517 /*==============*/
1518 	fts_query_t*	query,		/*!< in,out: query instance */
1519 	const ib_rbt_t*	doc_ids)	/*!< in: result set to merge */
1520 {
1521 	const ib_rbt_node_t*	node;
1522 
1523 	DBUG_ENTER("fts_merge_doc_ids");
1524 
1525 	ut_a(!query->intersection);
1526 
1527 	/* To process FTS_EXIST operation (intersection), we need
1528 	to create a new result set for fts_query_intersect(). */
1529 	if (query->oper == FTS_EXIST) {
1530 
1531 		query->intersection = rbt_create(
1532 			sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
1533 
1534 		query->total_size += SIZEOF_RBT_CREATE;
1535 	}
1536 
1537 	/* Merge the elements to the result set. */
1538 	for (node = rbt_first(doc_ids); node; node = rbt_next(doc_ids, node)) {
1539 		fts_ranking_t*		ranking;
1540 		ulint			pos = 0;
1541 		fts_string_t		word;
1542 
1543 		ranking = rbt_value(fts_ranking_t, node);
1544 
1545 		query->error = fts_query_process_doc_id(
1546 				query, ranking->doc_id, ranking->rank);
1547 
1548 		if (query->error != DB_SUCCESS) {
1549 			DBUG_RETURN(query->error);
1550 		}
1551 
1552 		/* Merge words. Don't need to take operator into account. */
1553 		ut_a(ranking->words);
1554 		while (fts_ranking_words_get_next(query, ranking, &pos, &word)) {
1555 			fts_query_add_word_to_document(query, ranking->doc_id,
1556 						       &word);
1557 		}
1558 	}
1559 
1560 	/* If it is an intersection operation, reset query->doc_ids
1561 	to query->intersection and free the old result list. */
1562 	if (query->oper == FTS_EXIST && query->intersection != NULL) {
1563 		fts_query_free_doc_ids(query, query->doc_ids);
1564 		query->doc_ids = query->intersection;
1565 		query->intersection = NULL;
1566 	}
1567 
1568 	DBUG_RETURN(DB_SUCCESS);
1569 }
1570 
1571 /*****************************************************************//**
1572 Skip non-whitespace in a string. Move ptr to the next word boundary.
1573 @return pointer to first whitespace character or end */
1574 UNIV_INLINE
1575 byte*
fts_query_skip_word(byte * ptr,const byte * end)1576 fts_query_skip_word(
1577 /*================*/
1578 	byte*		ptr,		/*!< in: start of scan */
1579 	const byte*	end)		/*!< in: pointer to end of string */
1580 {
1581 	/* TODO: Does this have to be UTF-8 too ? */
1582 	while (ptr < end && !(ispunct(*ptr) || isspace(*ptr))) {
1583 		++ptr;
1584 	}
1585 
1586 	return(ptr);
1587 }
1588 
1589 /*****************************************************************//**
1590 Check whether the remaining terms in the phrase match the text.
1591 @return TRUE if matched else FALSE */
1592 static
1593 ibool
fts_query_match_phrase_terms(fts_phrase_t * phrase,byte ** start,const byte * end,mem_heap_t * heap)1594 fts_query_match_phrase_terms(
1595 /*=========================*/
1596 	fts_phrase_t*	phrase,		/*!< in: phrase to match */
1597 	byte**		start,		/*!< in/out: text to search, we can't
1598 					make this const becase we need to
1599 					first convert the string to
1600 					lowercase */
1601 	const byte*	end,		/*!< in: pointer to the end of
1602 					the string to search */
1603 	mem_heap_t*	heap)		/*!< in: heap */
1604 {
1605 	ulint			i;
1606 	byte*			ptr = *start;
1607 	const ib_vector_t*	tokens = phrase->tokens;
1608 	ulint			distance = phrase->distance;
1609 
1610 	/* We check only from the second term onwards, since the first
1611 	must have matched otherwise we wouldn't be here. */
1612 	for (i = 1; ptr < end && i < ib_vector_size(tokens); /* No op */) {
1613 		fts_string_t		match;
1614 		fts_string_t		cmp_str;
1615 		const fts_string_t*	token;
1616 		int			result;
1617 		ulint			ret;
1618 
1619 		ret = innobase_mysql_fts_get_token(
1620 			phrase->charset, ptr,
1621 			const_cast<byte*>(end), &match);
1622 
1623 		if (match.f_len > 0) {
1624 			/* Get next token to match. */
1625 			token = static_cast<const fts_string_t*>(
1626 				ib_vector_get_const(tokens, i));
1627 
1628 			fts_string_dup(&cmp_str, &match, heap);
1629 
1630 			result = innobase_fts_text_case_cmp(
1631 				phrase->charset, token, &cmp_str);
1632 
1633 			/* Skip the rest of the tokens if this one doesn't
1634 			match and the proximity distance is exceeded. */
1635 			if (result
1636 			    && (distance == ULINT_UNDEFINED
1637 				|| distance == 0)) {
1638 
1639 				break;
1640 			}
1641 
1642 			/* This token matched move to the next token. */
1643 			if (result == 0) {
1644 				/* Advance the text to search by the length
1645 				of the last token. */
1646 				ptr += ret;
1647 
1648 				/* Advance to the next token. */
1649 				++i;
1650 			} else {
1651 
1652 				ut_a(distance != ULINT_UNDEFINED);
1653 
1654 				ptr = fts_query_skip_word(ptr, end);
1655 			}
1656 
1657 			/* Distance can be 0 for exact matches. */
1658 			if (distance != ULINT_UNDEFINED && distance > 0) {
1659 				--distance;
1660 			}
1661 		} else {
1662 			ptr += ret;
1663 		}
1664 	}
1665 
1666 	*start = ptr;
1667 
1668 	/* Can't be greater than the number of elements. */
1669 	ut_a(i <= ib_vector_size(tokens));
1670 
1671 	/* This is the case for multiple words. */
1672 	if (i == ib_vector_size(tokens)) {
1673 		phrase->found = TRUE;
1674 	}
1675 
1676 	return(phrase->found);
1677 }
1678 
1679 /*****************************************************************//**
1680 Callback function to count the number of words in position ranges,
1681 and see whether the word count is in specified "phrase->distance"
1682 @return true if the number of characters is less than the "distance" */
1683 static
1684 bool
fts_proximity_is_word_in_range(const fts_phrase_t * phrase,byte * start,ulint total_len)1685 fts_proximity_is_word_in_range(
1686 /*===========================*/
1687 	const fts_phrase_t*
1688 			phrase,		/*!< in: phrase with the search info */
1689 	byte*		start,		/*!< in: text to search */
1690 	ulint		total_len)	/*!< in: length of text */
1691 {
1692 	fts_proximity_t*	proximity_pos = phrase->proximity_pos;
1693 
1694 	ut_ad(proximity_pos->n_pos == proximity_pos->min_pos.size());
1695 	ut_ad(proximity_pos->n_pos == proximity_pos->max_pos.size());
1696 
1697 	/* Search each matched position pair (with min and max positions)
1698 	and count the number of words in the range */
1699 	for (ulint i = 0; i < proximity_pos->n_pos; i++) {
1700 		ulint		cur_pos = proximity_pos->min_pos[i];
1701 		ulint		n_word = 0;
1702 
1703 		ut_ad(proximity_pos->max_pos[i] <= total_len);
1704 
1705 		/* Walk through words in the range and count them */
1706 		while (cur_pos <= proximity_pos->max_pos[i]) {
1707 			ulint		len;
1708 			fts_string_t	str;
1709 
1710 			len = innobase_mysql_fts_get_token(
1711 				phrase->charset,
1712 				start + cur_pos,
1713 				start + total_len, &str);
1714 
1715 			if (len == 0) {
1716 				break;
1717 			}
1718 
1719 			/* Advances position with "len" bytes */
1720 			cur_pos += len;
1721 
1722 			/* Record the number of words */
1723 			if (str.f_n_char > 0) {
1724 				n_word++;
1725 			}
1726 
1727 			if (n_word > phrase->distance) {
1728 				break;
1729 			}
1730 		}
1731 
1732 		/* Check if the number of words is less than specified
1733 		"distance" */
1734 		if (n_word && n_word <= phrase->distance) {
1735 			return(true);
1736 		}
1737 	}
1738 
1739 	return(false);
1740 }
1741 
1742 /*****************************************************************//**
1743 FTS plugin parser 'myql_add_word' callback function for phrase match
1744 Refer to 'st_mysql_ftparser_param' for more detail.
1745 @return 0 if match, or return non-zero */
1746 static
1747 int
fts_query_match_phrase_add_word_for_parser(MYSQL_FTPARSER_PARAM * param,const char * word,int word_len,MYSQL_FTPARSER_BOOLEAN_INFO *)1748 fts_query_match_phrase_add_word_for_parser(
1749 /*=======================================*/
1750 	MYSQL_FTPARSER_PARAM*	param,		/*!< in: parser param */
1751 	const char*			word,		/*!< in: token */
1752 	int			word_len,	/*!< in: token length */
1753 	MYSQL_FTPARSER_BOOLEAN_INFO*)
1754 {
1755 	fts_phrase_param_t*	phrase_param;
1756 	fts_phrase_t*		phrase;
1757 	const ib_vector_t*	tokens;
1758 	fts_string_t		match;
1759 	fts_string_t		cmp_str;
1760 	const fts_string_t*	token;
1761 	int			result;
1762 	mem_heap_t*		heap;
1763 
1764 	phrase_param = static_cast<fts_phrase_param_t*>(param->mysql_ftparam);
1765 	heap = phrase_param->heap;
1766 	phrase = phrase_param->phrase;
1767 	tokens = phrase->tokens;
1768 
1769 	/* In case plugin parser doesn't check return value */
1770 	if (phrase_param->token_index == ib_vector_size(tokens)) {
1771 		return(1);
1772 	}
1773 
1774 	match.f_str = (uchar *)(word);
1775 	match.f_len = ulint(word_len);
1776 	match.f_n_char= fts_get_token_size(phrase->charset, word, match.f_len);
1777 
1778 	if (match.f_len > 0) {
1779 		/* Get next token to match. */
1780 		ut_a(phrase_param->token_index < ib_vector_size(tokens));
1781 		token = static_cast<const fts_string_t*>(
1782 			ib_vector_get_const(tokens, phrase_param->token_index));
1783 
1784 		fts_string_dup(&cmp_str, &match, heap);
1785 
1786 		result = innobase_fts_text_case_cmp(
1787 			phrase->charset, token, &cmp_str);
1788 
1789 		if (result == 0) {
1790 			phrase_param->token_index++;
1791 		} else {
1792 			return(1);
1793 		}
1794 	}
1795 
1796 	/* Can't be greater than the number of elements. */
1797 	ut_a(phrase_param->token_index <= ib_vector_size(tokens));
1798 
1799 	/* This is the case for multiple words. */
1800 	if (phrase_param->token_index == ib_vector_size(tokens)) {
1801 		phrase->found = TRUE;
1802 	}
1803 
1804 	return(static_cast<int>(phrase->found));
1805 }
1806 
1807 /*****************************************************************//**
1808 Check whether the terms in the phrase match the text.
1809 @return TRUE if matched else FALSE */
1810 static
1811 ibool
fts_query_match_phrase_terms_by_parser(fts_phrase_param_t * phrase_param,st_mysql_ftparser * parser,byte * text,ulint len)1812 fts_query_match_phrase_terms_by_parser(
1813 /*===================================*/
1814 	fts_phrase_param_t*	phrase_param,	/* in/out: phrase param */
1815 	st_mysql_ftparser*	parser,		/* in: plugin fts parser */
1816 	byte*			text,		/* in: text to check */
1817 	ulint			len)		/* in: text length */
1818 {
1819 	MYSQL_FTPARSER_PARAM	param;
1820 
1821 	ut_a(parser);
1822 
1823 	/* Set paramters for param */
1824 	param.mysql_parse = fts_tokenize_document_internal;
1825 	param.mysql_add_word = fts_query_match_phrase_add_word_for_parser;
1826 	param.mysql_ftparam = phrase_param;
1827 	param.cs = phrase_param->phrase->charset;
1828 	param.doc = reinterpret_cast<char*>(text);
1829 	param.length = static_cast<int>(len);
1830 	param.mode= MYSQL_FTPARSER_WITH_STOPWORDS;
1831 
1832 	PARSER_INIT(parser, &param);
1833 	parser->parse(&param);
1834 	PARSER_DEINIT(parser, &param);
1835 
1836 	return(phrase_param->phrase->found);
1837 }
1838 
1839 /*****************************************************************//**
1840 Callback function to fetch and search the document.
1841 @return TRUE if matched else FALSE */
1842 static
1843 ibool
fts_query_match_phrase(fts_phrase_t * phrase,byte * start,ulint cur_len,ulint prev_len,mem_heap_t * heap)1844 fts_query_match_phrase(
1845 /*===================*/
1846 	fts_phrase_t*	phrase,		/*!< in: phrase to match */
1847 	byte*		start,		/*!< in: text to search, we can't make
1848 					this const becase we need to first
1849 					convert the string to lowercase */
1850 	ulint		cur_len,	/*!< in: length of text */
1851 	ulint		prev_len,	/*!< in: total length for searched
1852 					doc fields*/
1853 	mem_heap_t*	heap)		/* heap */
1854 {
1855 	ulint			i;
1856 	const fts_string_t*	first;
1857 	const byte*		end = start + cur_len;
1858 	const ib_vector_t*	tokens = phrase->tokens;
1859 	const ib_vector_t*	positions = phrase->match->positions;
1860 
1861 	ut_a(!phrase->found);
1862 	ut_a(phrase->match->doc_id > 0);
1863 	ut_a(ib_vector_size(tokens) > 0);
1864 	ut_a(ib_vector_size(positions) > 0);
1865 
1866 	first = static_cast<const fts_string_t*>(
1867 		ib_vector_get_const(tokens, 0));
1868 
1869 	ut_a(phrase->match->start < ib_vector_size(positions));
1870 
1871 	for (i = phrase->match->start; i < ib_vector_size(positions); ++i) {
1872 		ulint		pos;
1873 		byte*		ptr = start;
1874 
1875 		pos = *(ulint*) ib_vector_get_const(positions, i);
1876 
1877 		if (pos == ULINT_UNDEFINED) {
1878 			break;
1879 		}
1880 
1881 		if (pos < prev_len) {
1882 			continue;
1883 		}
1884 
1885 		/* Document positions are calculated from the beginning
1886 		of the first field, need to save the length for each
1887 		searched field to adjust the doc position when search
1888 		phrases. */
1889 		pos -= prev_len;
1890 		ptr = start + pos;
1891 
1892 		/* Within limits ? */
1893 		if (ptr >= end) {
1894 			break;
1895 		}
1896 
1897 		if (phrase->parser) {
1898 			fts_phrase_param_t	phrase_param;
1899 
1900 			phrase_param.phrase = phrase;
1901 			phrase_param.token_index = 0;
1902 			phrase_param.heap = heap;
1903 
1904 			if (fts_query_match_phrase_terms_by_parser(
1905 				&phrase_param,
1906 				phrase->parser,
1907 				ptr,
1908 				ulint(end - ptr))) {
1909 				break;
1910 			}
1911 		} else {
1912 			fts_string_t	match;
1913 			fts_string_t	cmp_str;
1914 			ulint		ret;
1915 
1916 			match.f_str = ptr;
1917 			ret = innobase_mysql_fts_get_token(
1918 				phrase->charset, start + pos,
1919 				const_cast<byte*>(end), &match);
1920 
1921 			if (match.f_len == 0) {
1922 				break;
1923 			}
1924 
1925 			fts_string_dup(&cmp_str, &match, heap);
1926 
1927 			if (innobase_fts_text_case_cmp(
1928 				phrase->charset, first, &cmp_str) == 0) {
1929 
1930 				/* This is the case for the single word
1931 				in the phrase. */
1932 				if (ib_vector_size(phrase->tokens) == 1) {
1933 					phrase->found = TRUE;
1934 					break;
1935 				}
1936 
1937 				ptr += ret;
1938 
1939 				/* Match the remaining terms in the phrase. */
1940 				if (fts_query_match_phrase_terms(phrase, &ptr,
1941 								 end, heap)) {
1942 					break;
1943 				}
1944 			}
1945 		}
1946 	}
1947 
1948 	return(phrase->found);
1949 }
1950 
1951 /*****************************************************************//**
1952 Callback function to fetch and search the document.
1953 @return whether the phrase is found */
1954 static
1955 ibool
fts_query_fetch_document(void * row,void * user_arg)1956 fts_query_fetch_document(
1957 /*=====================*/
1958 	void*		row,		/*!< in:  sel_node_t* */
1959 	void*		user_arg)	/*!< in:  fts_doc_t* */
1960 {
1961 
1962 	que_node_t*	exp;
1963 	sel_node_t*	node = static_cast<sel_node_t*>(row);
1964 	fts_phrase_t*	phrase = static_cast<fts_phrase_t*>(user_arg);
1965 	ulint		prev_len = 0;
1966 	ulint		total_len = 0;
1967 	byte*		document_text = NULL;
1968 
1969 	exp = node->select_list;
1970 
1971 	phrase->found = FALSE;
1972 
1973 	/* For proximity search, we will need to get the whole document
1974 	from all fields, so first count the total length of the document
1975 	from all the fields */
1976 	if (phrase->proximity_pos) {
1977 		 while (exp) {
1978 			ulint		field_len;
1979 			dfield_t*	dfield = que_node_get_val(exp);
1980 			byte*		data = static_cast<byte*>(
1981 						dfield_get_data(dfield));
1982 
1983 			if (dfield_is_ext(dfield)) {
1984 				ulint	local_len = dfield_get_len(dfield);
1985 
1986 				local_len -= BTR_EXTERN_FIELD_REF_SIZE;
1987 
1988 				field_len = mach_read_from_4(
1989 					data + local_len + BTR_EXTERN_LEN + 4);
1990 			} else {
1991 				field_len = dfield_get_len(dfield);
1992 			}
1993 
1994 			if (field_len != UNIV_SQL_NULL) {
1995 				total_len += field_len + 1;
1996 			}
1997 
1998 			exp = que_node_get_next(exp);
1999 		}
2000 
2001 		document_text = static_cast<byte*>(mem_heap_zalloc(
2002 					phrase->heap, total_len));
2003 
2004 		if (!document_text) {
2005 			return(FALSE);
2006 		}
2007 	}
2008 
2009 	exp = node->select_list;
2010 
2011 	while (exp) {
2012 		dfield_t*	dfield = que_node_get_val(exp);
2013 		byte*		data = static_cast<byte*>(
2014 					dfield_get_data(dfield));
2015 		ulint		cur_len;
2016 
2017 		if (dfield_is_ext(dfield)) {
2018 			data = btr_copy_externally_stored_field(
2019 				&cur_len, data, phrase->zip_size,
2020 				dfield_get_len(dfield), phrase->heap);
2021 		} else {
2022 			cur_len = dfield_get_len(dfield);
2023 		}
2024 
2025 		if (cur_len != UNIV_SQL_NULL && cur_len != 0) {
2026 			if (phrase->proximity_pos) {
2027 				ut_ad(prev_len + cur_len <= total_len);
2028 				memcpy(document_text + prev_len, data, cur_len);
2029 			} else {
2030 				/* For phrase search */
2031 				phrase->found =
2032 					fts_query_match_phrase(
2033 						phrase,
2034 						static_cast<byte*>(data),
2035 						cur_len, prev_len,
2036 						phrase->heap);
2037 			}
2038 
2039 			/* Document positions are calculated from the beginning
2040 			of the first field, need to save the length for each
2041 			searched field to adjust the doc position when search
2042 			phrases. */
2043 			prev_len += cur_len + 1;
2044 		}
2045 
2046 		if (phrase->found) {
2047 			break;
2048 		}
2049 
2050 		exp = que_node_get_next(exp);
2051 	}
2052 
2053 	if (phrase->proximity_pos) {
2054 		ut_ad(prev_len <= total_len);
2055 
2056 		phrase->found = fts_proximity_is_word_in_range(
2057 			phrase, document_text, total_len);
2058 	}
2059 
2060 	return(phrase->found);
2061 }
2062 
2063 #if 0
2064 /********************************************************************
2065 Callback function to check whether a record was found or not. */
2066 static
2067 ibool
2068 fts_query_select(
2069 /*=============*/
2070 	void*		row,		/*!< in:  sel_node_t* */
2071 	void*		user_arg)	/*!< in:  fts_doc_t* */
2072 {
2073 	int		i;
2074 	que_node_t*	exp;
2075 	sel_node_t*	node = row;
2076 	fts_select_t*	select = user_arg;
2077 
2078 	ut_a(select->word_freq);
2079 	ut_a(select->word_freq->doc_freqs);
2080 
2081 	exp = node->select_list;
2082 
2083 	for (i = 0; exp && !select->found; ++i) {
2084 		dfield_t*	dfield = que_node_get_val(exp);
2085 		void*		data = dfield_get_data(dfield);
2086 		ulint		len = dfield_get_len(dfield);
2087 
2088 		switch (i) {
2089 		case 0: /* DOC_COUNT */
2090 			if (len != UNIV_SQL_NULL && len != 0) {
2091 
2092 				select->word_freq->doc_count +=
2093 					mach_read_from_4(data);
2094 			}
2095 			break;
2096 
2097 		case 1: /* ILIST */
2098 			if (len != UNIV_SQL_NULL && len != 0) {
2099 
2100 				fts_query_find_doc_id(select, data, len);
2101 			}
2102 			break;
2103 
2104 		default:
2105 			ut_error;
2106 		}
2107 
2108 		exp = que_node_get_next(exp);
2109 	}
2110 
2111 	return(FALSE);
2112 }
2113 
2114 /********************************************************************
2115 Read the rows from the FTS index, that match word and where the
2116 doc id is between first and last doc id.
2117 @return DB_SUCCESS if all go well else error code */
2118 static MY_ATTRIBUTE((nonnull, warn_unused_result))
2119 dberr_t
2120 fts_query_find_term(
2121 /*================*/
2122 	fts_query_t*		query,	/*!< in: FTS query state */
2123 	que_t**			graph,	/*!< in: prepared statement */
2124 	const fts_string_t*	word,	/*!< in: the word to fetch */
2125 	doc_id_t		doc_id,	/*!< in: doc id to match */
2126 	ulint*			min_pos,/*!< in/out: pos found must be
2127 					 greater than this minimum value. */
2128 	ibool*			found)	/*!< out: TRUE if found else FALSE */
2129 {
2130 	pars_info_t*		info;
2131 	dberr_t			error;
2132 	fts_select_t		select;
2133 	doc_id_t		match_doc_id;
2134 	trx_t*			trx = query->trx;
2135 	char			table_name[MAX_FULL_NAME_LEN];
2136 
2137 	trx->op_info = "fetching FTS index matching nodes";
2138 
2139 	if (*graph) {
2140 		info = (*graph)->info;
2141 	} else {
2142 		ulint	selected;
2143 
2144 		info = pars_info_create();
2145 
2146 		selected = fts_select_index(*word->f_str);
2147 		query->fts_index_table.suffix = fts_get_suffix(selected);
2148 
2149 		fts_get_table_name(&query->fts_index_table, table_name);
2150 		pars_info_bind_id(info, "index_table_name", table_name);
2151 	}
2152 
2153 	select.found = FALSE;
2154 	select.doc_id = doc_id;
2155 	select.min_pos = *min_pos;
2156 	select.word_freq = fts_query_add_word_freq(query, word->f_str);
2157 
2158 	pars_info_bind_function(info, "my_func", fts_query_select, &select);
2159 	pars_info_bind_varchar_literal(info, "word", word->f_str, word->f_len);
2160 
2161 	/* Convert to "storage" byte order. */
2162 	fts_write_doc_id((byte*) &match_doc_id, doc_id);
2163 
2164 	fts_bind_doc_id(info, "min_doc_id", &match_doc_id);
2165 
2166 	fts_bind_doc_id(info, "max_doc_id", &match_doc_id);
2167 
2168 	if (!*graph) {
2169 
2170 		*graph = fts_parse_sql(
2171 			&query->fts_index_table,
2172 			info,
2173 			"DECLARE FUNCTION my_func;\n"
2174 			"DECLARE CURSOR c IS"
2175 			" SELECT doc_count, ilist\n"
2176 			" FROM $index_table_name\n"
2177 			" WHERE word LIKE :word AND"
2178 			" first_doc_id <= :min_doc_id AND"
2179 			" last_doc_id >= :max_doc_id\n"
2180 			" ORDER BY first_doc_id;\n"
2181 			"BEGIN\n"
2182 			"\n"
2183 			"OPEN c;\n"
2184 			"WHILE 1 = 1 LOOP\n"
2185 			"  FETCH c INTO my_func();\n"
2186 			"  IF c % NOTFOUND THEN\n"
2187 			"    EXIT;\n"
2188 			"  END IF;\n"
2189 			"END LOOP;\n"
2190 			"CLOSE c;");
2191 	}
2192 
2193 	for (;;) {
2194 		error = fts_eval_sql(trx, *graph);
2195 
2196 		if (error == DB_SUCCESS) {
2197 
2198 			break;				/* Exit the loop. */
2199 		} else {
2200 
2201 			if (error == DB_LOCK_WAIT_TIMEOUT) {
2202 				ib::warn() << "lock wait timeout reading FTS"
2203 					" index. Retrying!";
2204 
2205 				trx->error_state = DB_SUCCESS;
2206 			} else {
2207 				ib::error() << error
2208 					<< " while reading FTS index.";
2209 
2210 				break;			/* Exit the loop. */
2211 			}
2212 		}
2213 	}
2214 
2215 	/* Value to return */
2216 	*found = select.found;
2217 
2218 	if (*found) {
2219 		*min_pos = select.min_pos;
2220 	}
2221 
2222 	return(error);
2223 }
2224 
2225 /********************************************************************
2226 Callback aggregator for int columns. */
2227 static
2228 ibool
2229 fts_query_sum(
2230 /*==========*/
2231 					/*!< out: always returns TRUE */
2232 	void*		row,		/*!< in:  sel_node_t* */
2233 	void*		user_arg)	/*!< in:  ulint* */
2234 {
2235 
2236 	que_node_t*	exp;
2237 	sel_node_t*	node = row;
2238 	ulint*		total = user_arg;
2239 
2240 	exp = node->select_list;
2241 
2242 	while (exp) {
2243 		dfield_t*	dfield = que_node_get_val(exp);
2244 		void*		data = dfield_get_data(dfield);
2245 		ulint		len = dfield_get_len(dfield);
2246 
2247 		if (len != UNIV_SQL_NULL && len != 0) {
2248 			*total += mach_read_from_4(data);
2249 		}
2250 
2251 		exp = que_node_get_next(exp);
2252 	}
2253 
2254 	return(TRUE);
2255 }
2256 
2257 /********************************************************************
2258 Calculate the total documents that contain a particular word (term).
2259 @return DB_SUCCESS if all go well else error code */
2260 static MY_ATTRIBUTE((nonnull, warn_unused_result))
2261 dberr_t
2262 fts_query_total_docs_containing_term(
2263 /*=================================*/
2264 	fts_query_t*		query,	/*!< in: FTS query state */
2265 	const fts_string_t*	word,	/*!< in: the word to check */
2266 	ulint*			total)	/*!< out: documents containing word */
2267 {
2268 	pars_info_t*		info;
2269 	dberr_t			error;
2270 	que_t*			graph;
2271 	ulint			selected;
2272 	trx_t*			trx = query->trx;
2273 	char			table_name[MAX_FULL_NAME_LEN]
2274 
2275 	trx->op_info = "fetching FTS index document count";
2276 
2277 	*total = 0;
2278 
2279 	info = pars_info_create();
2280 
2281 	pars_info_bind_function(info, "my_func", fts_query_sum, total);
2282 	pars_info_bind_varchar_literal(info, "word", word->f_str, word->f_len);
2283 
2284 	selected = fts_select_index(*word->f_str);
2285 
2286 	query->fts_index_table.suffix = fts_get_suffix(selected);
2287 
2288 	fts_get_table_name(&query->fts_index_table, table_name);
2289 
2290 	pars_info_bind_id(info, "index_table_name", table_name);
2291 
2292 	graph = fts_parse_sql(
2293 		&query->fts_index_table,
2294 		info,
2295 		"DECLARE FUNCTION my_func;\n"
2296 		"DECLARE CURSOR c IS"
2297 		" SELECT doc_count\n"
2298 		" FROM $index_table_name\n"
2299 		" WHERE word = :word"
2300 		" ORDER BY first_doc_id;\n"
2301 		"BEGIN\n"
2302 		"\n"
2303 		"OPEN c;\n"
2304 		"WHILE 1 = 1 LOOP\n"
2305 		"  FETCH c INTO my_func();\n"
2306 		"  IF c % NOTFOUND THEN\n"
2307 		"    EXIT;\n"
2308 		"  END IF;\n"
2309 		"END LOOP;\n"
2310 		"CLOSE c;");
2311 
2312 	for (;;) {
2313 		error = fts_eval_sql(trx, graph);
2314 
2315 		if (error == DB_SUCCESS) {
2316 
2317 			break;				/* Exit the loop. */
2318 		} else {
2319 
2320 			if (error == DB_LOCK_WAIT_TIMEOUT) {
2321 				ib::warn() << "lock wait timeout reading FTS"
2322 					" index. Retrying!";
2323 
2324 				trx->error_state = DB_SUCCESS;
2325 			} else {
2326 				ib::error() << error
2327 					<< " while reading FTS index.";
2328 
2329 				break;			/* Exit the loop. */
2330 			}
2331 		}
2332 	}
2333 
2334 	fts_que_graph_free(graph);
2335 
2336 	return(error);
2337 }
2338 
2339 /********************************************************************
2340 Get the total number of words in a documents.
2341 @return DB_SUCCESS if all go well else error code */
2342 static MY_ATTRIBUTE((nonnull, warn_unused_result))
2343 dberr_t
2344 fts_query_terms_in_document(
2345 /*========================*/
2346 	fts_query_t*	query,		/*!< in: FTS query state */
2347 	doc_id_t	doc_id,		/*!< in: the word to check */
2348 	ulint*		total)		/*!< out: total words in document */
2349 {
2350 	pars_info_t*	info;
2351 	dberr_t		error;
2352 	que_t*		graph;
2353 	doc_id_t	read_doc_id;
2354 	trx_t*		trx = query->trx;
2355 	char		table_name[MAX_FULL_NAME_LEN];
2356 
2357 	trx->op_info = "fetching FTS document term count";
2358 
2359 	*total = 0;
2360 
2361 	info = pars_info_create();
2362 
2363 	pars_info_bind_function(info, "my_func", fts_query_sum, total);
2364 
2365 	/* Convert to "storage" byte order. */
2366 	fts_write_doc_id((byte*) &read_doc_id, doc_id);
2367 	fts_bind_doc_id(info, "doc_id", &read_doc_id);
2368 
2369 	query->fts_index_table.suffix = "DOC_ID";
2370 
2371 	fts_get_table_name(&query->fts_index_table, table_name);
2372 
2373 	pars_info_bind_id(info, "index_table_name", table_name);
2374 
2375 	graph = fts_parse_sql(
2376 		&query->fts_index_table,
2377 		info,
2378 		"DECLARE FUNCTION my_func;\n"
2379 		"DECLARE CURSOR c IS"
2380 		" SELECT count\n"
2381 		" FROM $index_table_name\n"
2382 		" WHERE doc_id = :doc_id"
2383 		" BEGIN\n"
2384 		"\n"
2385 		"OPEN c;\n"
2386 		"WHILE 1 = 1 LOOP\n"
2387 		"  FETCH c INTO my_func();\n"
2388 		"  IF c % NOTFOUND THEN\n"
2389 		"    EXIT;\n"
2390 		"  END IF;\n"
2391 		"END LOOP;\n"
2392 		"CLOSE c;");
2393 
2394 	for (;;) {
2395 		error = fts_eval_sql(trx, graph);
2396 
2397 		if (error == DB_SUCCESS) {
2398 
2399 			break;				/* Exit the loop. */
2400 		} else {
2401 
2402 			if (error == DB_LOCK_WAIT_TIMEOUT) {
2403 				ib::warn() << "lock wait timeout reading FTS"
2404 					" doc id table. Retrying!";
2405 
2406 				trx->error_state = DB_SUCCESS;
2407 			} else {
2408 				ib::error() << error << " while reading FTS"
2409 					" doc id table.";
2410 
2411 				break;			/* Exit the loop. */
2412 			}
2413 		}
2414 	}
2415 
2416 	fts_que_graph_free(graph);
2417 
2418 	return(error);
2419 }
2420 #endif
2421 
2422 /*****************************************************************//**
2423 Retrieve the document and match the phrase tokens.
2424 @return DB_SUCCESS or error code */
2425 MY_ATTRIBUTE((nonnull(1,2,3,6), warn_unused_result))
2426 static
2427 dberr_t
fts_query_match_document(ib_vector_t * tokens,fts_get_doc_t * get_doc,fts_match_t * match,ulint distance,st_mysql_ftparser * parser,ibool * found)2428 fts_query_match_document(
2429 /*=====================*/
2430 	ib_vector_t*	tokens,		/*!< in: phrase tokens */
2431 	fts_get_doc_t*	get_doc,	/*!< in: table and prepared statements */
2432 	fts_match_t*	match,		/*!< in: doc id and positions */
2433 	ulint		distance,	/*!< in: proximity distance */
2434 	st_mysql_ftparser* parser,	/*!< in: fts plugin parser */
2435 	ibool*		found)		/*!< out: TRUE if phrase found */
2436 {
2437 	dberr_t		error;
2438 	fts_phrase_t	phrase(get_doc->index_cache->index->table);
2439 
2440 	phrase.match = match;		/* Positions to match */
2441 	phrase.tokens = tokens;		/* Tokens to match */
2442 	phrase.distance = distance;
2443 	phrase.charset = get_doc->index_cache->charset;
2444 	phrase.heap = mem_heap_create(512);
2445 	phrase.parser = parser;
2446 
2447 	*found = phrase.found = FALSE;
2448 
2449 	error = fts_doc_fetch_by_doc_id(
2450 		get_doc, match->doc_id, NULL, FTS_FETCH_DOC_BY_ID_EQUAL,
2451 		fts_query_fetch_document, &phrase);
2452 
2453 	if (UNIV_UNLIKELY(error != DB_SUCCESS)) {
2454 		ib::error() << "(" << error << ") matching document.";
2455 	} else {
2456 		*found = phrase.found;
2457 	}
2458 
2459 	mem_heap_free(phrase.heap);
2460 
2461 	return(error);
2462 }
2463 
2464 /*****************************************************************//**
2465 This function fetches the original documents and count the
2466 words in between matching words to see that is in specified distance
2467 @return DB_SUCCESS if all OK */
2468 static MY_ATTRIBUTE((nonnull, warn_unused_result))
2469 bool
fts_query_is_in_proximity_range(const fts_query_t * query,fts_match_t ** match,fts_proximity_t * qualified_pos)2470 fts_query_is_in_proximity_range(
2471 /*============================*/
2472 	const fts_query_t*	query,		/*!< in:  query instance */
2473 	fts_match_t**		match,		/*!< in: query instance */
2474 	fts_proximity_t*	qualified_pos)	/*!< in: position info for
2475 						qualified ranges */
2476 {
2477 	fts_get_doc_t	get_doc;
2478 	fts_cache_t*	cache = query->index->table->fts->cache;
2479 	dberr_t		err;
2480 
2481 	memset(&get_doc, 0x0, sizeof(get_doc));
2482 
2483 	rw_lock_x_lock(&cache->lock);
2484 	get_doc.index_cache = fts_find_index_cache(cache, query->index);
2485 	rw_lock_x_unlock(&cache->lock);
2486 	ut_a(get_doc.index_cache != NULL);
2487 
2488 	fts_phrase_t	phrase(get_doc.index_cache->index->table);
2489 
2490 	phrase.distance = query->distance;
2491 	phrase.charset = get_doc.index_cache->charset;
2492 	phrase.heap = mem_heap_create(512);
2493 	phrase.proximity_pos = qualified_pos;
2494 	phrase.found = FALSE;
2495 
2496 	err = fts_doc_fetch_by_doc_id(
2497 		&get_doc, match[0]->doc_id, NULL, FTS_FETCH_DOC_BY_ID_EQUAL,
2498 		fts_query_fetch_document, &phrase);
2499 
2500 	if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
2501 		ib::error() << "(" << err << ") in verification"
2502 			" phase of proximity search";
2503 	}
2504 
2505 	/* Free the prepared statement. */
2506 	if (get_doc.get_document_graph) {
2507 		fts_que_graph_free(get_doc.get_document_graph);
2508 		get_doc.get_document_graph = NULL;
2509 	}
2510 
2511 	mem_heap_free(phrase.heap);
2512 
2513 	return(err == DB_SUCCESS && phrase.found);
2514 }
2515 
2516 /*****************************************************************//**
2517 Iterate over the matched document ids and search the for the
2518 actual phrase in the text.
2519 @return DB_SUCCESS if all OK */
2520 static MY_ATTRIBUTE((nonnull, warn_unused_result))
2521 dberr_t
fts_query_search_phrase(fts_query_t * query,ib_vector_t * orig_tokens,ib_vector_t * tokens)2522 fts_query_search_phrase(
2523 /*====================*/
2524 	fts_query_t*		query,		/*!< in: query instance */
2525 	ib_vector_t*		orig_tokens,	/*!< in: tokens to search,
2526 						with any stopwords in the
2527 						original phrase */
2528 	ib_vector_t*		tokens)		/*!< in: tokens that does
2529 						not include stopwords and
2530 						can be used to calculate
2531 						ranking */
2532 {
2533 	ulint			i;
2534 	fts_get_doc_t		get_doc;
2535 	ulint			n_matched;
2536 	fts_cache_t*		cache = query->index->table->fts->cache;
2537 
2538 	n_matched = ib_vector_size(query->matched);
2539 
2540 	/* Setup the doc retrieval infrastructure. */
2541 	memset(&get_doc, 0x0, sizeof(get_doc));
2542 
2543 	rw_lock_x_lock(&cache->lock);
2544 
2545 	get_doc.index_cache = fts_find_index_cache(cache, query->index);
2546 
2547 	/* Must find the index cache */
2548 	ut_a(get_doc.index_cache != NULL);
2549 
2550 	rw_lock_x_unlock(&cache->lock);
2551 
2552 #ifdef FTS_INTERNAL_DIAG_PRINT
2553 	ib::info() << "Start phrase search";
2554 #endif
2555 
2556 	/* Read the document from disk and do the actual
2557 	match, matching documents will be added to the current
2558 	doc id set. */
2559 	for (i = 0; i < n_matched && query->error == DB_SUCCESS; ++i) {
2560 		fts_match_t*	match;
2561 		ibool		found = FALSE;
2562 
2563 		match = static_cast<fts_match_t*>(
2564 			ib_vector_get(query->matched, i));
2565 
2566 		/* Skip the document ids that were filtered out by
2567 		an earlier pass. */
2568 		if (match->doc_id != 0) {
2569 
2570 			query->error = fts_query_match_document(
2571 				orig_tokens, &get_doc, match,
2572 				query->distance, query->parser, &found);
2573 
2574 			if (query->error == DB_SUCCESS && found) {
2575 				ulint	z;
2576 
2577 				query->error = fts_query_process_doc_id(query,
2578 							 match->doc_id, 0);
2579 				if (query->error != DB_SUCCESS) {
2580 					goto func_exit;
2581 				}
2582 
2583 				for (z = 0; z < ib_vector_size(tokens); z++) {
2584 					fts_string_t*   token;
2585 					token = static_cast<fts_string_t*>(
2586 						ib_vector_get(tokens, z));
2587 					fts_query_add_word_to_document(
2588 						query, match->doc_id, token);
2589 				}
2590 			}
2591 		}
2592 	}
2593 
2594 func_exit:
2595 	/* Free the prepared statement. */
2596 	if (get_doc.get_document_graph) {
2597 		fts_que_graph_free(get_doc.get_document_graph);
2598 		get_doc.get_document_graph = NULL;
2599 	}
2600 
2601 	return(query->error);
2602 }
2603 
2604 /** Split the phrase into tokens
2605 @param[in,out]	query		query instance
2606 @param[in]	node		query node to search
2607 @param[in,out]	tokens		token vector
2608 @param[in,out]	orig_tokens	original node tokens include stopword
2609 @param[in,out]	heap	mem heap */
2610 static
2611 void
fts_query_phrase_split(fts_query_t * query,const fts_ast_node_t * node,ib_vector_t * tokens,ib_vector_t * orig_tokens,mem_heap_t * heap)2612 fts_query_phrase_split(
2613 	fts_query_t*		query,
2614 	const fts_ast_node_t*	node,
2615 	ib_vector_t*		tokens,
2616 	ib_vector_t*		orig_tokens,
2617 	mem_heap_t*		heap)
2618 {
2619 	fts_string_t		phrase;
2620 	ulint			len = 0;
2621 	ulint			cur_pos = 0;
2622 	fts_ast_node_t*		term_node = NULL;
2623 
2624 	if (node->type == FTS_AST_TEXT) {
2625 		phrase.f_str = node->text.ptr->str;
2626 		phrase.f_len = node->text.ptr->len;
2627 		len = phrase.f_len;
2628 	} else {
2629 		ut_ad(node->type == FTS_AST_PARSER_PHRASE_LIST);
2630 		phrase.f_str = NULL;
2631 		phrase.f_len = 0;
2632 		term_node = node->list.head;
2633 	}
2634 
2635 	while (true) {
2636 		fts_cache_t*	cache = query->index->table->fts->cache;
2637 		ulint		cur_len;
2638 		fts_string_t	result_str;
2639 
2640 		if (node->type == FTS_AST_TEXT) {
2641 			if (cur_pos >= len) {
2642 				break;
2643 			}
2644 
2645 			cur_len = innobase_mysql_fts_get_token(
2646 				query->fts_index_table.charset,
2647 				reinterpret_cast<const byte*>(phrase.f_str)
2648 				+ cur_pos,
2649 				reinterpret_cast<const byte*>(phrase.f_str)
2650 				+ len,
2651 				&result_str);
2652 
2653 			if (cur_len == 0) {
2654 				break;
2655 			}
2656 
2657 			cur_pos += cur_len;
2658 		} else {
2659 			ut_ad(node->type == FTS_AST_PARSER_PHRASE_LIST);
2660 			/* Term node in parser phrase list */
2661 			if (term_node == NULL) {
2662 				break;
2663 			}
2664 
2665 			ut_a(term_node->type == FTS_AST_TERM);
2666 			result_str.f_str = term_node->term.ptr->str;
2667 			result_str.f_len = term_node->term.ptr->len;
2668 			result_str.f_n_char = fts_get_token_size(
2669 				query->fts_index_table.charset,
2670 				reinterpret_cast<char*>(result_str.f_str),
2671 				result_str.f_len);
2672 
2673 			term_node = term_node->next;
2674 		}
2675 
2676 		if (result_str.f_n_char == 0) {
2677 			continue;
2678 		}
2679 
2680 		fts_string_t*	token = static_cast<fts_string_t*>(
2681 			ib_vector_push(tokens, NULL));
2682 		fts_string_dup(token, &result_str, heap);
2683 
2684 		if (fts_check_token(
2685 			   &result_str,
2686 			   cache->stopword_info.cached_stopword,
2687 			   query->fts_index_table.charset)) {
2688 			/* Add the word to the RB tree so that we can
2689 			calculate it's frequencey within a document. */
2690 			fts_query_add_word_freq(query, token);
2691 		} else {
2692 			ib_vector_pop(tokens);
2693 		}
2694 
2695 		/* we will start to store all words including stopwords
2696 		in the "orig_tokens" vector, but skip any leading words
2697 		that are stopwords */
2698 		if (!ib_vector_is_empty(tokens)) {
2699 			fts_string_t*	orig_token = static_cast<fts_string_t*>(
2700 				ib_vector_push(orig_tokens, NULL));
2701 
2702 			orig_token->f_str = token->f_str;
2703 			orig_token->f_len = token->f_len;
2704 		}
2705 	}
2706 }
2707 
2708 /*****************************************************************//**
2709 Text/Phrase search.
2710 @return DB_SUCCESS or error code */
2711 static MY_ATTRIBUTE((warn_unused_result))
2712 dberr_t
fts_query_phrase_search(fts_query_t * query,const fts_ast_node_t * node)2713 fts_query_phrase_search(
2714 /*====================*/
2715 	fts_query_t*		query,	/*!< in: query instance */
2716 	const fts_ast_node_t*	node)	/*!< in: node to search */
2717 {
2718 	ib_vector_t*		tokens;
2719 	ib_vector_t*		orig_tokens;
2720 	mem_heap_t*		heap = mem_heap_create(sizeof(fts_string_t));
2721 	ib_alloc_t*		heap_alloc;
2722 	ulint			num_token;
2723 
2724 	heap_alloc = ib_heap_allocator_create(heap);
2725 
2726 	tokens = ib_vector_create(heap_alloc, sizeof(fts_string_t), 4);
2727 	orig_tokens = ib_vector_create(heap_alloc, sizeof(fts_string_t), 4);
2728 
2729 	if (query->distance != ULINT_UNDEFINED && query->distance > 0) {
2730 		query->flags = FTS_PROXIMITY;
2731 	} else {
2732 		query->flags = FTS_PHRASE;
2733 	}
2734 
2735 	/* Split the phrase into tokens. */
2736 	fts_query_phrase_split(query, node, tokens, orig_tokens, heap);
2737 
2738 	num_token = ib_vector_size(tokens);
2739 	if (num_token > MAX_PROXIMITY_ITEM) {
2740 		query->error = DB_FTS_TOO_MANY_WORDS_IN_PHRASE;
2741 		goto func_exit;
2742 	}
2743 
2744 	ut_ad(ib_vector_size(orig_tokens) >= num_token);
2745 
2746 	/* Ignore empty strings. */
2747 	if (num_token > 0) {
2748 		fts_string_t*	token = NULL;
2749 		fts_fetch_t	fetch;
2750 		trx_t*		trx = query->trx;
2751 		fts_ast_oper_t	oper = query->oper;
2752 		que_t*		graph = NULL;
2753 		ulint		i;
2754 		dberr_t		error;
2755 
2756 		/* Create the vector for storing matching document ids
2757 		and the positions of the first token of the phrase. */
2758 		if (!query->matched) {
2759 			ib_alloc_t*	heap_alloc;
2760 
2761 			heap_alloc = ib_heap_allocator_create(heap);
2762 
2763 			if (!(query->flags & FTS_PROXIMITY)
2764 			    && !(query->flags & FTS_PHRASE)) {
2765 				query->matched = ib_vector_create(
2766 					heap_alloc, sizeof(fts_match_t),
2767 					64);
2768 			} else {
2769 				ut_a(num_token <= MAX_PROXIMITY_ITEM);
2770 				query->match_array =
2771 					(ib_vector_t**) mem_heap_alloc(
2772 						heap,
2773 						num_token *
2774 						sizeof(query->matched));
2775 
2776 				for (i = 0; i < num_token; i++) {
2777 					query->match_array[i] =
2778 					ib_vector_create(
2779 						heap_alloc, sizeof(fts_match_t),
2780 						64);
2781 				}
2782 
2783 				query->matched = query->match_array[0];
2784 			}
2785 		}
2786 
2787 		/* Setup the callback args for filtering and consolidating
2788 		the ilist. */
2789 		fetch.read_arg = query;
2790 		fetch.read_record = fts_query_index_fetch_nodes;
2791 
2792 		for (i = 0; i < num_token; i++) {
2793 			/* Search for the first word from the phrase. */
2794 			token = static_cast<fts_string_t*>(
2795 				ib_vector_get(tokens, i));
2796 
2797 			if (query->flags & FTS_PROXIMITY
2798 			    || query->flags & FTS_PHRASE) {
2799 				query->matched = query->match_array[i];
2800 			}
2801 
2802 			error = fts_index_fetch_nodes(
2803 				trx, &graph, &query->fts_index_table,
2804 				token, &fetch);
2805 
2806 			/* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */
2807 			ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS));
2808 			if (error != DB_SUCCESS) {
2809 				query->error = error;
2810 			}
2811 
2812 			fts_que_graph_free(graph);
2813 			graph = NULL;
2814 
2815 			fts_query_cache(query, token);
2816 
2817 			if (!(query->flags & FTS_PHRASE)
2818 			    && !(query->flags & FTS_PROXIMITY)) {
2819 				break;
2820 			}
2821 
2822 			/* If any of the token can't be found,
2823 			no need to continue match */
2824 			if (ib_vector_is_empty(query->match_array[i])
2825 			    || query->error != DB_SUCCESS) {
2826 				goto func_exit;
2827 			}
2828 		}
2829 
2830 		/* Just a single word, no need to fetch the original
2831 		documents to do phrase matching */
2832 		if (ib_vector_size(orig_tokens) == 1
2833 		    && !ib_vector_is_empty(query->match_array[0])) {
2834 			fts_match_t*    match;
2835 			ulint		n_matched;
2836 
2837 			n_matched = ib_vector_size(query->match_array[0]);
2838 
2839 			for (i = 0; i < n_matched; i++) {
2840 				match = static_cast<fts_match_t*>(
2841 					ib_vector_get(
2842 						query->match_array[0], i));
2843 
2844 				query->error = fts_query_process_doc_id(
2845 						query, match->doc_id, 0);
2846 				if (query->error != DB_SUCCESS) {
2847 					goto func_exit;
2848 				}
2849 
2850 				fts_query_add_word_to_document(
2851 					query, match->doc_id, token);
2852 			}
2853 			query->oper = oper;
2854 			goto func_exit;
2855 		}
2856 
2857 		/* If we are doing proximity search, verify the distance
2858 		between all words, and check they are in specified distance. */
2859 		if (query->flags & FTS_PROXIMITY) {
2860 			fts_phrase_or_proximity_search(query, tokens);
2861 		} else {
2862 			ibool	matched;
2863 
2864 			/* Phrase Search case:
2865 			We filter out the doc ids that don't contain
2866 			all the tokens in the phrase. It's cheaper to
2867 			search the ilist than bringing the documents in
2868 			and then doing a search through the text. Isolated
2869 			testing shows this also helps in mitigating disruption
2870 			of the buffer cache. */
2871 			matched = fts_phrase_or_proximity_search(query, tokens);
2872 			query->matched = query->match_array[0];
2873 
2874 			/* Read the actual text in and search for the phrase. */
2875 			if (matched) {
2876 				ut_ad(query->error == DB_SUCCESS);
2877 				query->error = fts_query_search_phrase(
2878 					query, orig_tokens, tokens);
2879 			}
2880 		}
2881 
2882 		/* Restore original operation. */
2883 		query->oper = oper;
2884 
2885 		if (query->error != DB_SUCCESS) {
2886 			goto func_exit;
2887 		}
2888 	}
2889 
2890 func_exit:
2891 	mem_heap_free(heap);
2892 
2893 	/* Don't need it anymore. */
2894 	query->matched = NULL;
2895 
2896 	return(query->error);
2897 }
2898 
2899 /*****************************************************************//**
2900 Find the word and evaluate.
2901 @return DB_SUCCESS if all go well */
2902 static MY_ATTRIBUTE((nonnull, warn_unused_result))
2903 dberr_t
fts_query_execute(fts_query_t * query,fts_string_t * token)2904 fts_query_execute(
2905 /*==============*/
2906 	fts_query_t*		query,	/*!< in: query instance */
2907 	fts_string_t*		token)	/*!< in: token to search */
2908 {
2909 	switch (query->oper) {
2910 	case FTS_NONE:
2911 	case FTS_NEGATE:
2912 	case FTS_INCR_RATING:
2913 	case FTS_DECR_RATING:
2914 		query->error = fts_query_union(query, token);
2915 		break;
2916 
2917 	case FTS_EXIST:
2918 		query->error = fts_query_intersect(query, token);
2919 		break;
2920 
2921 	case FTS_IGNORE:
2922 		query->error = fts_query_difference(query, token);
2923 		break;
2924 
2925 	default:
2926 		ut_error;
2927 	}
2928 
2929 	return(query->error);
2930 }
2931 
2932 /*****************************************************************//**
2933 Create a wildcard string. It's the responsibility of the caller to
2934 free the byte* pointer. It's allocated using ut_malloc_nokey().
2935 @return ptr to allocated memory */
2936 static
2937 byte*
fts_query_get_token(fts_ast_node_t * node,fts_string_t * token)2938 fts_query_get_token(
2939 /*================*/
2940 	fts_ast_node_t*	node,		/*!< in: the current sub tree */
2941 	fts_string_t*	token)		/*!< in: token to create */
2942 {
2943 	ulint		str_len;
2944 	byte*		new_ptr = NULL;
2945 
2946 	str_len = node->term.ptr->len;
2947 
2948 	ut_a(node->type == FTS_AST_TERM);
2949 
2950 	token->f_len = str_len;
2951 	token->f_str = node->term.ptr->str;
2952 
2953 	if (node->term.wildcard) {
2954 
2955 		token->f_str = static_cast<byte*>(ut_malloc_nokey(str_len + 2));
2956 		token->f_len = str_len + 1;
2957 
2958 		memcpy(token->f_str, node->term.ptr->str, str_len);
2959 
2960 		token->f_str[str_len] = '%';
2961 		token->f_str[token->f_len] = 0;
2962 
2963 		new_ptr = token->f_str;
2964 	}
2965 
2966 	return(new_ptr);
2967 }
2968 
2969 static dberr_t fts_ast_visit_sub_exp(fts_ast_node_t*, fts_ast_callback, void*);
2970 
2971 /*****************************************************************//**
2972 Visit every node of the AST. */
2973 static
2974 dberr_t
fts_query_visitor(fts_ast_oper_t oper,fts_ast_node_t * node,void * arg)2975 fts_query_visitor(
2976 /*==============*/
2977 	fts_ast_oper_t	oper,		/*!< in: current operator */
2978 	fts_ast_node_t*	node,		/*!< in: The root of the current subtree*/
2979 	void*		arg)		/*!< in: callback arg*/
2980 {
2981 	byte*		ptr;
2982 	fts_string_t	token;
2983 	fts_query_t*	query = static_cast<fts_query_t*>(arg);
2984 
2985 	ut_a(node);
2986 	DBUG_ENTER("fts_query_visitor");
2987 	DBUG_PRINT("fts", ("nodetype: %s", fts_ast_node_type_get(node->type)));
2988 
2989 	token.f_n_char = 0;
2990 	query->oper = oper;
2991 	query->cur_node = node;
2992 
2993 	switch (node->type) {
2994 	case FTS_AST_TEXT:
2995 	case FTS_AST_PARSER_PHRASE_LIST:
2996 
2997 		if (query->oper == FTS_EXIST) {
2998 			ut_ad(query->intersection == NULL);
2999 			query->intersection = rbt_create(
3000 				sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
3001 
3002 			query->total_size += SIZEOF_RBT_CREATE;
3003 		}
3004 
3005 		/* Set the current proximity distance. */
3006 		query->distance = node->text.distance;
3007 
3008 		/* Force collection of doc ids and the positions. */
3009 		query->collect_positions = TRUE;
3010 
3011 		query->error = fts_query_phrase_search(query, node);
3012 
3013 		query->collect_positions = FALSE;
3014 
3015 		if (query->oper == FTS_EXIST) {
3016 			fts_query_free_doc_ids(query, query->doc_ids);
3017 			query->doc_ids = query->intersection;
3018 			query->intersection = NULL;
3019 		}
3020 
3021 		break;
3022 
3023 	case FTS_AST_TERM:
3024 		token.f_str = node->term.ptr->str;
3025 		token.f_len = node->term.ptr->len;
3026 
3027 		/* Collect wildcard words for QUERY EXPANSION. */
3028 		if (node->term.wildcard && query->wildcard_words != NULL) {
3029 			ib_rbt_bound_t          parent;
3030 
3031 			if (rbt_search(query->wildcard_words, &parent, &token)
3032 			     != 0) {
3033 				fts_string_t	word;
3034 
3035 				fts_string_dup(&word, &token, query->heap);
3036 				rbt_add_node(query->wildcard_words, &parent,
3037 					     &word);
3038 			}
3039 		}
3040 
3041 		/* Add the word to our RB tree that will be used to
3042 		calculate this terms per document frequency. */
3043 		fts_query_add_word_freq(query, &token);
3044 
3045 		ptr = fts_query_get_token(node, &token);
3046 		query->error = fts_query_execute(query, &token);
3047 
3048 		if (ptr) {
3049 			ut_free(ptr);
3050 		}
3051 
3052 		break;
3053 
3054 	case FTS_AST_SUBEXP_LIST:
3055 		query->error = fts_ast_visit_sub_exp(node, fts_query_visitor, arg);
3056 		break;
3057 
3058 	default:
3059 		ut_error;
3060 	}
3061 
3062 	if (query->oper == FTS_EXIST) {
3063 		query->multi_exist = true;
3064 	}
3065 
3066 	DBUG_RETURN(query->error);
3067 }
3068 
3069 /** Process (nested) sub-expression, create a new result set to store the
3070 sub-expression result by processing nodes under current sub-expression
3071 list. Merge the sub-expression result with that of parent expression list.
3072 @param[in,out]	node	current root node
3073 @param[in,out]	visitor	callback function
3074 @param[in,out]	arg	argument for callback
3075 @return DB_SUCCESS if all go well */
3076 static
3077 dberr_t
fts_ast_visit_sub_exp(fts_ast_node_t * node,fts_ast_callback visitor,void * arg)3078 fts_ast_visit_sub_exp(
3079 	fts_ast_node_t*		node,
3080 	fts_ast_callback	visitor,
3081 	void*			arg)
3082 {
3083 	fts_ast_oper_t		cur_oper;
3084 	fts_query_t*		query = static_cast<fts_query_t*>(arg);
3085 	ib_rbt_t*		parent_doc_ids;
3086 	ib_rbt_t*		subexpr_doc_ids;
3087 	dberr_t			error = DB_SUCCESS;
3088 	bool			will_be_ignored = false;
3089 	bool			multi_exist;
3090 
3091 	DBUG_ENTER("fts_ast_visit_sub_exp");
3092 
3093 	ut_a(node->type == FTS_AST_SUBEXP_LIST);
3094 
3095 	/* To avoid stack overflow, we limit the mutual recursion
3096 	depth between fts_ast_visit(), fts_query_visitor() and
3097 	fts_ast_visit_sub_exp(). */
3098 	if (query->visiting_sub_exp++ > 31) {
3099 		query->error = DB_OUT_OF_MEMORY;
3100 		DBUG_RETURN(query->error);
3101 	}
3102 
3103 	cur_oper = query->oper;
3104 
3105 	/* Save current result set */
3106 	parent_doc_ids = query->doc_ids;
3107 
3108 	/* Create new result set to store the sub-expression result. We
3109 	will merge this result set with the parent after processing. */
3110 	query->doc_ids = rbt_create(sizeof(fts_ranking_t),
3111 				    fts_ranking_doc_id_cmp);
3112 
3113 	query->total_size += SIZEOF_RBT_CREATE;
3114 
3115 	multi_exist = query->multi_exist;
3116 	query->multi_exist = false;
3117 	/* Process nodes in current sub-expression and store its
3118 	result set in query->doc_ids we created above. */
3119 	error = fts_ast_visit(FTS_NONE, node, visitor,
3120 			      arg, &will_be_ignored);
3121 
3122 	/* Reinstate parent node state */
3123 	query->multi_exist = multi_exist;
3124 	query->oper = cur_oper;
3125 	query->visiting_sub_exp--;
3126 
3127 	/* Merge the sub-expression result with the parent result set. */
3128 	subexpr_doc_ids = query->doc_ids;
3129 	query->doc_ids = parent_doc_ids;
3130 	if (error == DB_SUCCESS) {
3131 		error = fts_merge_doc_ids(query, subexpr_doc_ids);
3132 	}
3133 
3134 	/* Free current result set. Result already merged into parent. */
3135 	fts_query_free_doc_ids(query, subexpr_doc_ids);
3136 
3137 	DBUG_RETURN(error);
3138 }
3139 
3140 #if 0
3141 /*****************************************************************//***
3142 Check if the doc id exists in the ilist.
3143 @return TRUE if doc id found */
3144 static
3145 ulint
3146 fts_query_find_doc_id(
3147 /*==================*/
3148 	fts_select_t*	select,		/*!< in/out: contains the doc id to
3149 					find, we update the word freq if
3150 					document found */
3151 	void*		data,		/*!< in: doc id ilist */
3152 	ulint		len)		/*!< in: doc id ilist size */
3153 {
3154 	byte*		ptr = data;
3155 	doc_id_t	doc_id = 0;
3156 	ulint		decoded = 0;
3157 
3158 	/* Decode the ilist and search for selected doc_id. We also
3159 	calculate the frequency of the word in the document if found. */
3160 	while (decoded < len && !select->found) {
3161 		ulint		freq = 0;
3162 		ulint		min_pos = 0;
3163 		ulint		last_pos = 0;
3164 		ulint		pos = fts_decode_vlc(&ptr);
3165 
3166 		/* Add the delta. */
3167 		doc_id += pos;
3168 
3169 		while (*ptr) {
3170 			++freq;
3171 			last_pos += fts_decode_vlc(&ptr);
3172 
3173 			/* Only if min_pos is not set and the current
3174 			term exists in a position greater than the
3175 			min_pos of the previous term. */
3176 			if (min_pos == 0 && last_pos > select->min_pos) {
3177 				min_pos = last_pos;
3178 			}
3179 		}
3180 
3181 		/* Skip the end of word position marker. */
3182 		++ptr;
3183 
3184 		/* Bytes decoded so far. */
3185 		decoded = ptr - (byte*) data;
3186 
3187 		/* A word may exist in the document but we only consider a
3188 		match if it exists in a position that is greater than the
3189 		position of the previous term. */
3190 		if (doc_id == select->doc_id && min_pos > 0) {
3191 			fts_doc_freq_t*	doc_freq;
3192 
3193 			/* Add the doc id to the doc freq rb tree, if
3194 			the doc id doesn't exist it will be created. */
3195 			doc_freq = fts_query_add_doc_freq(
3196 				select->word_freq->doc_freqs, doc_id);
3197 
3198 			/* Avoid duplicating the frequency tally */
3199 			if (doc_freq->freq == 0) {
3200 				doc_freq->freq = freq;
3201 			}
3202 
3203 			select->found = TRUE;
3204 			select->min_pos = min_pos;
3205 		}
3206 	}
3207 
3208 	return(select->found);
3209 }
3210 #endif
3211 
3212 /*****************************************************************//**
3213 Read and filter nodes.
3214 @return DB_SUCCESS if all go well,
3215 or return DB_FTS_EXCEED_RESULT_CACHE_LIMIT */
3216 static
3217 dberr_t
fts_query_filter_doc_ids(fts_query_t * query,const fts_string_t * word,fts_word_freq_t * word_freq,const fts_node_t * node,void * data,ulint len,ibool calc_doc_count)3218 fts_query_filter_doc_ids(
3219 /*=====================*/
3220 	fts_query_t*		query,		/*!< in: query instance */
3221 	const fts_string_t*	word,		/*!< in: the current word */
3222 	fts_word_freq_t*	word_freq,	/*!< in/out: word frequency */
3223 	const fts_node_t*	node,		/*!< in: current FTS node */
3224 	void*			data,		/*!< in: doc id ilist */
3225 	ulint			len,		/*!< in: doc id ilist size */
3226 	ibool			calc_doc_count)	/*!< in: whether to remember doc count */
3227 {
3228 	const byte*	ptr = static_cast<byte*>(data);
3229 	doc_id_t	doc_id = 0;
3230 	ulint		decoded = 0;
3231 	ib_rbt_t*	doc_freqs = word_freq->doc_freqs;
3232 
3233 	/* Decode the ilist and add the doc ids to the query doc_id set. */
3234 	while (decoded < len) {
3235 		ulint		freq = 0;
3236 		fts_doc_freq_t*	doc_freq;
3237 		fts_match_t*	match = NULL;
3238 		doc_id_t	last_pos = 0;
3239 		doc_id_t	pos = fts_decode_vlc(&ptr);
3240 
3241 		/* Some sanity checks. */
3242 		if (doc_id == 0) {
3243 			ut_a(pos == node->first_doc_id);
3244 		}
3245 
3246 		/* Add the delta. */
3247 		doc_id += pos;
3248 
3249 		if (calc_doc_count) {
3250 			word_freq->doc_count++;
3251 		}
3252 
3253 		/* We simply collect the matching instances here. */
3254 		if (query->collect_positions) {
3255 			ib_alloc_t*	heap_alloc;
3256 
3257 			/* Create a new fts_match_t instance. */
3258 			match = static_cast<fts_match_t*>(
3259 				ib_vector_push(query->matched, NULL));
3260 
3261 			match->start = 0;
3262 			match->doc_id = doc_id;
3263 			heap_alloc = ib_vector_allocator(query->matched);
3264 
3265 			/* Allocate from the same heap as the
3266 			parent container. */
3267 			match->positions = ib_vector_create(
3268 				heap_alloc, sizeof(ulint), 64);
3269 
3270 			query->total_size += sizeof(fts_match_t)
3271 				+ sizeof(ib_vector_t)
3272 				+ sizeof(ulint) * 64;
3273 		}
3274 
3275 		/* Unpack the positions within the document. */
3276 		while (*ptr) {
3277 			last_pos += fts_decode_vlc(&ptr);
3278 
3279 			/* Collect the matching word positions, for phrase
3280 			matching later. */
3281 			if (query->collect_positions) {
3282 				ib_vector_push(match->positions, &last_pos);
3283 			}
3284 
3285 			++freq;
3286 		}
3287 
3288 		/* End of list marker. */
3289 		last_pos = (ulint) -1;
3290 
3291 		if (query->collect_positions) {
3292 			ut_a(match != NULL);
3293 			ib_vector_push(match->positions, &last_pos);
3294 		}
3295 
3296 		/* Add the doc id to the doc freq rb tree, if the doc id
3297 		doesn't exist it will be created. */
3298 		doc_freq = fts_query_add_doc_freq(query, doc_freqs, doc_id);
3299 
3300 		/* Avoid duplicating frequency tally. */
3301 		if (doc_freq->freq == 0) {
3302 			doc_freq->freq = freq;
3303 		}
3304 
3305 		/* Skip the end of word position marker. */
3306 		++ptr;
3307 
3308 		/* Bytes decoded so far */
3309 		decoded = ulint(ptr - (byte*) data);
3310 
3311 		/* We simply collect the matching documents and the
3312 		positions here and match later. */
3313 		if (!query->collect_positions) {
3314 			/* We ignore error here and will check it later */
3315 			fts_query_process_doc_id(query, doc_id, 0);
3316 
3317 			/* Add the word to the document's matched RB tree. */
3318 			fts_query_add_word_to_document(query, doc_id, word);
3319 		}
3320 	}
3321 
3322 	/* Some sanity checks. */
3323 	ut_a(doc_id == node->last_doc_id);
3324 
3325 	if (query->total_size > fts_result_cache_limit) {
3326 		return(DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
3327 	} else {
3328 		return(DB_SUCCESS);
3329 	}
3330 }
3331 
3332 /*****************************************************************//**
3333 Read the FTS INDEX row.
3334 @return DB_SUCCESS if all go well. */
3335 static
3336 dberr_t
fts_query_read_node(fts_query_t * query,const fts_string_t * word,que_node_t * exp)3337 fts_query_read_node(
3338 /*================*/
3339 	fts_query_t*		query,	/*!< in: query instance */
3340 	const fts_string_t*	word,	/*!< in: current word */
3341 	que_node_t*		exp)	/*!< in: query graph node */
3342 {
3343 	int			i;
3344 	int			ret;
3345 	fts_node_t		node;
3346 	ib_rbt_bound_t		parent;
3347 	fts_word_freq_t*	word_freq;
3348 	ibool			skip = FALSE;
3349 	fts_string_t		term;
3350 	byte			buf[FTS_MAX_WORD_LEN + 1];
3351 	dberr_t			error = DB_SUCCESS;
3352 
3353 	ut_a(query->cur_node->type == FTS_AST_TERM
3354 	     || query->cur_node->type == FTS_AST_TEXT
3355 	     || query->cur_node->type == FTS_AST_PARSER_PHRASE_LIST);
3356 
3357 	memset(&node, 0, sizeof(node));
3358 	term.f_str = buf;
3359 
3360 	/* Need to consider the wildcard search case, the word frequency
3361 	is created on the search string not the actual word. So we need
3362 	to assign the frequency on search string behalf. */
3363 	if (query->cur_node->type == FTS_AST_TERM
3364 	    && query->cur_node->term.wildcard) {
3365 
3366 		term.f_len = query->cur_node->term.ptr->len;
3367 		ut_ad(FTS_MAX_WORD_LEN >= term.f_len);
3368 		memcpy(term.f_str, query->cur_node->term.ptr->str, term.f_len);
3369 	} else {
3370 		term.f_len = word->f_len;
3371 		ut_ad(FTS_MAX_WORD_LEN >= word->f_len);
3372 		memcpy(term.f_str, word->f_str, word->f_len);
3373 	}
3374 
3375 	/* Lookup the word in our rb tree, it must exist. */
3376 	ret = rbt_search(query->word_freqs, &parent, &term);
3377 
3378 	ut_a(ret == 0);
3379 
3380 	word_freq = rbt_value(fts_word_freq_t, parent.last);
3381 
3382 	/* Start from 1 since the first column has been read by the caller.
3383 	Also, we rely on the order of the columns projected, to filter
3384 	out ilists that are out of range and we always want to read
3385 	the doc_count irrespective of the suitablility of the row. */
3386 
3387 	for (i = 1; exp && !skip; exp = que_node_get_next(exp), ++i) {
3388 
3389 		dfield_t*	dfield = que_node_get_val(exp);
3390 		byte*		data = static_cast<byte*>(
3391 			dfield_get_data(dfield));
3392 		ulint		len = dfield_get_len(dfield);
3393 
3394 		ut_a(len != UNIV_SQL_NULL);
3395 
3396 		/* Note: The column numbers below must match the SELECT. */
3397 
3398 		switch (i) {
3399 		case 1: /* DOC_COUNT */
3400 			word_freq->doc_count += mach_read_from_4(data);
3401 			break;
3402 
3403 		case 2: /* FIRST_DOC_ID */
3404 			node.first_doc_id = fts_read_doc_id(data);
3405 
3406 			/* Skip nodes whose doc ids are out range. */
3407 			if (query->oper == FTS_EXIST
3408 			    && query->upper_doc_id > 0
3409 			    && node.first_doc_id > query->upper_doc_id) {
3410 				skip = TRUE;
3411 			}
3412 			break;
3413 
3414 		case 3: /* LAST_DOC_ID */
3415 			node.last_doc_id = fts_read_doc_id(data);
3416 
3417 			/* Skip nodes whose doc ids are out range. */
3418 			if (query->oper == FTS_EXIST
3419 			    && query->lower_doc_id > 0
3420 			    && node.last_doc_id < query->lower_doc_id) {
3421 				skip = TRUE;
3422 			}
3423 			break;
3424 
3425 		case 4: /* ILIST */
3426 
3427 			error = fts_query_filter_doc_ids(
3428 					query, &word_freq->word, word_freq,
3429 					&node, data, len, FALSE);
3430 
3431 			break;
3432 
3433 		default:
3434 			ut_error;
3435 		}
3436 	}
3437 
3438 	if (!skip) {
3439 		/* Make sure all columns were read. */
3440 
3441 		ut_a(i == 5);
3442 	}
3443 
3444 	return error;
3445 }
3446 
3447 /*****************************************************************//**
3448 Callback function to fetch the rows in an FTS INDEX record.
3449 @return always returns TRUE */
3450 static
3451 ibool
fts_query_index_fetch_nodes(void * row,void * user_arg)3452 fts_query_index_fetch_nodes(
3453 /*========================*/
3454 	void*		row,		/*!< in: sel_node_t* */
3455 	void*		user_arg)	/*!< in: pointer to fts_fetch_t */
3456 {
3457 	fts_string_t	key;
3458 	sel_node_t*	sel_node = static_cast<sel_node_t*>(row);
3459 	fts_fetch_t*	fetch = static_cast<fts_fetch_t*>(user_arg);
3460 	fts_query_t*	query = static_cast<fts_query_t*>(fetch->read_arg);
3461 	que_node_t*	exp = sel_node->select_list;
3462 	dfield_t*	dfield = que_node_get_val(exp);
3463 	void*		data = dfield_get_data(dfield);
3464 	ulint		dfield_len = dfield_get_len(dfield);
3465 
3466 	key.f_str = static_cast<byte*>(data);
3467 	key.f_len = dfield_len;
3468 
3469 	ut_a(dfield_len <= FTS_MAX_WORD_LEN);
3470 
3471 	/* Note: we pass error out by 'query->error' */
3472 	query->error = fts_query_read_node(query, &key, que_node_get_next(exp));
3473 
3474 	if (query->error != DB_SUCCESS) {
3475 		ut_ad(query->error == DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
3476 		return(FALSE);
3477 	} else {
3478 		return(TRUE);
3479 	}
3480 }
3481 
3482 /*****************************************************************//**
3483 Calculate the inverse document frequency (IDF) for all the terms. */
3484 static
3485 void
fts_query_calculate_idf(fts_query_t * query)3486 fts_query_calculate_idf(
3487 /*====================*/
3488 	fts_query_t*	query)	/*!< in: Query state */
3489 {
3490 	const ib_rbt_node_t*	node;
3491 	ib_uint64_t		total_docs = query->total_docs;
3492 
3493 	/* We need to free any instances of fts_doc_freq_t that we
3494 	may have allocated. */
3495 	for (node = rbt_first(query->word_freqs);
3496 	     node;
3497 	     node = rbt_next(query->word_freqs, node)) {
3498 
3499 		fts_word_freq_t*	word_freq;
3500 
3501 		word_freq = rbt_value(fts_word_freq_t, node);
3502 
3503 		if (word_freq->doc_count > 0) {
3504 			if (total_docs == word_freq->doc_count) {
3505 				/* QP assume ranking > 0 if we find
3506 				a match. Since Log10(1) = 0, we cannot
3507 				make IDF a zero value if do find a
3508 				word in all documents. So let's make
3509 				it an arbitrary very small number */
3510 				word_freq->idf = log10(1.0001);
3511 			} else {
3512 				word_freq->idf = log10(
3513 					total_docs
3514 					/ (double) word_freq->doc_count);
3515 			}
3516 		}
3517 	}
3518 }
3519 
3520 /*****************************************************************//**
3521 Calculate the ranking of the document. */
3522 static
3523 void
fts_query_calculate_ranking(const fts_query_t * query,fts_ranking_t * ranking)3524 fts_query_calculate_ranking(
3525 /*========================*/
3526 	const fts_query_t*	query,		/*!< in: query state */
3527 	fts_ranking_t*		ranking)	/*!< in: Document to rank */
3528 {
3529 	ulint	pos = 0;
3530 	fts_string_t	word;
3531 
3532 	/* At this stage, ranking->rank should not exceed the 1.0
3533 	bound */
3534 	ut_ad(ranking->rank <= 1.0 && ranking->rank >= -1.0);
3535 	ut_ad(rbt_size(query->word_map) == query->word_vector->size());
3536 
3537 	while (fts_ranking_words_get_next(query, ranking, &pos, &word)) {
3538 		int			ret;
3539 		ib_rbt_bound_t		parent;
3540 		double			weight;
3541 		fts_doc_freq_t*		doc_freq;
3542 		fts_word_freq_t*	word_freq;
3543 
3544 		ret = rbt_search(query->word_freqs, &parent, &word);
3545 
3546 		/* It must exist. */
3547 		ut_a(ret == 0);
3548 
3549 		word_freq = rbt_value(fts_word_freq_t, parent.last);
3550 
3551 		ret = rbt_search(
3552 			word_freq->doc_freqs, &parent, &ranking->doc_id);
3553 
3554 		/* It must exist. */
3555 		ut_a(ret == 0);
3556 
3557 		doc_freq = rbt_value(fts_doc_freq_t, parent.last);
3558 
3559 		weight = (double) doc_freq->freq * word_freq->idf;
3560 
3561 		ranking->rank += (fts_rank_t) (weight * word_freq->idf);
3562 	}
3563 }
3564 
3565 /*****************************************************************//**
3566 Add ranking to the result set. */
3567 static
3568 void
fts_query_add_ranking(fts_query_t * query,ib_rbt_t * ranking_tree,const fts_ranking_t * new_ranking)3569 fts_query_add_ranking(
3570 /*==================*/
3571 	fts_query_t*		query,		/*!< in: query state */
3572 	ib_rbt_t*		ranking_tree,	/*!< in: ranking tree */
3573 	const fts_ranking_t*	new_ranking)	/*!< in: ranking of a document */
3574 {
3575 	ib_rbt_bound_t		parent;
3576 
3577 	/* Lookup the ranking in our rb tree and add if it doesn't exist. */
3578 	if (rbt_search(ranking_tree, &parent, new_ranking) == 0) {
3579 		fts_ranking_t*	ranking;
3580 
3581 		ranking = rbt_value(fts_ranking_t, parent.last);
3582 
3583 		ranking->rank += new_ranking->rank;
3584 
3585 		ut_a(ranking->words == NULL);
3586 	} else {
3587 		rbt_add_node(ranking_tree, &parent, new_ranking);
3588 
3589 		query->total_size += SIZEOF_RBT_NODE_ADD
3590 			+ sizeof(fts_ranking_t);
3591 	}
3592 }
3593 
3594 /*****************************************************************//**
3595 Retrieve the FTS Relevance Ranking result for doc with doc_id
3596 @return the relevance ranking value, 0 if no ranking value
3597 present. */
3598 float
fts_retrieve_ranking(fts_result_t * result,doc_id_t doc_id)3599 fts_retrieve_ranking(
3600 /*=================*/
3601 	fts_result_t*	result,	/*!< in: FTS result structure */
3602 	doc_id_t	doc_id)	/*!< in: doc_id of the item to retrieve */
3603 {
3604 	ib_rbt_bound_t		parent;
3605 	fts_ranking_t		new_ranking;
3606 
3607 	DBUG_ENTER("fts_retrieve_ranking");
3608 
3609 	if (!result || !result->rankings_by_id) {
3610 		DBUG_RETURN(0);
3611 	}
3612 
3613 	new_ranking.doc_id = doc_id;
3614 
3615 	/* Lookup the ranking in our rb tree */
3616 	if (rbt_search(result->rankings_by_id, &parent, &new_ranking) == 0) {
3617 		fts_ranking_t*  ranking;
3618 
3619 		ranking = rbt_value(fts_ranking_t, parent.last);
3620 
3621 		DBUG_RETURN(ranking->rank);
3622 	}
3623 
3624 	DBUG_RETURN(0);
3625 }
3626 
3627 /*****************************************************************//**
3628 Create the result and copy the data to it. */
3629 static
3630 fts_result_t*
fts_query_prepare_result(fts_query_t * query,fts_result_t * result)3631 fts_query_prepare_result(
3632 /*=====================*/
3633 	fts_query_t*	query,	/*!< in: Query state */
3634 	fts_result_t*	result)	/*!< in: result this can contain
3635 				data from a previous search on
3636 				another FTS index */
3637 {
3638 	const ib_rbt_node_t*	node;
3639 	bool			result_is_null = false;
3640 
3641 	DBUG_ENTER("fts_query_prepare_result");
3642 
3643 	if (result == NULL) {
3644 		result = static_cast<fts_result_t*>(
3645 			ut_zalloc_nokey(sizeof(*result)));
3646 
3647 		result->rankings_by_id = rbt_create(
3648 			sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
3649 
3650 		query->total_size += sizeof(fts_result_t) + SIZEOF_RBT_CREATE;
3651 		result_is_null = true;
3652 	}
3653 
3654 	if (query->flags == FTS_OPT_RANKING) {
3655 		fts_word_freq_t*	word_freq;
3656 		ulint		size = ib_vector_size(query->deleted->doc_ids);
3657 		doc_id_t*	updates =
3658 			(doc_id_t*) query->deleted->doc_ids->data;
3659 
3660 		node = rbt_first(query->word_freqs);
3661 		ut_ad(node);
3662 		word_freq = rbt_value(fts_word_freq_t, node);
3663 
3664 		for (node = rbt_first(word_freq->doc_freqs);
3665 		     node;
3666 		     node = rbt_next(word_freq->doc_freqs, node)) {
3667 			fts_doc_freq_t* doc_freq;
3668 			fts_ranking_t	ranking;
3669 
3670 			doc_freq = rbt_value(fts_doc_freq_t, node);
3671 
3672 			/* Don't put deleted docs into result */
3673 			if (fts_bsearch(updates, 0, static_cast<int>(size),
3674 					doc_freq->doc_id) >= 0) {
3675 				/* one less matching doc count */
3676 				--word_freq->doc_count;
3677 				continue;
3678 			}
3679 
3680 			ranking.doc_id = doc_freq->doc_id;
3681 			ranking.rank = static_cast<fts_rank_t>(doc_freq->freq);
3682 			ranking.words = NULL;
3683 
3684 			fts_query_add_ranking(query, result->rankings_by_id,
3685 					      &ranking);
3686 
3687 			if (query->total_size > fts_result_cache_limit) {
3688 				query->error = DB_FTS_EXCEED_RESULT_CACHE_LIMIT;
3689 				fts_query_free_result(result);
3690 				DBUG_RETURN(NULL);
3691 			}
3692 		}
3693 
3694 		/* Calculate IDF only after we exclude the deleted items */
3695 		fts_query_calculate_idf(query);
3696 
3697 		node = rbt_first(query->word_freqs);
3698 		word_freq = rbt_value(fts_word_freq_t, node);
3699 
3700 		/* Calculate the ranking for each doc */
3701 		for (node = rbt_first(result->rankings_by_id);
3702 		     node != NULL;
3703 		     node = rbt_next(result->rankings_by_id, node)) {
3704 
3705 			fts_ranking_t*  ranking;
3706 
3707 			ranking = rbt_value(fts_ranking_t, node);
3708 
3709 			ranking->rank = static_cast<fts_rank_t>(
3710 				ranking->rank * word_freq->idf * word_freq->idf);
3711 		}
3712 
3713 		DBUG_RETURN(result);
3714 	}
3715 
3716 	ut_a(rbt_size(query->doc_ids) > 0);
3717 
3718 	for (node = rbt_first(query->doc_ids);
3719 	     node;
3720 	     node = rbt_next(query->doc_ids, node)) {
3721 
3722 		fts_ranking_t*	ranking;
3723 
3724 		ranking = rbt_value(fts_ranking_t, node);
3725 		fts_query_calculate_ranking(query, ranking);
3726 
3727 		// FIXME: I think we may requre this information to improve the
3728 		// ranking of doc ids which have more word matches from
3729 		// different FTS indexes.
3730 
3731 		/* We don't need these anymore free the resources. */
3732 		ranking->words = NULL;
3733 
3734 		if (!result_is_null) {
3735 			fts_query_add_ranking(query, result->rankings_by_id, ranking);
3736 
3737 			 if (query->total_size > fts_result_cache_limit) {
3738 				query->error = DB_FTS_EXCEED_RESULT_CACHE_LIMIT;
3739 				fts_query_free_result(result);
3740 				DBUG_RETURN(NULL);
3741                         }
3742 		}
3743 	}
3744 
3745 	if (result_is_null) {
3746 		/* Use doc_ids directly */
3747 		rbt_free(result->rankings_by_id);
3748 		result->rankings_by_id = query->doc_ids;
3749 		query->doc_ids = NULL;
3750 	}
3751 
3752 	DBUG_RETURN(result);
3753 }
3754 
3755 /*****************************************************************//**
3756 Get the result of the query. Calculate the similarity coefficient. */
3757 static
3758 fts_result_t*
fts_query_get_result(fts_query_t * query,fts_result_t * result)3759 fts_query_get_result(
3760 /*=================*/
3761 	fts_query_t*		query,	/*!< in: query instance */
3762 	fts_result_t*		result)	/*!< in: result */
3763 {
3764 	DBUG_ENTER("fts_query_get_result");
3765 
3766 	if (rbt_size(query->doc_ids) > 0 || query->flags == FTS_OPT_RANKING) {
3767 		/* Copy the doc ids to the result. */
3768 		result = fts_query_prepare_result(query, result);
3769 	} else {
3770 		/* Create an empty result instance. */
3771 		result = static_cast<fts_result_t*>(
3772 			ut_zalloc_nokey(sizeof(*result)));
3773 	}
3774 
3775 	DBUG_RETURN(result);
3776 }
3777 
3778 /*****************************************************************//**
3779 FTS Query free resources and reset. */
3780 static
3781 void
fts_query_free(fts_query_t * query)3782 fts_query_free(
3783 /*===========*/
3784 	fts_query_t*	query)		/*!< in: query instance to free*/
3785 {
3786 
3787 	if (query->read_nodes_graph) {
3788 		fts_que_graph_free(query->read_nodes_graph);
3789 	}
3790 
3791 	if (query->root) {
3792 		fts_ast_free_node(query->root);
3793 	}
3794 
3795 	if (query->deleted) {
3796 		fts_doc_ids_free(query->deleted);
3797 	}
3798 
3799 	if (query->intersection) {
3800 		fts_query_free_doc_ids(query, query->intersection);
3801 	}
3802 
3803 	if (query->doc_ids) {
3804 		fts_query_free_doc_ids(query, query->doc_ids);
3805 	}
3806 
3807 	if (query->word_freqs) {
3808 		const ib_rbt_node_t*	node;
3809 
3810 		/* We need to free any instances of fts_doc_freq_t that we
3811 		may have allocated. */
3812 		for (node = rbt_first(query->word_freqs);
3813 		     node;
3814 		     node = rbt_next(query->word_freqs, node)) {
3815 
3816 			fts_word_freq_t*	word_freq;
3817 
3818 			word_freq = rbt_value(fts_word_freq_t, node);
3819 
3820 			/* We need to cast away the const. */
3821 			rbt_free(word_freq->doc_freqs);
3822 		}
3823 
3824 		rbt_free(query->word_freqs);
3825 	}
3826 
3827 	if (query->wildcard_words != NULL) {
3828 		rbt_free(query->wildcard_words);
3829 	}
3830 
3831 	ut_a(!query->intersection);
3832 
3833 	if (query->word_map) {
3834 		rbt_free(query->word_map);
3835 	}
3836 
3837 	if (query->word_vector != NULL) {
3838 		UT_DELETE(query->word_vector);
3839 	}
3840 
3841 	if (query->heap) {
3842 		mem_heap_free(query->heap);
3843 	}
3844 
3845 	memset(query, 0, sizeof(*query));
3846 }
3847 
3848 /*****************************************************************//**
3849 Parse the query using flex/bison or plugin parser.
3850 @return parse tree node. */
3851 static
3852 fts_ast_node_t*
fts_query_parse(fts_query_t * query,byte * query_str,ulint query_len)3853 fts_query_parse(
3854 /*============*/
3855 	fts_query_t*	query,		/*!< in: query instance */
3856 	byte*		query_str,	/*!< in: query string */
3857 	ulint		query_len)	/*!< in: query string length */
3858 {
3859 	int		error;
3860 	fts_ast_state_t state;
3861 	bool		mode = query->boolean_mode;
3862 	DBUG_ENTER("fts_query_parse");
3863 
3864 	memset(&state, 0x0, sizeof(state));
3865 
3866 	state.charset = query->fts_index_table.charset;
3867 
3868 	DBUG_EXECUTE_IF("fts_instrument_query_disable_parser",
3869 		query->parser = NULL;);
3870 
3871 	if (query->parser) {
3872 		state.root = state.cur_node =
3873 			fts_ast_create_node_list(&state, NULL);
3874 		error = fts_parse_by_parser(mode, query_str, query_len,
3875 					    query->parser, &state);
3876 	} else {
3877 		/* Setup the scanner to use, this depends on the mode flag. */
3878 		state.lexer = fts_lexer_create(mode, query_str, query_len);
3879 		state.charset = query->fts_index_table.charset;
3880 		error = fts_parse(&state);
3881 		fts_lexer_free(state.lexer);
3882 		state.lexer = NULL;
3883 	}
3884 
3885 	/* Error during parsing ? */
3886 	if (error) {
3887 		/* Free the nodes that were allocated during parsing. */
3888 		fts_ast_state_free(&state);
3889 	} else {
3890 		query->root = state.root;
3891 
3892 		if (UNIV_UNLIKELY(fts_enable_diag_print) && query->root) {
3893 			fts_ast_node_print(query->root);
3894 		}
3895 	}
3896 
3897 	DBUG_RETURN(state.root);
3898 }
3899 
3900 /*******************************************************************//**
3901 FTS Query optimization
3902 Set FTS_OPT_RANKING if it is a simple term query */
3903 static
3904 void
fts_query_can_optimize(fts_query_t * query,uint flags)3905 fts_query_can_optimize(
3906 /*===================*/
3907 	fts_query_t*	query,		/*!< in/out: query instance */
3908 	uint		flags)		/*!< In: FTS search mode */
3909 {
3910 	fts_ast_node_t*	node = query->root;
3911 
3912 	if (flags & FTS_EXPAND) {
3913 		return;
3914 	}
3915 
3916 	/* Check if it has only a term without oper */
3917 	ut_ad(node->type == FTS_AST_LIST);
3918 	node = node->list.head;
3919 	if (node != NULL && node->type == FTS_AST_TERM && node->next == NULL) {
3920 		query->flags = FTS_OPT_RANKING;
3921 	}
3922 }
3923 
3924 /** FTS Query entry point.
3925 @param[in,out]	trx		transaction
3926 @param[in]	index		fts index to search
3927 @param[in]	flags		FTS search mode
3928 @param[in]	query_str	FTS query
3929 @param[in]	query_len	FTS query string len in bytes
3930 @param[in,out]	result		result doc ids
3931 @return DB_SUCCESS if successful otherwise error code */
3932 dberr_t
fts_query(trx_t * trx,dict_index_t * index,uint flags,const byte * query_str,ulint query_len,fts_result_t ** result)3933 fts_query(
3934 	trx_t*		trx,
3935 	dict_index_t*	index,
3936 	uint		flags,
3937 	const byte*	query_str,
3938 	ulint		query_len,
3939 	fts_result_t**	result)
3940 {
3941 	fts_query_t	query;
3942 	dberr_t		error = DB_SUCCESS;
3943 	byte*		lc_query_str;
3944 	ulint		lc_query_str_len;
3945 	ulint		result_len;
3946 	bool		boolean_mode;
3947 	trx_t*		query_trx; /* FIXME: use provided trx */
3948 	CHARSET_INFO*	charset;
3949 	ulint		start_time_ms;
3950 	bool		will_be_ignored = false;
3951 
3952 	boolean_mode = flags & FTS_BOOL;
3953 
3954 	*result = NULL;
3955 	memset(&query, 0x0, sizeof(query));
3956 	query_trx = trx_create();
3957 	query_trx->op_info = "FTS query";
3958 
3959 	start_time_ms = ut_time_ms();
3960 
3961 	query.trx = query_trx;
3962 	query.index = index;
3963 	query.boolean_mode = boolean_mode;
3964 	query.deleted = fts_doc_ids_create();
3965 	query.cur_node = NULL;
3966 
3967 	query.fts_common_table.type = FTS_COMMON_TABLE;
3968 	query.fts_common_table.table_id = index->table->id;
3969 	query.fts_common_table.table = index->table;
3970 
3971 	charset = fts_index_get_charset(index);
3972 
3973 	query.fts_index_table.type = FTS_INDEX_TABLE;
3974 	query.fts_index_table.index_id = index->id;
3975 	query.fts_index_table.table_id = index->table->id;
3976 	query.fts_index_table.charset = charset;
3977 	query.fts_index_table.table = index->table;
3978 
3979 	query.word_map = rbt_create_arg_cmp(
3980 		sizeof(fts_string_t), innobase_fts_text_cmp, (void*)charset);
3981 	query.word_vector = UT_NEW_NOKEY(word_vector_t());
3982 	query.error = DB_SUCCESS;
3983 
3984 	/* Setup the RB tree that will be used to collect per term
3985 	statistics. */
3986 	query.word_freqs = rbt_create_arg_cmp(
3987 		sizeof(fts_word_freq_t), innobase_fts_text_cmp,
3988                 (void*) charset);
3989 
3990 	if (flags & FTS_EXPAND) {
3991 		query.wildcard_words = rbt_create_arg_cmp(
3992 			sizeof(fts_string_t), innobase_fts_text_cmp, (void *)charset);
3993 	}
3994 
3995 	query.total_size += SIZEOF_RBT_CREATE;
3996 
3997 	query.total_docs = dict_table_get_n_rows(index->table);
3998 
3999 	query.fts_common_table.suffix = "DELETED";
4000 
4001 	/* Read the deleted doc_ids, we need these for filtering. */
4002 	error = fts_table_fetch_doc_ids(
4003 		NULL, &query.fts_common_table, query.deleted);
4004 
4005 	if (error != DB_SUCCESS) {
4006 		goto func_exit;
4007 	}
4008 
4009 	query.fts_common_table.suffix = "DELETED_CACHE";
4010 
4011 	error = fts_table_fetch_doc_ids(
4012 		NULL, &query.fts_common_table, query.deleted);
4013 
4014 	if (error != DB_SUCCESS) {
4015 		goto func_exit;
4016 	}
4017 
4018 	/* Get the deleted doc ids that are in the cache. */
4019 	fts_cache_append_deleted_doc_ids(
4020 		index->table->fts->cache, query.deleted->doc_ids);
4021 	DEBUG_SYNC_C("fts_deleted_doc_ids_append");
4022 
4023 	/* Sort the vector so that we can do a binary search over the ids. */
4024 	ib_vector_sort(query.deleted->doc_ids, fts_doc_id_cmp);
4025 
4026 	/* Convert the query string to lower case before parsing. We own
4027 	the ut_malloc'ed result and so remember to free it before return. */
4028 
4029 	lc_query_str_len = query_len * charset->casedn_multiply + 1;
4030 	lc_query_str = static_cast<byte*>(ut_malloc_nokey(lc_query_str_len));
4031 
4032 	/* For binary collations, a case sensitive search is
4033 	performed. Hence don't convert to lower case. */
4034 	if (my_binary_compare(charset)) {
4035 	memcpy(lc_query_str, query_str, query_len);
4036 		lc_query_str[query_len]= 0;
4037 		result_len= query_len;
4038 	} else {
4039 	result_len = innobase_fts_casedn_str(
4040 				charset, (char*)( query_str), query_len,
4041 				(char*)(lc_query_str), lc_query_str_len);
4042 	}
4043 
4044 	ut_ad(result_len < lc_query_str_len);
4045 
4046 	lc_query_str[result_len] = 0;
4047 
4048 	query.heap = mem_heap_create(128);
4049 
4050 	/* Create the rb tree for the doc id (current) set. */
4051 	query.doc_ids = rbt_create(
4052 		sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
4053 	query.parser = index->parser;
4054 
4055 	query.total_size += SIZEOF_RBT_CREATE;
4056 
4057 	/* Parse the input query string. */
4058 	if (fts_query_parse(&query, lc_query_str, result_len)) {
4059 		fts_ast_node_t*	ast = query.root;
4060 		ast->trx = trx;
4061 
4062 		/* Optimize query to check if it's a single term */
4063 		fts_query_can_optimize(&query, flags);
4064 
4065 		DBUG_EXECUTE_IF("fts_instrument_result_cache_limit",
4066 			        fts_result_cache_limit = 2048;
4067 		);
4068 
4069 		/* Traverse the Abstract Syntax Tree (AST) and execute
4070 		the query. */
4071 		query.error = fts_ast_visit(
4072 			FTS_NONE, ast, fts_query_visitor,
4073 			&query, &will_be_ignored);
4074 		if (query.error == DB_INTERRUPTED) {
4075 			error = DB_INTERRUPTED;
4076 			ut_free(lc_query_str);
4077 			goto func_exit;
4078 		}
4079 
4080 		/* If query expansion is requested, extend the search
4081 		with first search pass result */
4082 		if (query.error == DB_SUCCESS && (flags & FTS_EXPAND)) {
4083 			query.error = fts_expand_query(index, &query);
4084 		}
4085 
4086 		/* Calculate the inverse document frequency of the terms. */
4087 		if (query.error == DB_SUCCESS
4088 		    && query.flags != FTS_OPT_RANKING) {
4089 			fts_query_calculate_idf(&query);
4090 		}
4091 
4092 		/* Copy the result from the query state, so that we can
4093 		return it to the caller. */
4094 		if (query.error == DB_SUCCESS) {
4095 			*result = fts_query_get_result(&query, *result);
4096 		}
4097 
4098 		error = query.error;
4099 	} else {
4100 		/* still return an empty result set */
4101 		*result = static_cast<fts_result_t*>(
4102 			ut_zalloc_nokey(sizeof(**result)));
4103 	}
4104 
4105 	if (trx_is_interrupted(trx)) {
4106 		error = DB_INTERRUPTED;
4107 		ut_free(lc_query_str);
4108 		if (*result) {
4109 			fts_query_free_result(*result);
4110 		}
4111 		goto func_exit;
4112 	}
4113 
4114 	ut_free(lc_query_str);
4115 
4116 	if (UNIV_UNLIKELY(fts_enable_diag_print) && (*result)) {
4117 		ulint	diff_time = ut_time_ms() - start_time_ms;
4118 
4119 		ib::info() << "FTS Search Processing time: "
4120 			<< diff_time / 1000 << " secs: " << diff_time % 1000
4121 			<< " millisec: row(s) "
4122 			<< ((*result)->rankings_by_id
4123 			    ? lint(rbt_size((*result)->rankings_by_id))
4124 			    : -1);
4125 
4126 		/* Log memory consumption & result size */
4127 		ib::info() << "Full Search Memory: " << query.total_size
4128 			<< " (bytes),  Row: "
4129 			<< ((*result)->rankings_by_id
4130 			    ? rbt_size((*result)->rankings_by_id)
4131 			    : 0)
4132 			<< ".";
4133 	}
4134 
4135 func_exit:
4136 	fts_query_free(&query);
4137 
4138 	query_trx->free();
4139 
4140 	return(error);
4141 }
4142 
4143 /*****************************************************************//**
4144 FTS Query free result, returned by fts_query(). */
4145 void
fts_query_free_result(fts_result_t * result)4146 fts_query_free_result(
4147 /*==================*/
4148 	fts_result_t*	result)		/*!< in: result instance to free.*/
4149 {
4150 	if (result) {
4151 		if (result->rankings_by_id != NULL) {
4152 			rbt_free(result->rankings_by_id);
4153 			result->rankings_by_id = NULL;
4154 		}
4155 		if (result->rankings_by_rank != NULL) {
4156 			rbt_free(result->rankings_by_rank);
4157 			result->rankings_by_rank = NULL;
4158 		}
4159 
4160 		ut_free(result);
4161 		result = NULL;
4162 	}
4163 }
4164 
4165 /*****************************************************************//**
4166 FTS Query sort result, returned by fts_query() on fts_ranking_t::rank. */
4167 void
fts_query_sort_result_on_rank(fts_result_t * result)4168 fts_query_sort_result_on_rank(
4169 /*==========================*/
4170 	fts_result_t*	result)		/*!< out: result instance to sort.*/
4171 {
4172 	const ib_rbt_node_t*	node;
4173 	ib_rbt_t*		ranked;
4174 
4175 	ut_a(result->rankings_by_id != NULL);
4176 	if (result->rankings_by_rank) {
4177 		rbt_free(result->rankings_by_rank);
4178 	}
4179 
4180 	ranked = rbt_create(sizeof(fts_ranking_t), fts_query_compare_rank);
4181 
4182 	/* We need to free any instances of fts_doc_freq_t that we
4183 	may have allocated. */
4184 	for (node = rbt_first(result->rankings_by_id);
4185 	     node;
4186 	     node = rbt_next(result->rankings_by_id, node)) {
4187 
4188 		fts_ranking_t*	ranking;
4189 
4190 		ranking = rbt_value(fts_ranking_t, node);
4191 
4192 		ut_a(ranking->words == NULL);
4193 
4194 		rbt_insert(ranked, ranking, ranking);
4195 	}
4196 
4197 	/* Reset the current node too. */
4198 	result->current = NULL;
4199 	result->rankings_by_rank = ranked;
4200 }
4201 
4202 /*******************************************************************//**
4203 A debug function to print result doc_id set. */
4204 static
4205 void
fts_print_doc_id(fts_query_t * query)4206 fts_print_doc_id(
4207 /*=============*/
4208 	fts_query_t*	query)	/*!< in : tree that stores doc_ids.*/
4209 {
4210 	const ib_rbt_node_t*	node;
4211 
4212 	/* Iterate each member of the doc_id set */
4213 	for (node = rbt_first(query->doc_ids);
4214 	     node;
4215 	     node = rbt_next(query->doc_ids, node)) {
4216 		fts_ranking_t*	ranking;
4217 		ranking = rbt_value(fts_ranking_t, node);
4218 
4219 		ib::info() << "doc_ids info, doc_id: " << ranking->doc_id;
4220 
4221 		ulint		pos = 0;
4222 		fts_string_t	word;
4223 
4224 		while (fts_ranking_words_get_next(query, ranking, &pos, &word)) {
4225 			ib::info() << "doc_ids info, value: " << word.f_str;
4226 		}
4227 	}
4228 }
4229 
4230 /*************************************************************//**
4231 This function implements a simple "blind" query expansion search:
4232 words in documents found in the first search pass will be used as
4233 search arguments to search the document again, thus "expand"
4234 the search result set.
4235 @return DB_SUCCESS if success, otherwise the error code */
4236 static MY_ATTRIBUTE((nonnull, warn_unused_result))
4237 dberr_t
fts_expand_query(dict_index_t * index,fts_query_t * query)4238 fts_expand_query(
4239 /*=============*/
4240 	dict_index_t*	index,		/*!< in: FTS index to search */
4241 	fts_query_t*	query)		/*!< in: FTS query instance */
4242 {
4243 	const ib_rbt_node_t*	node;
4244 	const ib_rbt_node_t*	token_node;
4245 	fts_doc_t		result_doc;
4246 	dberr_t			error = DB_SUCCESS;
4247 	const fts_index_cache_t*index_cache;
4248 
4249 	/* If no doc is found in first search pass, return */
4250 	if (!rbt_size(query->doc_ids)) {
4251 		return(error);
4252 	}
4253 
4254 	/* Init "result_doc", to hold words from the first search pass */
4255 	fts_doc_init(&result_doc);
4256 
4257 	rw_lock_x_lock(&index->table->fts->cache->lock);
4258 	index_cache = fts_find_index_cache(index->table->fts->cache, index);
4259 	rw_lock_x_unlock(&index->table->fts->cache->lock);
4260 
4261 	ut_a(index_cache);
4262 
4263 	result_doc.tokens = rbt_create_arg_cmp(
4264 		sizeof(fts_token_t), innobase_fts_text_cmp,
4265 		(void*) index_cache->charset);
4266 
4267 	result_doc.charset = index_cache->charset;
4268 	result_doc.parser = index_cache->index->parser;
4269 
4270 	query->total_size += SIZEOF_RBT_CREATE;
4271 
4272 	if (UNIV_UNLIKELY(fts_enable_diag_print)) {
4273 		fts_print_doc_id(query);
4274 	}
4275 
4276 	for (node = rbt_first(query->doc_ids);
4277 	     node;
4278 	     node = rbt_next(query->doc_ids, node)) {
4279 
4280 		fts_ranking_t*	ranking;
4281 		ulint		prev_token_size;
4282 		ulint		estimate_size;
4283 
4284 		prev_token_size = rbt_size(result_doc.tokens);
4285 
4286 		ranking = rbt_value(fts_ranking_t, node);
4287 
4288 		/* Fetch the documents with the doc_id from the
4289 		result of first seach pass. Since we do not
4290 		store document-to-word mapping, we need to
4291 		fetch the original document and parse them.
4292 		Future optimization could be done here if we
4293 		support some forms of document-to-word mapping */
4294 		fts_doc_fetch_by_doc_id(NULL, ranking->doc_id, index,
4295 					FTS_FETCH_DOC_BY_ID_EQUAL,
4296 					fts_query_expansion_fetch_doc,
4297 					&result_doc);
4298 
4299 		/* Estimate memory used, see fts_process_token and fts_token_t.
4300 		   We ignore token size here. */
4301 		estimate_size = (rbt_size(result_doc.tokens) - prev_token_size)
4302 			* (SIZEOF_RBT_NODE_ADD + sizeof(fts_token_t)
4303 			+ sizeof(ib_vector_t) + sizeof(ulint) * 32);
4304 		query->total_size += estimate_size;
4305 
4306 		if (query->total_size > fts_result_cache_limit) {
4307 			error = DB_FTS_EXCEED_RESULT_CACHE_LIMIT;
4308 			goto	func_exit;
4309 		}
4310 	}
4311 
4312 	/* Remove words that have already been searched in the first pass */
4313 	for (ulint i = 0; i < query->word_vector->size(); i++) {
4314 		fts_string_t	word = query->word_vector->at(i);
4315 		ib_rbt_bound_t	parent;
4316 
4317 		if (query->wildcard_words
4318 		    && rbt_search(query->wildcard_words, &parent, &word) == 0) {
4319 			/* If it's a wildcard word, remove words having
4320 			it as prefix. */
4321 			while (rbt_search_cmp(result_doc.tokens,
4322 					      &parent, &word, NULL,
4323 					      innobase_fts_text_cmp_prefix)
4324 			       == 0) {
4325 				ut_free(rbt_remove_node(result_doc.tokens,
4326 							parent.last));
4327 			}
4328 		} else {
4329 			/* We don't check return value, because the word may
4330 			have been deleted by a previous wildcard word as its
4331 			prefix, e.g. ('g * good'). */
4332 			rbt_delete(result_doc.tokens, &word);
4333 		}
4334 	}
4335 
4336 	/* Search the table the second time with expanded search list */
4337 	for (token_node = rbt_first(result_doc.tokens);
4338 	     token_node;
4339 	     token_node = rbt_next(result_doc.tokens, token_node)) {
4340 		fts_token_t*	mytoken;
4341 		mytoken = rbt_value(fts_token_t, token_node);
4342 
4343 		/* '%' in the end is treated as prefix search,
4344 		it can cause assert failure, so we skip it. */
4345 		if (mytoken->text.f_str[mytoken->text.f_len - 1] == '%') {
4346 			continue;
4347 		}
4348 
4349 		ut_ad(mytoken->text.f_str[mytoken->text.f_len] == 0);
4350 		fts_query_add_word_freq(query, &mytoken->text);
4351 		error = fts_query_union(query, &mytoken->text);
4352 
4353 		if (error != DB_SUCCESS) {
4354 			break;
4355 		}
4356 	}
4357 
4358 func_exit:
4359 	fts_doc_free(&result_doc);
4360 
4361 	return(error);
4362 }
4363 /*************************************************************//**
4364 This function finds documents that contain all words in a
4365 phrase or proximity search. And if proximity search, verify
4366 the words are close enough to each other, as in specified distance.
4367 This function is called for phrase and proximity search.
4368 @return TRUE if documents are found, FALSE if otherwise */
4369 static
4370 ibool
fts_phrase_or_proximity_search(fts_query_t * query,ib_vector_t * tokens)4371 fts_phrase_or_proximity_search(
4372 /*===========================*/
4373 	fts_query_t*	query,		/*!< in/out:  query instance.
4374 					query->doc_ids might be instantiated
4375 					with qualified doc IDs */
4376 	ib_vector_t*	tokens)		/*!< in: Tokens contain words */
4377 {
4378 	ulint		n_matched;
4379 	ulint		i;
4380 	ibool		matched = FALSE;
4381 	ulint		num_token = ib_vector_size(tokens);
4382 	fts_match_t*	match[MAX_PROXIMITY_ITEM];
4383 	ibool		end_list = FALSE;
4384 
4385 	/* Number of matched documents for the first token */
4386 	n_matched = ib_vector_size(query->match_array[0]);
4387 
4388 	/* We have a set of match list for each word, we shall
4389 	walk through the list and find common documents that
4390 	contain all the matching words. */
4391 	for (i = 0; i < n_matched; i++) {
4392 		ulint		j;
4393 		ulint		k = 0;
4394 		fts_proximity_t	qualified_pos;
4395 
4396 		match[0] = static_cast<fts_match_t*>(
4397 			ib_vector_get(query->match_array[0], i));
4398 
4399 		/* For remaining match list for the token(word), we
4400 		try to see if there is a document with the same
4401 		doc id */
4402 		for (j = 1; j < num_token; j++) {
4403 			match[j] = static_cast<fts_match_t*>(
4404 				ib_vector_get(query->match_array[j], k));
4405 
4406 			while (match[j]->doc_id < match[0]->doc_id
4407 			       && k < ib_vector_size(query->match_array[j])) {
4408 				 match[j] = static_cast<fts_match_t*>(
4409 					ib_vector_get(
4410 						query->match_array[j], k));
4411 				k++;
4412 			}
4413 
4414 			if (match[j]->doc_id > match[0]->doc_id) {
4415 				/* no match */
4416 				if (query->flags & FTS_PHRASE) {
4417 					match[0]->doc_id = 0;
4418 				}
4419 				break;
4420 			}
4421 
4422 			if (k == ib_vector_size(query->match_array[j])) {
4423 				end_list = TRUE;
4424 
4425 				if (query->flags & FTS_PHRASE) {
4426 					ulint	s;
4427 					/* Since i is the last doc id in the
4428 					match_array[j], remove all doc ids > i
4429 					from the match_array[0]. */
4430 					fts_match_t*    match_temp;
4431 					for (s = i + 1; s < n_matched; s++) {
4432 						match_temp = static_cast<
4433 						fts_match_t*>(ib_vector_get(
4434 						query->match_array[0], s));
4435 						match_temp->doc_id = 0;
4436 					}
4437 
4438 					if (match[j]->doc_id !=
4439 						match[0]->doc_id) {
4440 						/* no match */
4441 						match[0]->doc_id = 0;
4442 					}
4443 				}
4444 
4445 				if (match[j]->doc_id != match[0]->doc_id) {
4446 					goto func_exit;
4447 				}
4448 			}
4449 
4450 			/* FIXME: A better solution will be a counter array
4451 			remember each run's last position. So we don't
4452 			reset it here very time */
4453 			k = 0;
4454 		}
4455 
4456 		if (j != num_token) {
4457 			continue;
4458 		}
4459 
4460 		/* For this matching doc, we need to further
4461 		verify whether the words in the doc are close
4462 		to each other, and within the distance specified
4463 		in the proximity search */
4464 		if (query->flags & FTS_PHRASE) {
4465 			matched = TRUE;
4466 		} else if (fts_proximity_get_positions(
4467 			match, num_token, ULINT_MAX, &qualified_pos)) {
4468 
4469 			/* Fetch the original documents and count the
4470 			words in between matching words to see that is in
4471 			specified distance */
4472 			if (fts_query_is_in_proximity_range(
4473 				query, match, &qualified_pos)) {
4474 				/* If so, mark we find a matching doc */
4475 				query->error = fts_query_process_doc_id(
4476 					query, match[0]->doc_id, 0);
4477 				if (query->error != DB_SUCCESS) {
4478 					matched = FALSE;
4479 					goto func_exit;
4480 				}
4481 
4482 				matched = TRUE;
4483 				for (ulint z = 0; z < num_token; z++) {
4484 					fts_string_t*	token;
4485 					token = static_cast<fts_string_t*>(
4486 						ib_vector_get(tokens, z));
4487 					fts_query_add_word_to_document(
4488 						query, match[0]->doc_id, token);
4489 				}
4490 			}
4491 		}
4492 
4493 		if (end_list) {
4494 			break;
4495 		}
4496 	}
4497 
4498 func_exit:
4499 	return(matched);
4500 }
4501 
4502 /*************************************************************//**
4503 This function checks whether words in result documents are close to
4504 each other (within proximity range as specified by "distance").
4505 If "distance" is MAX_ULINT, then it will find all combinations of
4506 positions of matching words and store min and max positions
4507 in the "qualified_pos" for later verification.
4508 @return true if words are close to each other, false if otherwise */
4509 static
4510 bool
fts_proximity_get_positions(fts_match_t ** match,ulint num_match,ulint distance,fts_proximity_t * qualified_pos)4511 fts_proximity_get_positions(
4512 /*========================*/
4513 	fts_match_t**		match,		/*!< in: query instance */
4514 	ulint			num_match,	/*!< in: number of matching
4515 						items */
4516 	ulint			distance,	/*!< in: distance value
4517 						for proximity search */
4518 	fts_proximity_t*	qualified_pos)	/*!< out: the position info
4519 						records ranges containing
4520 						all matching words. */
4521 {
4522 	ulint	i;
4523 	ulint	idx[MAX_PROXIMITY_ITEM];
4524 	ulint	num_pos[MAX_PROXIMITY_ITEM];
4525 	ulint	min_idx;
4526 
4527 	qualified_pos->n_pos = 0;
4528 
4529 	ut_a(num_match <= MAX_PROXIMITY_ITEM);
4530 
4531 	/* Each word could appear multiple times in a doc. So
4532 	we need to walk through each word's position list, and find
4533 	closest distance between different words to see if
4534 	they are in the proximity distance. */
4535 
4536 	/* Assume each word's position list is sorted, we
4537 	will just do a walk through to all words' lists
4538 	similar to a the merge phase of a merge sort */
4539 	for (i = 0; i < num_match; i++) {
4540 		/* idx is the current position we are checking
4541 		for a particular word */
4542 		idx[i] = 0;
4543 
4544 		/* Number of positions for this word */
4545 		num_pos[i] = ib_vector_size(match[i]->positions);
4546 	}
4547 
4548 	/* Start with the first word */
4549 	min_idx = 0;
4550 
4551 	while (idx[min_idx] < num_pos[min_idx]) {
4552 		ulint	position[MAX_PROXIMITY_ITEM];
4553 		ulint	min_pos = ULINT_MAX;
4554 		ulint	max_pos = 0;
4555 
4556 		/* Check positions in each word position list, and
4557 		record the max/min position */
4558 		for (i = 0; i < num_match; i++) {
4559 			position[i] = *(ulint*) ib_vector_get_const(
4560 				match[i]->positions, idx[i]);
4561 
4562 			if (position[i] == ULINT_UNDEFINED) {
4563 				break;
4564 			}
4565 
4566 			if (position[i] < min_pos) {
4567 				min_pos = position[i];
4568 				min_idx = i;
4569 			}
4570 
4571 			if (position[i] > max_pos) {
4572 				max_pos = position[i];
4573 			}
4574 		}
4575 
4576 		/* If max and min position are within range, we
4577 		find a good match */
4578 		if (max_pos - min_pos <= distance
4579 		    && (i >= num_match || position[i] != ULINT_UNDEFINED)) {
4580 			/* The charset has variable character
4581 			length encoding, record the min_pos and
4582 			max_pos, we will need to verify the actual
4583 			number of characters */
4584 			qualified_pos->min_pos.push_back(min_pos);
4585 			qualified_pos->max_pos.push_back(max_pos);
4586 			qualified_pos->n_pos++;
4587 		}
4588 
4589 		/* Otherwise, move to the next position is the
4590 		list for the word with the smallest position */
4591 		idx[min_idx]++;
4592 	}
4593 
4594 	return(qualified_pos->n_pos != 0);
4595 }
4596