1 /*****************************************************************************
2 
3 Copyright (c) 2007, 2020, Oracle and/or its affiliates. All Rights Reserved.
4 
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License, version 2.0,
7 as published by the Free Software Foundation.
8 
9 This program is also distributed with certain software (including
10 but not limited to OpenSSL) that is licensed under separate terms,
11 as designated in a particular file or component or in included license
12 documentation.  The authors of MySQL hereby grant you an additional
13 permission to link the program and your derivative works with the
14 separately licensed software that they have included with MySQL.
15 
16 This program is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19 GNU General Public License, version 2.0, for more details.
20 
21 You should have received a copy of the GNU General Public License along with
22 this program; if not, write to the Free Software Foundation, Inc.,
23 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
24 
25 *****************************************************************************/
26 
27 /**************************************************//**
28 @file fts/fts0que.cc
29 Full Text Search functionality.
30 
31 Created 2007/03/27 Sunny Bains
32 Completed 2011/7/10 Sunny and Jimmy Yang
33 *******************************************************/
34 
35 #include "dict0dict.h" /* dict_table_get_n_rows() */
36 #include "ut0rbt.h"
37 #include "row0sel.h"
38 #include "fts0fts.h"
39 #include "fts0priv.h"
40 #include "fts0ast.h"
41 #include "fts0pars.h"
42 #include "fts0types.h"
43 #include "ha_prototypes.h"
44 #include <ctype.h>
45 
46 #ifndef UNIV_NONINL
47 #include "fts0types.ic"
48 #include "fts0vlc.ic"
49 #endif
50 
51 #include <vector>
52 
53 #define FTS_ELEM(t, n, i, j) (t[(i) * n + (j)])
54 
55 #define RANK_DOWNGRADE		(-1.0F)
56 #define RANK_UPGRADE		(1.0F)
57 
58 /* Maximum number of words supported in a phrase or proximity search. */
59 #define MAX_PROXIMITY_ITEM	128
60 
61 /* Memory used by rbt itself for create and node add */
62 #define SIZEOF_RBT_CREATE	sizeof(ib_rbt_t) + sizeof(ib_rbt_node_t) * 2
63 #define SIZEOF_RBT_NODE_ADD	sizeof(ib_rbt_node_t)
64 
65 /* Initial byte length for 'words' in fts_ranking_t */
66 #define RANKING_WORDS_INIT_LEN	4
67 
68 // FIXME: Need to have a generic iterator that traverses the ilist.
69 
70 typedef std::vector<fts_string_t>	word_vector_t;
71 
72 struct fts_word_freq_t;
73 
74 /** State of an FTS query. */
75 struct fts_query_t {
76 	mem_heap_t*	heap;		/*!< Heap to use for allocations */
77 
78 	trx_t*		trx;		/*!< The query transaction */
79 
80 	dict_index_t*	index;		/*!< The FTS index to search */
81 					/*!< FTS auxiliary common table def */
82 	fts_table_t	fts_common_table;
83 
84 	fts_table_t	fts_index_table;/*!< FTS auxiliary index table def */
85 
86 	ulint		total_size;	/*!< total memory size used by query */
87 
88 	fts_doc_ids_t*	deleted;	/*!< Deleted doc ids that need to be
89 					filtered from the output */
90 
91 	fts_ast_node_t*	root;		/*!< Abstract syntax tree */
92 
93 	fts_ast_node_t* cur_node;	/*!< Current tree node */
94 
95 	ib_rbt_t*	word_map;	/*!< Matched word map for
96 					searching by word*/
97 
98 	word_vector_t*	word_vector;	/*!< Matched word vector for
99 					searching by index */
100 
101 	ib_rbt_t*       doc_ids;	/*!< The current set of matching
102 					doc ids, elements are of
103 					type fts_ranking_t */
104 
105 	ib_rbt_t*	intersection;	/*!< The doc ids that were found in
106 					doc_ids, this tree will become
107 					the new doc_ids, elements are of type
108 					fts_ranking_t */
109 
110 					/*!< Prepared statement to read the
111 					nodes from the FTS INDEX */
112 	que_t*		read_nodes_graph;
113 
114 	fts_ast_oper_t	oper;		/*!< Current boolean mode operator */
115 
116 					/*!< TRUE if we want to collect the
117 					word positions within the document */
118 	ibool		collect_positions;
119 
120 	ulint		flags;		/*!< Specify the full text search type,
121 					such as  boolean search, phrase
122 					search, proximity search etc. */
123 
124 	ulint		distance;	/*!< The proximity distance of a
125 					phrase search. */
126 
127 					/*!< These doc ids are used as a
128 					boundary condition when searching the
129 					FTS index rows */
130 
131 	doc_id_t	lower_doc_id;	/*!< Lowest doc id in doc_ids */
132 
133 	doc_id_t	upper_doc_id;	/*!< Highest doc id in doc_ids */
134 
135 	bool		boolean_mode;	/*!< TRUE if boolean mode query */
136 
137 	ib_vector_t*	matched;	/*!< Array of matching documents
138 					(fts_match_t) to search for a phrase */
139 
140 	ib_vector_t**	match_array;	/*!< Used for proximity search, contains
141 					position info for each matched word
142 					in the word list */
143 
144 	ib_uint64_t	total_docs;	/*!< The total number of documents */
145 
146 	ulint		total_words;	/*!< The total number of words */
147 
148 	dberr_t		error;		/*!< Error code if any, that is
149 					encountered during query processing */
150 
151 	ib_rbt_t*	word_freqs;	/*!< RB tree of word frequencies per
152 					document, its elements are of type
153 					fts_word_freq_t */
154 
155 	bool		multi_exist;	/*!< multiple FTS_EXIST oper */
156 	ulint		nested_exp_count; /*!< number of nested sub expression
157 					limit */
158 };
159 
160 /** For phrase matching, first we collect the documents and the positions
161 then we match. */
162 struct fts_match_t {
163 	doc_id_t	doc_id;		/*!< Document id */
164 
165 	ulint		start;		/*!< Start the phrase match from
166 					this offset within the positions
167 					vector. */
168 
169 	ib_vector_t*	positions;	/*!< Offsets of a word in a
170 					document */
171 };
172 
173 /** For matching tokens in a phrase search. We use this data structure in
174 the callback that determines whether a document should be accepted or
175 rejected for a phrase search. */
176 struct fts_select_t {
177 	doc_id_t	doc_id;		/*!< The document id to match */
178 
179 	ulint		min_pos;	/*!< For found to be TRUE at least
180 					one position must be greater than
181 					min_pos. */
182 
183 	ibool		found;		/*!< TRUE if found */
184 
185 	fts_word_freq_t*
186 			word_freq;	/*!< Word frequency instance of the
187 					current word being looked up in
188 					the FTS index */
189 };
190 
191 typedef std::vector<ulint>       pos_vector_t;
192 
193 /** This structure defines a set of ranges over the original documents, each
194 with a minimum and a maximum position. The text in such a range should
195 contain all words of the proximity search. We then count the words in each
196 range to make sure the count does not exceed the distance specified in
197 the proximity search. */
198 struct fts_proximity_t {
199 	ulint		n_pos;		/*!< number of position set, defines
200 					a range (min to max) containing all
201 					matching words */
202 	pos_vector_t	min_pos;	/*!< the minimum position (in bytes)
203 					of the range */
204 	pos_vector_t	max_pos;	/*!< the maximum position (in bytes)
205 					of the range */
206 };
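/* Example (illustrative only): a boolean-mode proximity query such as
'"apple banana" @5' requires that "apple" and "banana" occur within five
words of each other. For every candidate document this struct records the
byte ranges [min_pos[i], max_pos[i]] that cover one occurrence of each
matching word; fts_proximity_is_word_in_range() later counts the words
inside each range against that distance. */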
207 
208 /** The match positions and tokens to match */
209 struct fts_phrase_t {
210 	ibool		found;		/*!< Match result */
211 
212 	const fts_match_t*
213 			match;		/*!< Positions within text */
214 
215 	const ib_vector_t*
216 			tokens;		/*!< Tokens to match */
217 
218 	ulint		distance;	/*!< For matching on proximity
219 					distance. Can be 0 for exact match */
220 	CHARSET_INFO*	charset;	/*!< Phrase match charset */
221 	mem_heap_t*     heap;		/*!< Heap for word processing */
222 	ulint		zip_size;	/*!< row zip size */
223 	fts_proximity_t*proximity_pos;	/*!< position info for proximity
224 					search verification. Records the min
225 					and max position of words matched */
226 };
227 
228 /** For storing the frequency of a word/term in a document */
229 struct fts_doc_freq_t {
230 	doc_id_t	doc_id;		/*!< Document id */
231 	ulint		freq;		/*!< Frequency of a word in a document */
232 };
233 
234 /** To determine the word frequency per document. */
235 struct fts_word_freq_t {
236 	fts_string_t	word;		/*!< Word for which we need the freq,
237 					it's allocated on the query heap */
238 
239 	ib_rbt_t*	doc_freqs;	/*!< RB Tree for storing per document
240 					word frequencies. The elements are
241 					of type fts_doc_freq_t */
242 	ib_uint64_t	doc_count;	/*!< Total number of documents that
243 					contain this word */
244 	double		idf;		/*!< Inverse document frequency */
245 };
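/* Note (an illustrative sketch, not a definition taken from this section):
the idf member above is conventionally derived from the counts this struct
accumulates, along the lines of

	word_freq->idf = log10((double) query->total_docs
			       / (double) word_freq->doc_count);

so the rarer a word is across the collection, the more it contributes to a
document's rank. Treat the exact expression as a sketch of the standard
tf-idf scheme rather than a statement of this module's precise formula. */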
246 
247 /********************************************************************
248 Callback function to fetch the rows in an FTS INDEX record.
249 @return always TRUE */
250 static
251 ibool
252 fts_query_index_fetch_nodes(
253 /*========================*/
254 	void*		row,		/*!< in: sel_node_t* */
255 	void*		user_arg);	/*!< in: pointer to ib_vector_t */
256 
257 /********************************************************************
258 Read and filter nodes.
259 @return DB_SUCCESS if all goes well, otherwise error code */
260 static
261 dberr_t
262 fts_query_filter_doc_ids(
263 /*=====================*/
264 	fts_query_t*		query,		/*!< in: query instance */
265 	const fts_string_t*	word,		/*!< in: the current word */
266 	fts_word_freq_t*	word_freq,	/*!< in/out: word frequency */
267 	const fts_node_t*	node,		/*!< in: current FTS node */
268 	void*			data,		/*!< in: doc id ilist */
269 	ulint			len,		/*!< in: doc id ilist size */
270 	ibool			calc_doc_count);/*!< in: whether to remember doc
271 						count */
272 
273 #if 0
274 /*****************************************************************//***
275 Find a doc_id in a word's ilist.
276 @return TRUE if found. */
277 static
278 ibool
279 fts_query_find_doc_id(
280 /*==================*/
281 	fts_select_t*	select,		/*!< in/out: search the doc id selected,
282 					update the frequency if found. */
283 	void*		data,		/*!< in: doc id ilist */
284 	ulint		len);		/*!< in: doc id ilist size */
285 #endif
286 
287 /*************************************************************//**
288 This function implements a simple "blind" query expansion search:
289 words in documents found in the first search pass will be used as
290 search arguments to search the documents again, thus "expanding"
291 the search result set.
292 @return DB_SUCCESS on success, otherwise the error code */
293 static
294 dberr_t
295 fts_expand_query(
296 /*=============*/
297 	dict_index_t*	index,		/*!< in: FTS index to search */
298 	fts_query_t*	query)		/*!< in: query result, to be freed
299 					by the client */
300 	MY_ATTRIBUTE((nonnull, warn_unused_result));
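/* Illustrative example (not from this file): with query expansion, a first
pass for the single word "database" returns a set of documents; the most
frequent terms found in those documents (say "mysql" and "innodb") are then
appended to the original query and the search is rerun, widening the result
set at the cost of some precision. */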
301 /*************************************************************//**
302 This function finds documents that contain all words in a
303 phrase or proximity search. For a proximity search, it also verifies
304 that the words are close enough to each other, within the specified distance.
305 This function is called for both phrase and proximity search.
306 @return TRUE if documents are found, FALSE otherwise */
307 static
308 ibool
309 fts_phrase_or_proximity_search(
310 /*===========================*/
311 	fts_query_t*	query,		/*!< in/out:  query instance
312 					query->doc_ids might be instantiated
313 					with qualified doc IDs */
314 	ib_vector_t*	tokens);	/*!< in: Tokens contain words */
315 /*************************************************************//**
316 This function checks whether words in result documents are close to
317 each other (within proximity range as specified by "distance").
318 If "distance" is MAX_ULINT, then it will find all combinations of
319 positions of matching words and store min and max positions
320 in the "qualified_pos" for later verification.
321 @return true if words are close to each other, false otherwise */
322 static
323 bool
324 fts_proximity_get_positions(
325 /*========================*/
326 	fts_match_t**		match,		/*!< in: query instance */
327 	ulint			num_match,	/*!< in: number of matching
328 						items */
329 	ulint			distance,	/*!< in: distance value
330 						for proximity search */
331 	fts_proximity_t*	qualified_pos);	/*!< out: the position info
332 						records ranges containing
333 						all matching words. */
334 #if 0
335 /********************************************************************
336 Get the total number of words in a document. */
337 static
338 ulint
339 fts_query_terms_in_document(
340 /*========================*/
341 					/*!< out: DB_SUCCESS if all go well
342 					else error code */
343 	fts_query_t*	query,		/*!< in: FTS query state */
344 	doc_id_t	doc_id,		/*!< in: the word to check */
345 	ulint*		total);		/*!< out: total words in document */
346 #endif
347 
348 /********************************************************************
349 Compare two fts_doc_freq_t doc_ids.
350 @return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
351 UNIV_INLINE
352 int
353 fts_freq_doc_id_cmp(
354 /*================*/
355 	const void*	p1,			/*!< in: id1 */
356 	const void*	p2)			/*!< in: id2 */
357 {
358 	const fts_doc_freq_t*	fq1 = (const fts_doc_freq_t*) p1;
359 	const fts_doc_freq_t*	fq2 = (const fts_doc_freq_t*) p2;
360 
361 	return((int) (fq1->doc_id - fq2->doc_id));
362 }
363 
364 #if 0
365 /*******************************************************************//**
366 Print the table used for calculating LCS. */
367 static
368 void
369 fts_print_lcs_table(
370 /*================*/
371 	const ulint*	table,		/*!< in: array to print */
372 	ulint		n_rows,		/*!< in: total no. of rows */
373 	ulint		n_cols)		/*!< in: total no. of cols */
374 {
375 	ulint		i;
376 
377 	for (i = 0; i < n_rows; ++i) {
378 		ulint	j;
379 
380 		printf("\n");
381 
382 		for (j = 0; j < n_cols; ++j) {
383 
384 			printf("%2lu ", FTS_ELEM(table, n_cols, i, j));
385 		}
386 	}
387 }
388 
389 /********************************************************************
390 Find the longest common subsequence between the query string and
391 the document. */
392 static
393 ulint
394 fts_query_lcs(
395 /*==========*/
396 					/*!< out: LCS (length) between
397 					two ilists */
398 	const	ulint*	p1,		/*!< in: word positions of query */
399 	ulint	len_p1,			/*!< in: no. of elements in p1 */
400 	const	ulint*	p2,		/*!< in: word positions within document */
401 	ulint	len_p2)			/*!< in: no. of elements in p2 */
402 {
403 	int	i;
404 	ulint	len = 0;
405 	ulint	r = len_p1;
406 	ulint	c = len_p2;
407 	ulint	size = (r + 1) * (c + 1) * sizeof(ulint);
408 	ulint*	table = (ulint*) ut_malloc(size);
409 
410 	/* Traverse the table backwards, from the last row to the first and
411 	also from the last column to the first. We compute the smaller
412 	common subsequences first, then use the calculated values to determine
413 	the longest common subsequence. The result will be in TABLE[0][0]. */
414 	for (i = r; i >= 0; --i) {
415 		int	j;
416 
417 		for (j = c; j >= 0; --j) {
418 
419 			if (p1[i] == (ulint) -1 || p2[j] == (ulint) -1) {
420 
421 				FTS_ELEM(table, c, i, j) = 0;
422 
423 			} else if (p1[i] == p2[j]) {
424 
425 				FTS_ELEM(table, c, i, j) = FTS_ELEM(
426 					table, c, i + 1, j + 1) + 1;
427 
428 			} else {
429 
430 				ulint	value;
431 
432 				value = ut_max(
433 					FTS_ELEM(table, c, i + 1, j),
434 					FTS_ELEM(table, c, i, j + 1));
435 
436 				FTS_ELEM(table, c, i, j) = value;
437 			}
438 		}
439 	}
440 
441 	len = FTS_ELEM(table, c, 0, 0);
442 
443 	fts_print_lcs_table(table, r, c);
444 	printf("\nLen=%lu\n", len);
445 
446 	ut_free(table);
447 
448 	return(len);
449 }
450 #endif
451 
452 /*******************************************************************//**
453 Compare two fts_ranking_t instances on their rank value and doc ids, in
454 descending order of rank and ascending order of doc id.
455 @return 0 if p1 == p2, < 0 if p1 <  p2, > 0 if p1 >  p2 */
456 static
457 int
458 fts_query_compare_rank(
459 /*===================*/
460 	const void*	p1,		/*!< in: pointer to elem */
461 	const void*	p2)		/*!< in: pointer to elem */
462 {
463 	const fts_ranking_t*	r1 = (const fts_ranking_t*) p1;
464 	const fts_ranking_t*	r2 = (const fts_ranking_t*) p2;
465 
466 	if (r2->rank < r1->rank) {
467 		return(-1);
468 	} else if (r2->rank == r1->rank) {
469 
470 		if (r1->doc_id < r2->doc_id) {
471 			return(-1);
472 		} else if (r1->doc_id > r2->doc_id) {
473 			return(1);
474 		}
475 
476 		return(0);
477 	}
478 
479 	return(1);
480 }
481 
482 #ifdef FTS_UTF8_DEBUG
483 /*******************************************************************//**
484 Convert string to lowercase.
485 @return lower case string, caller's responsibility to delete using
486 ut_free() */
487 static
488 byte*
489 fts_tolower(
490 /*========*/
491 	const byte*	src,		/*!< in: src string */
492 	ulint		len)		/*!< in: src string length */
493 {
494 	fts_string_t	str;
495 	byte*		lc_str = ut_malloc(len + 1);
496 
497 	str.f_len = len;
498 	str.f_str = lc_str;
499 
500 	memcpy(str.f_str, src, len);
501 
502 	/* Make sure the last byte is NUL terminated */
503 	str.f_str[len] = '\0';
504 
505 	fts_utf8_tolower(&str);
506 
507 	return(lc_str);
508 }
509 
510 /*******************************************************************//**
511 Do a case insensitive search. Doesn't check for a NUL byte end marker;
512 relies only on len. Converts str2 to lower case before comparing.
513 @return 0 if p1 == p2, < 0 if p1 <  p2, > 0 if p1 >  p2 */
514 static
515 int
516 fts_utf8_strcmp(
517 /*============*/
518 	const fts_string_t*
519 			str1,		/*!< in: should be lower case*/
520 
521 	fts_string_t*	str2)		/*!< in: any case. We will use the length
522 					of this string during compare as it
523 					should be the min of the two strings */
524 {
525 	byte		b = str2->f_str[str2->f_len];
526 
527 	ut_a(str2->f_len <= str1->f_len);
528 
529 	/* We need to write a NUL byte at the end of the string because the
530 	string is converted to lowercase by a MySQL function which doesn't
531 	care about the length. */
532 	str2->f_str[str2->f_len] = 0;
533 
534 	fts_utf8_tolower(str2);
535 
536 	/* Restore the value we replaced above. */
537 	str2->f_str[str2->f_len] = b;
538 
539 	return(memcmp(str1->f_str, str2->f_str, str2->f_len));
540 }
541 #endif
542 
543 /*******************************************************************//**
544 Create words in ranking */
545 static
546 void
547 fts_ranking_words_create(
548 /*=====================*/
549 	fts_query_t*	query,		/*!< in: query instance */
550 	fts_ranking_t*	ranking)	/*!< in: ranking instance */
551 {
552 	ranking->words = static_cast<byte*>(
553 		mem_heap_zalloc(query->heap, RANKING_WORDS_INIT_LEN));
554 	ranking->words_len = RANKING_WORDS_INIT_LEN;
555 }
556 
557 /*
558 The optimization here is to use a char array (bitmap) in place of the words
559 rb tree in fts_ranking_t.
560 
561 It can save a lot of memory, except in some cases of QUERY EXPANSION.
562 
563 'word_map' is used as a word dictionary, in which the key is a word and the
564 value is a number. In 'fts_ranking_words_add', we first check whether the word
565 is in 'word_map'. If not, we add it to 'word_map' and assign it a position
566 (actually a number); then we set the bit at that position in the char array
567 'words' to '1'.
568 
569 'word_vector' is a useful backup of 'word_map': it lets us fetch a word by its
570 position more quickly than searching 'word_map' by value. We use 'word_vector'
571 in 'fts_query_calculate_ranking' and 'fts_expand_query'; both need to scan the
572 bitmap 'words' and, for every bit that is '1', look up the word and its word_freq.
573 */
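/* Illustrative sketch only (left disabled, like the other #if 0 blocks in
this file): for a query containing "apple", "banana" and "cherry" the words
get positions 0, 1 and 2 in 'word_map'/'word_vector'; a document matching
"apple" and "cherry" then carries the bitmap 0b00000101 in ranking->words.
The helper below is hypothetical and only demonstrates the byte/bit mapping
used by fts_ranking_words_add() and fts_ranking_words_get_next(). */
#if 0
static
bool
fts_ranking_words_bit_is_set(
/*=========================*/
	const fts_ranking_t*	ranking,	/*!< in: ranking instance */
	ulint			pos)		/*!< in: word position */
{
	ulint	byte_offset = pos / CHAR_BIT;
	ulint	bit_offset = pos % CHAR_BIT;

	if (byte_offset >= ranking->words_len) {
		return(false);
	}

	return((ranking->words[byte_offset] & (1 << bit_offset)) != 0);
}
#endif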
574 
575 /*******************************************************************//**
576 Add a word into ranking */
577 static
578 void
579 fts_ranking_words_add(
580 /*==================*/
581 	fts_query_t*		query,		/*!< in: query instance */
582 	fts_ranking_t*		ranking,	/*!< in: ranking instance */
583 	const fts_string_t*	word)		/*!< in: term/word to add */
584 {
585 	ulint	pos;
586 	ulint	byte_offset;
587 	ulint	bit_offset;
588 	ib_rbt_bound_t	parent;
589 
590 	/* Note: we suppose the word map and vector are append-only. */
591 	ut_ad(query->word_vector->size() == rbt_size(query->word_map));
592 
593 	/* We use ib_rbt to simulate a map, f_n_char means position. */
594 	if (rbt_search(query->word_map, &parent, word) == 0) {
595 		fts_string_t*	result_word;
596 
597 		result_word = rbt_value(fts_string_t, parent.last);
598 		pos = result_word->f_n_char;
599 		ut_ad(pos < rbt_size(query->word_map));
600 	} else {
601 		/* Add the word to map. */
602 		fts_string_t	new_word;
603 
604 		pos = rbt_size(query->word_map);
605 
606 		new_word.f_str = static_cast<byte*>(mem_heap_alloc(query->heap,
607 			word->f_len + 1));
608 		memcpy(new_word.f_str, word->f_str, word->f_len);
609 		new_word.f_str[word->f_len] = 0;
610 		new_word.f_len = word->f_len;
611 		new_word.f_n_char = pos;
612 
613 		rbt_add_node(query->word_map, &parent, &new_word);
614 		ut_ad(rbt_validate(query->word_map));
615 		query->word_vector->push_back(new_word);
616 	}
617 
618 	/* Check words len */
619 	byte_offset = pos / CHAR_BIT;
620 	if (byte_offset >= ranking->words_len) {
621 		byte*	words = ranking->words;
622 		ulint	words_len = ranking->words_len;
623 
624 		while (byte_offset >= words_len) {
625 			words_len *= 2;
626 		}
627 
628 		ranking->words = static_cast<byte*>(
629 			mem_heap_zalloc(query->heap, words_len));
630 		ut_memcpy(ranking->words, words, ranking->words_len);
631 		ranking->words_len = words_len;
632 	}
633 
634 	/* Set ranking words */
635 	ut_ad(byte_offset < ranking->words_len);
636 	bit_offset = pos % CHAR_BIT;
637 	ranking->words[byte_offset] |= 1 << bit_offset;
638 }
639 
640 /*******************************************************************//**
641 Get a word from a ranking
642 @return true if it's successful */
643 static
644 bool
645 fts_ranking_words_get_next(
646 /*=======================*/
647 	const	fts_query_t*	query,	/*!< in: query instance */
648 	fts_ranking_t*		ranking,/*!< in: ranking instance */
649 	ulint*			pos,	/*!< in/out: word start pos */
650 	fts_string_t*		word)	/*!< in/out: term/word to add */
651 {
652 	bool	ret = false;
653 	ulint	max_pos = ranking->words_len * CHAR_BIT;
654 
655 	/* Search for next word */
656 	while (*pos < max_pos) {
657 		ulint	byte_offset = *pos / CHAR_BIT;
658 		ulint	bit_offset = *pos % CHAR_BIT;
659 
660 		if (ranking->words[byte_offset] & (1 << bit_offset)) {
661 			ret = true;
662 			break;
663 		}
664 
665 		*pos += 1;
666 	};
667 
668 	/* Get next word from word vector */
669 	if (ret) {
670 		ut_ad(*pos < query->word_vector->size());
671 		*word = query->word_vector->at((size_t)*pos);
672 		*pos += 1;
673 	}
674 
675 	return ret;
676 }
677 
678 /*******************************************************************//**
679 Add a word, if it doesn't already exist, to the term freq RB tree. A copy
680 of the word is made on the query heap.
681 @return pointer to the word's frequency instance */
682 static
683 fts_word_freq_t*
684 fts_query_add_word_freq(
685 /*====================*/
686 	fts_query_t*		query,		/*!< in: query instance */
687 	const fts_string_t*	word)		/*!< in: term/word to add */
688 {
689 	ib_rbt_bound_t		parent;
690 
691 	/* Lookup the word in our rb tree and add if it doesn't exist. */
692 	if (rbt_search(query->word_freqs, &parent, word) != 0) {
693 		fts_word_freq_t	word_freq;
694 
695 		memset(&word_freq, 0, sizeof(word_freq));
696 
697 		word_freq.word.f_str = static_cast<byte*>(
698 			mem_heap_alloc(query->heap, word->f_len + 1));
699 		memcpy(word_freq.word.f_str, word->f_str, word->f_len);
700 		word_freq.word.f_str[word->f_len] = 0;
701 		word_freq.word.f_len = word->f_len;
702 
703 		word_freq.doc_count = 0;
704 
705 		word_freq.doc_freqs = rbt_create(
706 			sizeof(fts_doc_freq_t), fts_freq_doc_id_cmp);
707 
708 		parent.last = rbt_add_node(
709 			query->word_freqs, &parent, &word_freq);
710 
711 		query->total_size += word->f_len
712 			+ SIZEOF_RBT_CREATE
713 			+ SIZEOF_RBT_NODE_ADD
714 			+ sizeof(fts_word_freq_t);
715 	}
716 
717 	return(rbt_value(fts_word_freq_t, parent.last));
718 }
719 
720 /*******************************************************************//**
721 Add a doc id, if it doesn't already exist, to the doc freq RB tree.
722 @return pointer to the doc's frequency instance */
723 static
724 fts_doc_freq_t*
725 fts_query_add_doc_freq(
726 /*===================*/
727 	fts_query_t*	query,		/*!< in: query instance	*/
728 	ib_rbt_t*	doc_freqs,	/*!< in: rb tree of fts_doc_freq_t */
729 	doc_id_t	doc_id)		/*!< in: doc id to add */
730 {
731 	ib_rbt_bound_t	parent;
732 
733 	/* Lookup the doc id in our rb tree and add if it doesn't exist. */
734 	if (rbt_search(doc_freqs, &parent, &doc_id) != 0) {
735 		fts_doc_freq_t	doc_freq;
736 
737 		memset(&doc_freq, 0, sizeof(doc_freq));
738 
739 		doc_freq.freq = 0;
740 		doc_freq.doc_id = doc_id;
741 
742 		parent.last = rbt_add_node(doc_freqs, &parent, &doc_freq);
743 
744 		query->total_size += SIZEOF_RBT_NODE_ADD
745 			+ sizeof(fts_doc_freq_t);
746 	}
747 
748 	return(rbt_value(fts_doc_freq_t, parent.last));
749 }
750 
751 /*******************************************************************//**
752 Add the doc id to the query set only if it's not in the
753 deleted array. */
754 static
755 void
756 fts_query_union_doc_id(
757 /*===================*/
758 	fts_query_t*	query,		/*!< in: query instance */
759 	doc_id_t	doc_id,		/*!< in: the doc id to add */
760 	fts_rank_t	rank)		/*!< in: if non-zero, it is the
761 					rank associated with the doc_id */
762 {
763 	ib_rbt_bound_t	parent;
764 	ulint		size = ib_vector_size(query->deleted->doc_ids);
765 	fts_update_t*	array = (fts_update_t*) query->deleted->doc_ids->data;
766 
767 	/* Check that the doc id is not deleted and is not already in our set. */
768 	if (fts_bsearch(array, 0, static_cast<int>(size), doc_id) < 0
769 	    && rbt_search(query->doc_ids, &parent, &doc_id) != 0) {
770 
771 		fts_ranking_t	ranking;
772 
773 		ranking.rank = rank;
774 		ranking.doc_id = doc_id;
775 		fts_ranking_words_create(query, &ranking);
776 
777 		rbt_add_node(query->doc_ids, &parent, &ranking);
778 
779 		query->total_size += SIZEOF_RBT_NODE_ADD
780 			+ sizeof(fts_ranking_t) + RANKING_WORDS_INIT_LEN;
781 	}
782 }
783 
784 /*******************************************************************//**
785 Remove the doc id from the query set only if it's not in the
786 deleted set. */
787 static
788 void
789 fts_query_remove_doc_id(
790 /*====================*/
791 	fts_query_t*	query,		/*!< in: query instance */
792 	doc_id_t	doc_id)		/*!< in: the doc id to remove */
793 {
794 	ib_rbt_bound_t	parent;
795 	ulint		size = ib_vector_size(query->deleted->doc_ids);
796 	fts_update_t*	array = (fts_update_t*) query->deleted->doc_ids->data;
797 
798 	/* Check that the doc id is not deleted and is in our set. */
799 	if (fts_bsearch(array, 0, static_cast<int>(size), doc_id) < 0
800 	    && rbt_search(query->doc_ids, &parent, &doc_id) == 0) {
801 		ut_free(rbt_remove_node(query->doc_ids, parent.last));
802 
803 		ut_ad(query->total_size >=
804 		      SIZEOF_RBT_NODE_ADD + sizeof(fts_ranking_t));
805 		query->total_size -= SIZEOF_RBT_NODE_ADD
806 			+ sizeof(fts_ranking_t);
807 	}
808 }
809 
810 /*******************************************************************//**
811 Find the doc id in the query set (but not in the deleted set) and artificially
812 downgrade or upgrade its ranking by a fixed value, possibly pushing the ranking
813 below or above its normal range of 0 to 1. This is used for boolean search
814 operators such as the negation operator, which makes a word's contribution to
815 the row's relevance negative. */
816 static
817 void
818 fts_query_change_ranking(
819 /*====================*/
820 	fts_query_t*	query,		/*!< in: query instance */
821 	doc_id_t	doc_id,		/*!< in: the doc id whose rank to change */
822 	ibool		downgrade)	/*!< in: Whether to downgrade ranking */
823 {
824 	ib_rbt_bound_t	parent;
825 	ulint		size = ib_vector_size(query->deleted->doc_ids);
826 	fts_update_t*	array = (fts_update_t*) query->deleted->doc_ids->data;
827 
828 	/* Check that the doc id is not deleted and is in our set. */
829 	if (fts_bsearch(array, 0, static_cast<int>(size), doc_id) < 0
830 	    && rbt_search(query->doc_ids, &parent, &doc_id) == 0) {
831 
832 		fts_ranking_t*	ranking;
833 
834 		ranking = rbt_value(fts_ranking_t, parent.last);
835 
836 		ranking->rank += downgrade ? RANK_DOWNGRADE : RANK_UPGRADE;
837 
838 		/* Clamp the rank to [-1.0, 1.0] after adjusting it by
839 		RANK_DOWNGRADE (-1.0) or RANK_UPGRADE (+1.0). */
840 		if (ranking->rank >= 1.0F) {
841 			ranking->rank = 1.0F;
842 		} else if (ranking->rank <= -1.0F) {
843 			ranking->rank = -1.0F;
844 		}
845 	}
846 }
847 
848 /*******************************************************************//**
849 Check the doc id in the query set only if it's not in the
850 deleted array. The doc ids that were found are stored in
851 another rb tree (fts_query_t::intersection). */
852 static
853 void
854 fts_query_intersect_doc_id(
855 /*=======================*/
856 	fts_query_t*	query,		/*!< in: query instance */
857 	doc_id_t	doc_id,		/*!< in: the doc id to add */
858 	fts_rank_t	rank)		/*!< in: if non-zero, it is the
859 					rank associated with the doc_id */
860 {
861 	ib_rbt_bound_t	parent;
862 	ulint		size = ib_vector_size(query->deleted->doc_ids);
863 	fts_update_t*	array = (fts_update_t*) query->deleted->doc_ids->data;
864 	fts_ranking_t*	ranking= NULL;
865 
866 	/* There are three types of intersect:
867 	   1. '+a': doc_ids is empty; add a doc to the intersection if it matches 'a'.
868 	   2. 'a +b': docs matching 'a' are in doc_ids; add a doc to the intersection
869 	      if it matches 'b'. If the doc is also in doc_ids, then merge the
870 	      doc's rank and add 'a' to the doc's words.
871 	   3. '+a +b': docs matching '+a' are in doc_ids; add a doc to the
872 	      intersection only if it matches 'b' and is in doc_ids (multi_exist = true). */
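	/* Worked example (illustrative only): if doc_ids currently holds
	{1, 3, 5} from matching 'a' and we now process '+b' which matches
	{3, 4}, case 2 applies: doc 3 is copied into query->intersection with
	its rank merged and its words taken over, doc 4 is added as a fresh
	entry (multi_exist is false), and docs 1 and 5 are dropped when the
	intersection later replaces doc_ids in fts_query_intersect(). */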
873 
874 	/* Check that the doc id is not deleted */
875 	if (fts_bsearch(array, 0, static_cast<int>(size), doc_id) < 0) {
876 		fts_ranking_t	new_ranking;
877 
878 		if (rbt_search(query->doc_ids, &parent, &doc_id) != 0) {
879 			if (query->multi_exist) {
880 				return;
881 			} else {
882 				new_ranking.words = NULL;
883 			}
884 		} else {
885 			ranking = rbt_value(fts_ranking_t, parent.last);
886 
887 			/* We've just checked the doc id before */
888 			if (ranking->words == NULL) {
889 				ut_ad(rbt_search(query->intersection, &parent,
890 					ranking) == 0);
891 				return;
892 			}
893 
894 			/* Merge rank */
895 			rank += ranking->rank;
896 			if (rank >= 1.0F) {
897 				rank = 1.0F;
898 			} else if (rank <= -1.0F) {
899 				rank = -1.0F;
900 			}
901 
902 			/* Take words */
903 			new_ranking.words = ranking->words;
904 			new_ranking.words_len = ranking->words_len;
905 		}
906 
907 		new_ranking.rank = rank;
908 		new_ranking.doc_id = doc_id;
909 
910 		if (rbt_search(query->intersection, &parent,
911 			       &new_ranking) != 0) {
912 			if (new_ranking.words == NULL) {
913 				fts_ranking_words_create(query, &new_ranking);
914 
915 				query->total_size += RANKING_WORDS_INIT_LEN;
916 			} else {
917 				/* Note that the intersection has taken
918 				ownership of the ranking data. */
919 				ranking->words = NULL;
920 			}
921 
922 			rbt_add_node(query->intersection,
923 				     &parent, &new_ranking);
924 
925 			query->total_size += SIZEOF_RBT_NODE_ADD
926 				+ sizeof(fts_ranking_t);
927 		}
928 	}
929 }
930 
931 /*******************************************************************//**
932 Free the document ranking rb tree. */
933 static
934 void
935 fts_query_free_doc_ids(
936 /*===================*/
937 	fts_query_t*	query,		/*!< in: query instance */
938 	ib_rbt_t*	doc_ids)	/*!< in: rb tree to free */
939 {
940 	const ib_rbt_node_t*	node;
941 
942 	for (node = rbt_first(doc_ids); node; node = rbt_first(doc_ids)) {
943 
944 		fts_ranking_t*	ranking;
945 
946 		ranking = rbt_value(fts_ranking_t, node);
947 
948 		if (ranking->words) {
949 			ranking->words = NULL;
950 		}
951 
952 		ut_free(rbt_remove_node(doc_ids, node));
953 
954 		ut_ad(query->total_size >=
955 		      SIZEOF_RBT_NODE_ADD + sizeof(fts_ranking_t));
956 		query->total_size -= SIZEOF_RBT_NODE_ADD
957 			+ sizeof(fts_ranking_t);
958 	}
959 
960 	rbt_free(doc_ids);
961 
962 	ut_ad(query->total_size >= SIZEOF_RBT_CREATE);
963 	query->total_size -= SIZEOF_RBT_CREATE;
964 }
965 
966 /**
967 Free the query intersection
968 @param[in] query	query instance */
969 static
970 void
971 fts_query_free_intersection(
972 	fts_query_t*	query)
973 {
974 	fts_query_free_doc_ids(query, query->intersection);
975 	query->intersection = NULL;
976 }
977 
978 /*******************************************************************//**
979 Add the word to the document's "list" of matching words from
980 the query. We make a copy of the word from the query heap. */
981 static
982 void
983 fts_query_add_word_to_document(
984 /*===========================*/
985 	fts_query_t*		query,	/*!< in: query to update */
986 	doc_id_t		doc_id,	/*!< in: the document to update */
987 	const fts_string_t*	word)	/*!< in: the token to add */
988 {
989 	ib_rbt_bound_t		parent;
990 	fts_ranking_t*		ranking = NULL;
991 
992 	if (query->flags == FTS_OPT_RANKING) {
993 		return;
994 	}
995 
996 	/* First we search the intersection RB tree as it could have
997 	taken ownership of the words rb tree instance. */
998 	if (query->intersection
999 	    && rbt_search(query->intersection, &parent, &doc_id) == 0) {
1000 
1001 		ranking = rbt_value(fts_ranking_t, parent.last);
1002 	}
1003 
1004 	if (ranking == NULL
1005 	    && rbt_search(query->doc_ids, &parent, &doc_id) == 0) {
1006 
1007 		ranking = rbt_value(fts_ranking_t, parent.last);
1008 	}
1009 
1010 	if (ranking != NULL) {
1011 		fts_ranking_words_add(query, ranking, word);
1012 	}
1013 }
1014 
1015 /*******************************************************************//**
1016 Check the node ilist. */
1017 static
1018 void
1019 fts_query_check_node(
1020 /*=================*/
1021 	fts_query_t*		query,	/*!< in: query to update */
1022 	const fts_string_t*	token,	/*!< in: the token to search */
1023 	const fts_node_t*	node)	/*!< in: node to check */
1024 {
1025 	/* Skip nodes whose doc ids are out of range. */
1026 	if (query->oper == FTS_EXIST
1027 	    && ((query->upper_doc_id > 0
1028 		&& node->first_doc_id > query->upper_doc_id)
1029 		|| (query->lower_doc_id > 0
1030 		    && node->last_doc_id < query->lower_doc_id))) {
1031 
1032 		/* Ignore */
1033 
1034 	} else {
1035 		int		ret;
1036 		ib_rbt_bound_t	parent;
1037 		ulint		ilist_size = node->ilist_size;
1038 		fts_word_freq_t*word_freqs;
1039 
1040 		/* The word must exist. */
1041 		ret = rbt_search(query->word_freqs, &parent, token);
1042 		ut_a(ret == 0);
1043 
1044 		word_freqs = rbt_value(fts_word_freq_t, parent.last);
1045 
1046 		query->error = fts_query_filter_doc_ids(
1047 					query, token, word_freqs, node,
1048 					node->ilist, ilist_size, TRUE);
1049 	}
1050 }
1051 
1052 /*****************************************************************//**
1053 Search index cache for word with wildcard match.
1054 @return number of words matched */
1055 static
1056 ulint
1057 fts_cache_find_wildcard(
1058 /*====================*/
1059 	fts_query_t*		query,		/*!< in: query instance */
1060 	const fts_index_cache_t*index_cache,	/*!< in: cache to search */
1061 	const fts_string_t*	token)		/*!< in: token to search */
1062 {
1063 	ib_rbt_bound_t		parent;
1064 	const ib_vector_t*	nodes = NULL;
1065 	fts_string_t		srch_text;
1066 	byte			term[FTS_MAX_WORD_LEN + 1];
1067 	ulint			num_word = 0;
1068 
1069 	srch_text.f_len = (token->f_str[token->f_len - 1] == '%')
1070 			? token->f_len - 1
1071 			: token->f_len;
1072 
1073 	strncpy((char*) term, (char*) token->f_str, srch_text.f_len);
1074 	term[srch_text.f_len] = '\0';
1075 	srch_text.f_str = term;
1076 
1077 	/* Lookup the word in the rb tree */
1078 	if (rbt_search_cmp(index_cache->words, &parent, &srch_text, NULL,
1079 			   innobase_fts_text_cmp_prefix) == 0) {
1080 		const fts_tokenizer_word_t*     word;
1081 		ulint				i;
1082 		const ib_rbt_node_t*		cur_node;
1083 		ibool				forward = FALSE;
1084 
1085 		word = rbt_value(fts_tokenizer_word_t, parent.last);
1086 		cur_node = parent.last;
1087 
1088 		while (innobase_fts_text_cmp_prefix(
1089 			index_cache->charset, &srch_text, &word->text) == 0) {
1090 
1091 			nodes = word->nodes;
1092 
1093 			for (i = 0; nodes && i < ib_vector_size(nodes); ++i) {
1094 				int                     ret;
1095 				const fts_node_t*       node;
1096 				ib_rbt_bound_t          freq_parent;
1097 				fts_word_freq_t*	word_freqs;
1098 
1099 				node = static_cast<const fts_node_t*>(
1100 					ib_vector_get_const(nodes, i));
1101 
1102 				ret = rbt_search(query->word_freqs,
1103 						 &freq_parent,
1104 						 &srch_text);
1105 
1106 				ut_a(ret == 0);
1107 
1108 				word_freqs = rbt_value(
1109 					fts_word_freq_t,
1110 					freq_parent.last);
1111 
1112 				query->error = fts_query_filter_doc_ids(
1113 					query, &srch_text,
1114 					word_freqs, node,
1115 					node->ilist, node->ilist_size, TRUE);
1116 
1117 				if (query->error != DB_SUCCESS) {
1118 					return(0);
1119 				}
1120 			}
1121 
1122 			num_word++;
1123 
1124 			if (!forward) {
1125 				cur_node = rbt_prev(
1126 					index_cache->words, cur_node);
1127 			} else {
1128 cont_search:
1129 				cur_node = rbt_next(
1130 					index_cache->words, cur_node);
1131 			}
1132 
1133 			if (!cur_node) {
1134 				break;
1135 			}
1136 
1137 			word = rbt_value(fts_tokenizer_word_t, cur_node);
1138 		}
1139 
1140 		if (!forward) {
1141 			forward = TRUE;
1142 			cur_node = parent.last;
1143 			goto cont_search;
1144 		}
1145 	}
1146 
1147 	return(num_word);
1148 }
1149 
1150 /*****************************************************************//**
1151 Set difference.
1152 @return DB_SUCCESS if all goes well */
1153 static MY_ATTRIBUTE((nonnull, warn_unused_result))
1154 dberr_t
1155 fts_query_difference(
1156 /*=================*/
1157 	fts_query_t*		query,	/*!< in: query instance */
1158 	const fts_string_t*	token)	/*!< in: token to search */
1159 {
1160 	ulint			n_doc_ids= 0;
1161 	trx_t*			trx = query->trx;
1162 	dict_table_t*		table = query->index->table;
1163 
1164 	ut_a(query->oper == FTS_IGNORE);
1165 
1166 #ifdef FTS_INTERNAL_DIAG_PRINT
1167 	fprintf(stderr, "DIFFERENCE: Searching: '%.*s'\n",
1168 		(int) token->f_len, token->f_str);
1169 #endif
1170 
1171 	if (query->doc_ids) {
1172 		n_doc_ids = rbt_size(query->doc_ids);
1173 	}
1174 
1175 	/* There is nothing we can subtract from an empty set. */
1176 	if (query->doc_ids && !rbt_empty(query->doc_ids)) {
1177 		ulint			i;
1178 		fts_fetch_t		fetch;
1179 		const ib_vector_t*	nodes;
1180 		const fts_index_cache_t*index_cache;
1181 		que_t*			graph = NULL;
1182 		fts_cache_t*		cache = table->fts->cache;
1183 		dberr_t			error;
1184 
1185 		rw_lock_x_lock(&cache->lock);
1186 
1187 		index_cache = fts_find_index_cache(cache, query->index);
1188 
1189 		/* Must find the index cache */
1190 		ut_a(index_cache != NULL);
1191 
1192 		/* Search the cache for a matching word first. */
1193 		if (query->cur_node->term.wildcard
1194 		    && query->flags != FTS_PROXIMITY
1195 		    && query->flags != FTS_PHRASE) {
1196 			fts_cache_find_wildcard(query, index_cache, token);
1197 		} else {
1198 			nodes = fts_cache_find_word(index_cache, token);
1199 
1200 			for (i = 0; nodes && i < ib_vector_size(nodes)
1201 			     && query->error == DB_SUCCESS; ++i) {
1202 				const fts_node_t*	node;
1203 
1204 				node = static_cast<const fts_node_t*>(
1205 					ib_vector_get_const(nodes, i));
1206 
1207 				fts_query_check_node(query, token, node);
1208 			}
1209 		}
1210 
1211 		rw_lock_x_unlock(&cache->lock);
1212 
1213 		/* error is passed by 'query->error' */
1214 		if (query->error != DB_SUCCESS) {
1215 			ut_ad(query->error == DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
1216 			return(query->error);
1217 		}
1218 
1219 		/* Setup the callback args for filtering and
1220 		consolidating the ilist. */
1221 		fetch.read_arg = query;
1222 		fetch.read_record = fts_query_index_fetch_nodes;
1223 
1224 		error = fts_index_fetch_nodes(
1225 			trx, &graph, &query->fts_index_table, token, &fetch);
1226 
1227 		/* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */
1228 		ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS));
1229 		if (error != DB_SUCCESS) {
1230 			query->error = error;
1231 		}
1232 
1233 		fts_que_graph_free(graph);
1234 	}
1235 
1236 	/* The size can't increase. */
1237 	ut_a(rbt_size(query->doc_ids) <= n_doc_ids);
1238 
1239 	return(query->error);
1240 }
1241 
1242 /*****************************************************************//**
1243 Intersect the token doc ids with the current set.
1244 @return DB_SUCCESS if all goes well */
1245 static MY_ATTRIBUTE((nonnull, warn_unused_result))
1246 dberr_t
1247 fts_query_intersect(
1248 /*================*/
1249 	fts_query_t*		query,	/*!< in: query instance */
1250 	const fts_string_t*	token)	/*!< in: the token to search */
1251 {
1252 	trx_t*			trx = query->trx;
1253 	dict_table_t*		table = query->index->table;
1254 
1255 	ut_a(query->oper == FTS_EXIST);
1256 
1257 #ifdef FTS_INTERNAL_DIAG_PRINT
1258 	fprintf(stderr, "INTERSECT: Searching: '%.*s'\n",
1259 		(int) token->f_len, token->f_str);
1260 #endif
1261 
1262 	/* If the doc id set is empty and multi_exist is true, we know in
1263 	advance that the intersection set will be empty, so skip the search. */
1264 	if (!(rbt_empty(query->doc_ids) && query->multi_exist)) {
1265 		ulint                   n_doc_ids = 0;
1266 		ulint			i;
1267 		fts_fetch_t		fetch;
1268 		const ib_vector_t*	nodes;
1269 		const fts_index_cache_t*index_cache;
1270 		que_t*			graph = NULL;
1271 		fts_cache_t*		cache = table->fts->cache;
1272 		dberr_t			error;
1273 
1274 		ut_a(!query->intersection);
1275 
1276 		n_doc_ids = rbt_size(query->doc_ids);
1277 
1278 		/* Create the rb tree that will hold the doc ids of
1279 		the intersection. */
1280 		query->intersection = rbt_create(
1281 			sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
1282 
1283 		query->total_size += SIZEOF_RBT_CREATE;
1284 
1285 		/* This is to avoid decompressing the ilist if the
1286 		node's ilist doc ids are out of range. */
1287 		if (!rbt_empty(query->doc_ids) && query->multi_exist) {
1288 			const ib_rbt_node_t*	node;
1289 			doc_id_t*		doc_id;
1290 
1291 			node = rbt_first(query->doc_ids);
1292 			doc_id = rbt_value(doc_id_t, node);
1293 			query->lower_doc_id = *doc_id;
1294 
1295 			node = rbt_last(query->doc_ids);
1296 			doc_id = rbt_value(doc_id_t, node);
1297 			query->upper_doc_id = *doc_id;
1298 
1299 		} else {
1300 			query->lower_doc_id = 0;
1301 			query->upper_doc_id = 0;
1302 		}
1303 
1304 		/* Search the cache for a matching word first. */
1305 
1306 		rw_lock_x_lock(&cache->lock);
1307 
1308 		/* Search for the index specific cache. */
1309 		index_cache = fts_find_index_cache(cache, query->index);
1310 
1311 		/* Must find the index cache. */
1312 		ut_a(index_cache != NULL);
1313 
1314 		if (query->cur_node->term.wildcard) {
1315 			/* Wildcard search the index cache */
1316 			fts_cache_find_wildcard(query, index_cache, token);
1317 		} else {
1318 			nodes = fts_cache_find_word(index_cache, token);
1319 
1320 			for (i = 0; nodes && i < ib_vector_size(nodes)
1321 			     && query->error == DB_SUCCESS; ++i) {
1322 				const fts_node_t*	node;
1323 
1324 				node = static_cast<const fts_node_t*>(
1325 					ib_vector_get_const(nodes, i));
1326 
1327 				fts_query_check_node(query, token, node);
1328 			}
1329 		}
1330 
1331 		rw_lock_x_unlock(&cache->lock);
1332 
1333 		/* error is passed by 'query->error' */
1334 		if (query->error != DB_SUCCESS) {
1335 			ut_ad(query->error == DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
1336 			fts_query_free_intersection(query);
1337 			return(query->error);
1338 		}
1339 
1340 		/* Setup the callback args for filtering and
1341 		consolidating the ilist. */
1342 		fetch.read_arg = query;
1343 		fetch.read_record = fts_query_index_fetch_nodes;
1344 
1345 		error = fts_index_fetch_nodes(
1346 			trx, &graph, &query->fts_index_table, token, &fetch);
1347 
1348 		/* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */
1349 		ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS));
1350 		if (error != DB_SUCCESS) {
1351 			query->error = error;
1352 		}
1353 
1354 		fts_que_graph_free(graph);
1355 
1356 		if (query->error == DB_SUCCESS) {
1357 			/* Make the intersection (rb tree) the current doc id
1358 			set and free the old set. */
1359 			fts_query_free_doc_ids(query, query->doc_ids);
1360 			query->doc_ids = query->intersection;
1361 			query->intersection = NULL;
1362 
1363 			ut_a(!query->multi_exist || (query->multi_exist
1364 			     && rbt_size(query->doc_ids) <= n_doc_ids));
1365 		} else if (query->intersection != NULL) {
1366 			fts_query_free_intersection(query);
1367 		}
1368 	}
1369 
1370 	return(query->error);
1371 }
1372 
1373 /*****************************************************************//**
1374 Query index cache.
1375 @return DB_SUCCESS if all goes well */
1376 static
1377 dberr_t
1378 fts_query_cache(
1379 /*============*/
1380 	fts_query_t*		query,	/*!< in/out: query instance */
1381 	const fts_string_t*	token)	/*!< in: token to search */
1382 {
1383 	const fts_index_cache_t*index_cache;
1384 	dict_table_t*		table = query->index->table;
1385 	fts_cache_t*		cache = table->fts->cache;
1386 
1387 	/* Search the cache for a matching word first. */
1388 	rw_lock_x_lock(&cache->lock);
1389 
1390 	/* Search for the index specific cache. */
1391 	index_cache = fts_find_index_cache(cache, query->index);
1392 
1393 	/* Must find the index cache. */
1394 	ut_a(index_cache != NULL);
1395 
1396 	if (query->cur_node->term.wildcard
1397 	    && query->flags != FTS_PROXIMITY
1398 	    && query->flags != FTS_PHRASE) {
1399 		/* Wildcard search the index cache */
1400 		fts_cache_find_wildcard(query, index_cache, token);
1401 	} else {
1402 		const ib_vector_t*      nodes;
1403 		ulint			i;
1404 
1405 		nodes = fts_cache_find_word(index_cache, token);
1406 
1407 		for (i = 0; nodes && i < ib_vector_size(nodes)
1408 		     && query->error == DB_SUCCESS; ++i) {
1409 			const fts_node_t*	node;
1410 
1411 			node = static_cast<const fts_node_t*>(
1412 				ib_vector_get_const(nodes, i));
1413 
1414 			fts_query_check_node(query, token, node);
1415 		}
1416 	}
1417 
1418 	rw_lock_x_unlock(&cache->lock);
1419 
1420 	return(query->error);
1421 }
1422 
1423 /*****************************************************************//**
1424 Set union.
1425 @return DB_SUCCESS if all goes well */
1426 static MY_ATTRIBUTE((nonnull, warn_unused_result))
1427 dberr_t
1428 fts_query_union(
1429 /*============*/
1430 	fts_query_t*		query,	/*!< in: query instance */
1431 	fts_string_t*		token)	/*!< in: token to search */
1432 {
1433 	fts_fetch_t		fetch;
1434 	ulint			n_doc_ids = 0;
1435 	trx_t*			trx = query->trx;
1436 	que_t*			graph = NULL;
1437 	dberr_t			error;
1438 
1439 	ut_a(query->oper == FTS_NONE || query->oper == FTS_DECR_RATING ||
1440 	     query->oper == FTS_NEGATE || query->oper == FTS_INCR_RATING);
1441 
1442 #ifdef FTS_INTERNAL_DIAG_PRINT
1443 	fprintf(stderr, "UNION: Searching: '%.*s'\n",
1444 		(int) token->f_len, token->f_str);
1445 #endif
1446 
1447 	if (query->doc_ids) {
1448 		n_doc_ids = rbt_size(query->doc_ids);
1449 	}
1450 
1451 	if (token->f_len == 0) {
1452 		return(query->error);
1453 	}
1454 
1455 	/* A single '%' would confuse the parser in pars_like_rebind(). In
1456 	addition, our wildcard search only supports prefix matching. */
1457 	ut_ad(*token->f_str != '%');
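	/* Example (illustrative): a trailing-wildcard query such as 'data*'
	is expected to reach this point as the token "data%";
	fts_cache_find_wildcard() strips the trailing '%' and does a prefix
	comparison against the words in the index cache. */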
1458 
1459 	fts_query_cache(query, token);
1460 
1461 	/* Setup the callback args for filtering and
1462 	consolidating the ilist. */
1463 	fetch.read_arg = query;
1464 	fetch.read_record = fts_query_index_fetch_nodes;
1465 
1466 	/* Read the nodes from disk. */
1467 	error = fts_index_fetch_nodes(
1468 		trx, &graph, &query->fts_index_table, token, &fetch);
1469 
1470 	/* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */
1471 	ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS));
1472 	if (error != DB_SUCCESS) {
1473 		query->error = error;
1474 	}
1475 
1476 	fts_que_graph_free(graph);
1477 
1478 	if (query->error == DB_SUCCESS) {
1479 
1480 		/* The size can't decrease. */
1481 		ut_a(rbt_size(query->doc_ids) >= n_doc_ids);
1482 
1483 		/* Calculate the number of doc ids that were added to
1484 		the current doc id set. */
1485 		if (query->doc_ids) {
1486 			n_doc_ids = rbt_size(query->doc_ids) - n_doc_ids;
1487 		}
1488 	}
1489 
1490 	return(query->error);
1491 }
1492 
1493 /*****************************************************************//**
1494 Depending upon the current query operator, process the doc id.
1495 @return DB_SUCCESS if all goes well,
1496 or DB_FTS_EXCEED_RESULT_CACHE_LIMIT */
1497 static
1498 dberr_t
1499 fts_query_process_doc_id(
1500 /*=====================*/
1501 	fts_query_t*	query,		/*!< in: query instance */
1502 	doc_id_t	doc_id,		/*!< in: doc id to process */
1503 	fts_rank_t	rank)		/*!< in: if non-zero, it is the
1504 					rank associated with the doc_id */
1505 {
1506 	if (query->flags == FTS_OPT_RANKING) {
1507 		return(DB_SUCCESS);
1508 	}
1509 
1510 	switch (query->oper) {
1511 	case FTS_NONE:
1512 		fts_query_union_doc_id(query, doc_id, rank);
1513 		break;
1514 
1515 	case FTS_EXIST:
1516 		fts_query_intersect_doc_id(query, doc_id, rank);
1517 		break;
1518 
1519 	case FTS_IGNORE:
1520 		fts_query_remove_doc_id(query, doc_id);
1521 		break;
1522 
1523 	case FTS_NEGATE:
1524 		fts_query_change_ranking(query, doc_id, TRUE);
1525 		break;
1526 
1527 	case FTS_DECR_RATING:
1528 		fts_query_union_doc_id(query, doc_id, rank);
1529 		fts_query_change_ranking(query, doc_id, TRUE);
1530 		break;
1531 
1532 	case FTS_INCR_RATING:
1533 		fts_query_union_doc_id(query, doc_id, rank);
1534 		fts_query_change_ranking(query, doc_id, FALSE);
1535 		break;
1536 
1537 	default:
1538 		ut_error;
1539 	}
1540 
1541 	if (query->total_size > fts_result_cache_limit) {
1542 		return(DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
1543 	} else {
1544 		return(DB_SUCCESS);
1545 	}
1546 }
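/* Illustrative mapping (a sketch; the authoritative operator handling lives
in the boolean-mode parser, fts0pars.y, not in this switch): a query such as

	+apple banana -cherry ~plum <pear >grape

would be expected to drive the switch above with FTS_EXIST for "apple",
FTS_NONE for "banana", FTS_IGNORE for "cherry", FTS_NEGATE for "plum",
FTS_DECR_RATING for "pear" and FTS_INCR_RATING for "grape". */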
1547 
1548 /*****************************************************************//**
1549 Merge two result sets. @return DB_SUCCESS if all goes well, otherwise error code */
1550 static
1551 dberr_t
1552 fts_merge_doc_ids(
1553 /*==============*/
1554 	fts_query_t*	query,		/*!< in,out: query instance */
1555 	const ib_rbt_t*	doc_ids)	/*!< in: result set to merge */
1556 {
1557 	const ib_rbt_node_t*	node;
1558 
1559 	DBUG_ENTER("fts_merge_doc_ids");
1560 
1561 	ut_a(!query->intersection);
1562 
1563 	/* To process FTS_EXIST operation (intersection), we need
1564 	to create a new result set for fts_query_intersect(). */
1565 	if (query->oper == FTS_EXIST) {
1566 
1567 		query->intersection = rbt_create(
1568 			sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
1569 
1570 		query->total_size += SIZEOF_RBT_CREATE;
1571 	}
1572 
1573 	/* Merge the elements to the result set. */
1574 	for (node = rbt_first(doc_ids); node; node = rbt_next(doc_ids, node)) {
1575 		fts_ranking_t*		ranking;
1576 		ulint			pos = 0;
1577 		fts_string_t		word;
1578 
1579 		ranking = rbt_value(fts_ranking_t, node);
1580 
1581 		query->error = fts_query_process_doc_id(
1582 				query, ranking->doc_id, ranking->rank);
1583 
1584 		if (query->error != DB_SUCCESS) {
1585 			if (query->intersection != NULL)
1586 			{
1587 				ut_a(query->oper == FTS_EXIST);
1588 				fts_query_free_intersection(query);
1589 			}
1590 			DBUG_RETURN(query->error);
1591 		}
1592 
1593 		/* Merge words. Don't need to take operator into account. */
1594 		ut_a(ranking->words);
1595 		while (fts_ranking_words_get_next(query, ranking, &pos, &word)) {
1596 			fts_query_add_word_to_document(query, ranking->doc_id,
1597 						       &word);
1598 		}
1599 	}
1600 
1601 	/* If it is an intersection operation, reset query->doc_ids
1602 	to query->intersection and free the old result list. */
1603 	if (query->oper == FTS_EXIST && query->intersection != NULL) {
1604 		fts_query_free_doc_ids(query, query->doc_ids);
1605 		query->doc_ids = query->intersection;
1606 		query->intersection = NULL;
1607 	}
1608 
1609 	DBUG_RETURN(DB_SUCCESS);
1610 }
1611 
1612 /*****************************************************************//**
1613 Skip the current word in a string. Move ptr to the next word boundary.
1614 @return pointer to the first whitespace or punctuation character, or end */
1615 UNIV_INLINE
1616 byte*
1617 fts_query_skip_word(
1618 /*================*/
1619 	byte*		ptr,		/*!< in: start of scan */
1620 	const byte*	end)		/*!< in: pointer to end of string */
1621 {
1622 	/* TODO: Does this have to be UTF-8 too ? */
1623 	while (ptr < end && !(ispunct(*ptr) || isspace(*ptr))) {
1624 		++ptr;
1625 	}
1626 
1627 	return(ptr);
1628 }
1629 
1630 /*****************************************************************//**
1631 Check whether the remaining terms in the phrase match the text.
1632 @return TRUE if matched else FALSE */
1633 static
1634 ibool
1635 fts_query_match_phrase_terms(
1636 /*=========================*/
1637 	fts_phrase_t*	phrase,		/*!< in: phrase to match */
1638 	byte**		start,		/*!< in/out: text to search, we can't
1639 					make this const because we need to
1640 					first convert the string to
1641 					lowercase */
1642 	const byte*	end,		/*!< in: pointer to the end of
1643 					the string to search */
1644 	mem_heap_t*	heap)		/*!< in: heap */
1645 {
1646 	ulint			i;
1647 	byte*			ptr = *start;
1648 	const ib_vector_t*	tokens = phrase->tokens;
1649 	ulint			distance = phrase->distance;
1650 
1651 	/* We check only from the second term onwards, since the first
1652 	must have matched otherwise we wouldn't be here. */
1653 	for (i = 1; ptr < end && i < ib_vector_size(tokens); /* No op */) {
1654 		fts_string_t		match;
1655 		fts_string_t		cmp_str;
1656 		const fts_string_t*	token;
1657 		int			result;
1658 		ulint			ret;
1659 		ulint			offset;
1660 
1661 		ret = innobase_mysql_fts_get_token(
1662 			phrase->charset, ptr, (byte*) end,
1663 			&match, &offset);
1664 
1665 		if (match.f_len > 0) {
1666 			/* Get next token to match. */
1667 			token = static_cast<const fts_string_t*>(
1668 				ib_vector_get_const(tokens, i));
1669 
1670 			fts_utf8_string_dup(&cmp_str, &match, heap);
1671 
1672 			result = innobase_fts_text_case_cmp(
1673 				phrase->charset, token, &cmp_str);
1674 
1675 			/* Skip the rest of the tokens if this one doesn't
1676 			match and the proximity distance is exceeded. */
1677 			if (result
1678 			    && (distance == ULINT_UNDEFINED
1679 				|| distance == 0)) {
1680 
1681 				break;
1682 			}
1683 
1684 			/* This token matched; move to the next token. */
1685 			if (result == 0) {
1686 				/* Advance the text to search by the length
1687 				of the last token. */
1688 				ptr += ret;
1689 
1690 				/* Advance to the next token. */
1691 				++i;
1692 			} else {
1693 
1694 				ut_a(distance != ULINT_UNDEFINED);
1695 
1696 				ptr = fts_query_skip_word(ptr, end);
1697 			}
1698 
1699 			/* Distance can be 0 for exact matches. */
1700 			if (distance != ULINT_UNDEFINED && distance > 0) {
1701 				--distance;
1702 			}
1703 		} else {
1704 			ptr += ret;
1705 		}
1706 	}
1707 
1708 	*start = ptr;
1709 
1710 	/* Can't be greater than the number of elements. */
1711 	ut_a(i <= ib_vector_size(tokens));
1712 
1713 	/* This is the case for multiple words. */
1714 	if (i == ib_vector_size(tokens)) {
1715 		phrase->found = TRUE;
1716 	}
1717 
1718 	return(phrase->found);
1719 }
1720 
1721 /*****************************************************************//**
1722 Callback function to count the number of words in position ranges,
1723 and check whether the word count is within the specified "phrase->distance".
1724 @return true if the number of words is within the specified "distance" */
1725 static
1726 bool
1727 fts_proximity_is_word_in_range(
1728 /*===========================*/
1729 	const fts_phrase_t*
1730 			phrase,		/*!< in: phrase with the search info */
1731 	byte*		start,		/*!< in: text to search */
1732 	ulint		total_len)	/*!< in: length of text */
1733 {
1734 	fts_proximity_t*	proximity_pos = phrase->proximity_pos;
1735 
1736 	ut_ad(proximity_pos->n_pos == proximity_pos->min_pos.size());
1737 	ut_ad(proximity_pos->n_pos == proximity_pos->max_pos.size());
1738 
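	/* Example: with phrase->distance == 2 and a candidate range
	covering the bytes of "alpha beta gamma", the loop below counts
	three words, which exceeds the distance, so that range does not
	qualify; a range covering only "alpha beta" would qualify.
	(Illustrative words only.) */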
1739 	/* Search each matched position pair (with min and max positions)
1740 	and count the number of words in the range */
1741 	for (ulint i = 0; i < proximity_pos->n_pos; i++) {
1742 		ulint		cur_pos = proximity_pos->min_pos[i];
1743 		ulint		n_word = 0;
1744 
1745 		ut_ad(proximity_pos->max_pos[i] <= total_len);
1746 
1747 		/* Walk through words in the range and count them */
1748 		while (cur_pos <= proximity_pos->max_pos[i]) {
1749 			ulint		len;
1750 			fts_string_t	str;
1751 			ulint           offset = 0;
1752 
1753 			len = innobase_mysql_fts_get_token(
1754 				phrase->charset,
1755 				start + cur_pos,
1756 				start + total_len, &str, &offset);
1757 
1758 			if (len == 0) {
1759 				break;
1760 			}
1761 
1762 			/* Advance the position by "len" bytes */
1763 			cur_pos += len;
1764 
1765 			/* Record the number of words */
1766 			if (str.f_n_char > 0) {
1767 				n_word++;
1768 			}
1769 
1770 			if (n_word > phrase->distance) {
1771 				break;
1772 			}
1773 		}
1774 
1775 		/* Check if the number of words is within the specified
1776 		"distance" */
1777 		if (n_word && n_word <= phrase->distance) {
1778 			return(true);
1779 		}
1780 	}
1781 
1782 	return(false);
1783 }
1784 
1785 /*****************************************************************//**
1786 Callback function to fetch and search the document.
1787 @return TRUE if matched else FALSE */
1788 static
1789 ibool
1790 fts_query_match_phrase(
1791 /*===================*/
1792 	fts_phrase_t*	phrase,		/*!< in: phrase to match */
1793 	byte*		start,		/*!< in: text to search, we can't make
1794 					this const because we need to first
1795 					convert the string to lowercase */
1796 	ulint		cur_len,	/*!< in: length of text */
1797 	ulint		prev_len,	/*!< in: total length for searched
1798 					doc fields*/
1799 	mem_heap_t*	heap)		/*!< in: heap */
1800 {
1801 	ulint			i;
1802 	const fts_string_t*	first;
1803 	const byte*		end = start + cur_len;
1804 	const ib_vector_t*	tokens = phrase->tokens;
1805 	const ib_vector_t*	positions = phrase->match->positions;
1806 
1807 	ut_a(!phrase->found);
1808 	ut_a(phrase->match->doc_id > 0);
1809 	ut_a(ib_vector_size(tokens) > 0);
1810 	ut_a(ib_vector_size(positions) > 0);
1811 
1812 	first = static_cast<const fts_string_t*>(
1813 		ib_vector_get_const(tokens, 0));
1814 
1815 	ut_a(phrase->match->start < ib_vector_size(positions));
1816 
1817 	for (i = phrase->match->start; i < ib_vector_size(positions); ++i) {
1818 		ulint		pos;
1819 		fts_string_t	match;
1820 		fts_string_t	cmp_str;
1821 		byte*		ptr = start;
1822 		ulint		ret;
1823 		ulint		offset;
1824 
1825 		pos = *(ulint*) ib_vector_get_const(positions, i);
1826 
1827 		if (pos == ULINT_UNDEFINED) {
1828 			break;
1829 		}
1830 
1831 		if (pos < prev_len) {
1832 			continue;
1833 		}
1834 
1835 		/* Document positions are calculated from the beginning
1836 		of the first field; we need to save the length of each
1837 		searched field to adjust the doc position when searching
1838 		phrases. */
1839 		pos -= prev_len;
1840 		ptr = match.f_str = start + pos;
1841 
1842 		/* Within limits ? */
1843 		if (ptr >= end) {
1844 			break;
1845 		}
1846 
1847 		ret = innobase_mysql_fts_get_token(
1848 			phrase->charset, start + pos, (byte*) end,
1849 			&match, &offset);
1850 
1851 		if (match.f_len == 0) {
1852 			break;
1853 		}
1854 
1855 		fts_utf8_string_dup(&cmp_str, &match, heap);
1856 
1857 		if (innobase_fts_text_case_cmp(
1858 			phrase->charset, first, &cmp_str) == 0) {
1859 
1860 			/* This is the case for the single word
1861 			in the phrase. */
1862 			if (ib_vector_size(phrase->tokens) == 1) {
1863 				phrase->found = TRUE;
1864 				break;
1865 			}
1866 
1867 			ptr += ret;
1868 
1869 			/* Match the remaining terms in the phrase. */
1870 			if (fts_query_match_phrase_terms(phrase, &ptr,
1871 							 end, heap)) {
1872 				break;
1873 			}
1874 		}
1875 	}
1876 
1877 	return(phrase->found);
1878 }
1879 
1880 /*****************************************************************//**
1881 Callback function to fetch and search the document.
1882 @return whether the phrase is found */
1883 static
1884 ibool
1885 fts_query_fetch_document(
1886 /*=====================*/
1887 	void*		row,		/*!< in:  sel_node_t* */
1888 	void*		user_arg)	/*!< in:  fts_doc_t* */
1889 {
1890 
1891 	que_node_t*	exp;
1892 	sel_node_t*	node = static_cast<sel_node_t*>(row);
1893 	fts_phrase_t*	phrase = static_cast<fts_phrase_t*>(user_arg);
1894 	ulint		prev_len = 0;
1895 	ulint		total_len = 0;
1896 	byte*		document_text = NULL;
1897 
1898 	exp = node->select_list;
1899 
1900 	phrase->found = FALSE;
1901 
1902 	/* For proximity search, we will need to get the whole document
1903 	from all fields, so first count the total length of the document
1904 	from all the fields */
1905 	if (phrase->proximity_pos) {
1906 		 while (exp) {
1907 			ulint		field_len;
1908 			dfield_t*	dfield = que_node_get_val(exp);
1909 			byte*		data = static_cast<byte*>(
1910 						dfield_get_data(dfield));
1911 
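			/* For a column stored externally (off-page), only a
			prefix plus a BLOB reference is stored in the record;
			the length of the externally stored part is read from
			that reference. */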
1912 			if (dfield_is_ext(dfield)) {
1913 				ulint	local_len = dfield_get_len(dfield);
1914 
1915 				local_len -= BTR_EXTERN_FIELD_REF_SIZE;
1916 
1917 				field_len = mach_read_from_4(
1918 					data + local_len + BTR_EXTERN_LEN + 4);
1919 			} else {
1920 				field_len = dfield_get_len(dfield);
1921 			}
1922 
1923 			if (field_len != UNIV_SQL_NULL) {
1924 				total_len += field_len + 1;
1925 			}
1926 
1927 			exp = que_node_get_next(exp);
1928 		}
1929 
1930 		document_text = static_cast<byte*>(mem_heap_zalloc(
1931 					phrase->heap, total_len));
1932 
1933 		if (!document_text) {
1934 			return(FALSE);
1935 		}
1936 	}
1937 
1938 	exp = node->select_list;
1939 
1940 	while (exp) {
1941 		dfield_t*	dfield = que_node_get_val(exp);
1942 		byte*		data = static_cast<byte*>(
1943 					dfield_get_data(dfield));
1944 		ulint		cur_len;
1945 
1946 		if (dfield_is_ext(dfield)) {
1947 			data = btr_copy_externally_stored_field(
1948 				&cur_len, data, phrase->zip_size,
1949 				dfield_get_len(dfield), phrase->heap);
1950 		} else {
1951 			cur_len = dfield_get_len(dfield);
1952 		}
1953 
1954 		if (cur_len != UNIV_SQL_NULL && cur_len != 0) {
1955 			if (phrase->proximity_pos) {
1956 				ut_ad(prev_len + cur_len <= total_len);
1957 				memcpy(document_text + prev_len, data, cur_len);
1958 			} else {
1959 				/* For phrase search */
1960 				phrase->found =
1961 					fts_query_match_phrase(
1962 						phrase,
1963 						static_cast<byte*>(data),
1964 						cur_len, prev_len,
1965 						phrase->heap);
1966 			}
1967 
1968 			/* Document positions are calculated from the beginning
1969 			of the first field; we need to save the length of each
1970 			searched field to adjust the doc position when searching
1971 			phrases. */
1972 			prev_len += cur_len + 1;
1973 		}
1974 
1975 		if (phrase->found) {
1976 			break;
1977 		}
1978 
1979 		exp = que_node_get_next(exp);
1980 	}
1981 
1982 	if (phrase->proximity_pos) {
1983 		ut_ad(prev_len <= total_len);
1984 
1985 		phrase->found = fts_proximity_is_word_in_range(
1986 			phrase, document_text, total_len);
1987 	}
1988 
1989 	return(phrase->found);
1990 }
1991 
1992 #if 0
1993 /********************************************************************
1994 Callback function to check whether a record was found or not. */
1995 static
1996 ibool
1997 fts_query_select(
1998 /*=============*/
1999 	void*		row,		/*!< in:  sel_node_t* */
2000 	void*		user_arg)	/*!< in:  fts_doc_t* */
2001 {
2002 	int		i;
2003 	que_node_t*	exp;
2004 	sel_node_t*	node = row;
2005 	fts_select_t*	select = user_arg;
2006 
2007 	ut_a(select->word_freq);
2008 	ut_a(select->word_freq->doc_freqs);
2009 
2010 	exp = node->select_list;
2011 
2012 	for (i = 0; exp && !select->found; ++i) {
2013 		dfield_t*	dfield = que_node_get_val(exp);
2014 		void*		data = dfield_get_data(dfield);
2015 		ulint		len = dfield_get_len(dfield);
2016 
2017 		switch (i) {
2018 		case 0: /* DOC_COUNT */
2019 			if (len != UNIV_SQL_NULL && len != 0) {
2020 
2021 				select->word_freq->doc_count +=
2022 					mach_read_from_4(data);
2023 			}
2024 			break;
2025 
2026 		case 1: /* ILIST */
2027 			if (len != UNIV_SQL_NULL && len != 0) {
2028 
2029 				fts_query_find_doc_id(select, data, len);
2030 			}
2031 			break;
2032 
2033 		default:
2034 			ut_error;
2035 		}
2036 
2037 		exp = que_node_get_next(exp);
2038 	}
2039 
2040 	return(FALSE);
2041 }
2042 
2043 /********************************************************************
2044 Read the rows from the FTS index, that match word and where the
2045 doc id is between first and last doc id.
2046 @return DB_SUCCESS if all go well else error code */
2047 static MY_ATTRIBUTE((nonnull, warn_unused_result))
2048 dberr_t
2049 fts_query_find_term(
2050 /*================*/
2051 	fts_query_t*		query,	/*!< in: FTS query state */
2052 	que_t**			graph,	/*!< in: prepared statement */
2053 	const fts_string_t*	word,	/*!< in: the word to fetch */
2054 	doc_id_t		doc_id,	/*!< in: doc id to match */
2055 	ulint*			min_pos,/*!< in/out: pos found must be
2056 					 greater than this minimum value. */
2057 	ibool*			found)	/*!< out: TRUE if found else FALSE */
2058 {
2059 	pars_info_t*		info;
2060 	dberr_t			error;
2061 	fts_select_t		select;
2062 	doc_id_t		match_doc_id;
2063 	trx_t*			trx = query->trx;
2064 
2065 	trx->op_info = "fetching FTS index matching nodes";
2066 
2067 	if (*graph) {
2068 		info = (*graph)->info;
2069 	} else {
2070 		info = pars_info_create();
2071 	}
2072 
2073 	select.found = FALSE;
2074 	select.doc_id = doc_id;
2075 	select.min_pos = *min_pos;
2076 	select.word_freq = fts_query_add_word_freq(query, word->f_str);
2077 
2078 	pars_info_bind_function(info, "my_func", fts_query_select, &select);
2079 	pars_info_bind_varchar_literal(info, "word", word->f_str, word->f_len);
2080 
2081 	/* Convert to "storage" byte order. */
2082 	fts_write_doc_id((byte*) &match_doc_id, doc_id);
2083 
2084 	fts_bind_doc_id(info, "min_doc_id", &match_doc_id);
2085 
2086 	fts_bind_doc_id(info, "max_doc_id", &match_doc_id);
2087 
2088 	if (!*graph) {
2089 		ulint		selected;
2090 
2091 		selected = fts_select_index(*word->f_str);
2092 
2093 		query->fts_index_table.suffix = fts_get_suffix(selected);
2094 
2095 		*graph = fts_parse_sql(
2096 			&query->fts_index_table,
2097 			info,
2098 			"DECLARE FUNCTION my_func;\n"
2099 			"DECLARE CURSOR c IS"
2100 			" SELECT doc_count, ilist\n"
2101 			" FROM \"%s\"\n"
2102 			" WHERE word LIKE :word AND "
2103 			"	first_doc_id <= :min_doc_id AND "
2104 			"	last_doc_id >= :max_doc_id\n"
2105 			" ORDER BY first_doc_id;\n"
2106 			"BEGIN\n"
2107 			"\n"
2108 			"OPEN c;\n"
2109 			"WHILE 1 = 1 LOOP\n"
2110 			"  FETCH c INTO my_func();\n"
2111 			"  IF c % NOTFOUND THEN\n"
2112 			"    EXIT;\n"
2113 			"  END IF;\n"
2114 			"END LOOP;\n"
2115 			"CLOSE c;");
2116 	}
2117 
2118 	for(;;) {
2119 		error = fts_eval_sql(trx, *graph);
2120 
2121 		if (error == DB_SUCCESS) {
2122 
2123 			break;				/* Exit the loop. */
2124 		} else {
2125 			ut_print_timestamp(stderr);
2126 
2127 			if (error == DB_LOCK_WAIT_TIMEOUT) {
2128 				fprintf(stderr, " InnoDB: Warning: lock wait "
2129 					"timeout reading FTS index. "
2130 					"Retrying!\n");
2131 
2132 				trx->error_state = DB_SUCCESS;
2133 			} else {
2134 				fprintf(stderr, " InnoDB: Error: %lu "
2135 					"while reading FTS index.\n", error);
2136 
2137 				break;			/* Exit the loop. */
2138 			}
2139 		}
2140 	}
2141 
2142 	/* Value to return */
2143 	*found = select.found;
2144 
2145 	if (*found) {
2146 		*min_pos = select.min_pos;
2147 	}
2148 
2149 	return(error);
2150 }
2151 
2152 /********************************************************************
2153 Callback aggregator for int columns. */
2154 static
2155 ibool
2156 fts_query_sum(
2157 /*==========*/
2158 					/*!< out: always returns TRUE */
2159 	void*		row,		/*!< in:  sel_node_t* */
2160 	void*		user_arg)	/*!< in:  ulint* */
2161 {
2162 
2163 	que_node_t*	exp;
2164 	sel_node_t*	node = row;
2165 	ulint*		total = user_arg;
2166 
2167 	exp = node->select_list;
2168 
2169 	while (exp) {
2170 		dfield_t*	dfield = que_node_get_val(exp);
2171 		void*		data = dfield_get_data(dfield);
2172 		ulint		len = dfield_get_len(dfield);
2173 
2174 		if (len != UNIV_SQL_NULL && len != 0) {
2175 			*total += mach_read_from_4(data);
2176 		}
2177 
2178 		exp = que_node_get_next(exp);
2179 	}
2180 
2181 	return(TRUE);
2182 }
2183 
2184 /********************************************************************
2185 Calculate the total documents that contain a particular word (term).
2186 @return DB_SUCCESS if all go well else error code */
2187 static MY_ATTRIBUTE((nonnull, warn_unused_result))
2188 dberr_t
2189 fts_query_total_docs_containing_term(
2190 /*=================================*/
2191 	fts_query_t*		query,	/*!< in: FTS query state */
2192 	const fts_string_t*	word,	/*!< in: the word to check */
2193 	ulint*			total)	/*!< out: documents containing word */
2194 {
2195 	pars_info_t*		info;
2196 	dberr_t			error;
2197 	que_t*			graph;
2198 	ulint			selected;
2199 	trx_t*			trx = query->trx;
2200 
2201 	trx->op_info = "fetching FTS index document count";
2202 
2203 	*total = 0;
2204 
2205 	info = pars_info_create();
2206 
2207 	pars_info_bind_function(info, "my_func", fts_query_sum, total);
2208 	pars_info_bind_varchar_literal(info, "word", word->f_str, word->f_len);
2209 
2210 	selected = fts_select_index(*word->f_str);
2211 
2212 	query->fts_index_table.suffix = fts_get_suffix(selected);
2213 
2214 	graph = fts_parse_sql(
2215 		&query->fts_index_table,
2216 		info,
2217 		"DECLARE FUNCTION my_func;\n"
2218 		"DECLARE CURSOR c IS"
2219 		" SELECT doc_count\n"
2220 		" FROM %s\n"
2221 		" WHERE word = :word "
2222 		" ORDER BY first_doc_id;\n"
2223 		"BEGIN\n"
2224 		"\n"
2225 		"OPEN c;\n"
2226 		"WHILE 1 = 1 LOOP\n"
2227 		"  FETCH c INTO my_func();\n"
2228 		"  IF c % NOTFOUND THEN\n"
2229 		"    EXIT;\n"
2230 		"  END IF;\n"
2231 		"END LOOP;\n"
2232 		"CLOSE c;");
2233 
2234 	for(;;) {
2235 		error = fts_eval_sql(trx, graph);
2236 
2237 		if (error == DB_SUCCESS) {
2238 
2239 			break;				/* Exit the loop. */
2240 		} else {
2241 			ut_print_timestamp(stderr);
2242 
2243 			if (error == DB_LOCK_WAIT_TIMEOUT) {
2244 				fprintf(stderr, " InnoDB: Warning: lock wait "
2245 					"timeout reading FTS index. "
2246 					"Retrying!\n");
2247 
2248 				trx->error_state = DB_SUCCESS;
2249 			} else {
2250 				fprintf(stderr, " InnoDB: Error: %lu "
2251 					"while reading FTS index.\n", error);
2252 
2253 				break;			/* Exit the loop. */
2254 			}
2255 		}
2256 	}
2257 
2258 	fts_que_graph_free(graph);
2259 
2260 	return(error);
2261 }
2262 
2263 /********************************************************************
2264 Get the total number of words in a document.
2265 @return DB_SUCCESS if all go well else error code */
2266 static MY_ATTRIBUTE((nonnull, warn_unused_result))
2267 dberr_t
2268 fts_query_terms_in_document(
2269 /*========================*/
2270 	fts_query_t*	query,		/*!< in: FTS query state */
2271 	doc_id_t	doc_id,		/*!< in: the doc id to check */
2272 	ulint*		total)		/*!< out: total words in document */
2273 {
2274 	pars_info_t*	info;
2275 	dberr_t		error;
2276 	que_t*		graph;
2277 	doc_id_t	read_doc_id;
2278 	trx_t*		trx = query->trx;
2279 
2280 	trx->op_info = "fetching FTS document term count";
2281 
2282 	*total = 0;
2283 
2284 	info = pars_info_create();
2285 
2286 	pars_info_bind_function(info, "my_func", fts_query_sum, total);
2287 
2288 	/* Convert to "storage" byte order. */
2289 	fts_write_doc_id((byte*) &read_doc_id, doc_id);
2290 	fts_bind_doc_id(info, "doc_id", &read_doc_id);
2291 
2292 	query->fts_index_table.suffix = "DOC_ID";
2293 
2294 	graph = fts_parse_sql(
2295 		&query->fts_index_table,
2296 		info,
2297 		"DECLARE FUNCTION my_func;\n"
2298 		"DECLARE CURSOR c IS"
2299 		" SELECT count\n"
2300 		" FROM \"%s\"\n"
2301 		" WHERE doc_id = :doc_id "
2302 		"BEGIN\n"
2303 		"\n"
2304 		"OPEN c;\n"
2305 		"WHILE 1 = 1 LOOP\n"
2306 		"  FETCH c INTO my_func();\n"
2307 		"  IF c % NOTFOUND THEN\n"
2308 		"    EXIT;\n"
2309 		"  END IF;\n"
2310 		"END LOOP;\n"
2311 		"CLOSE c;");
2312 
2313 	for(;;) {
2314 		error = fts_eval_sql(trx, graph);
2315 
2316 		if (error == DB_SUCCESS) {
2317 
2318 			break;				/* Exit the loop. */
2319 		} else {
2320 			ut_print_timestamp(stderr);
2321 
2322 			if (error == DB_LOCK_WAIT_TIMEOUT) {
2323 				fprintf(stderr, " InnoDB: Warning: lock wait "
2324 					"timeout reading FTS doc id table. "
2325 					"Retrying!\n");
2326 
2327 				trx->error_state = DB_SUCCESS;
2328 			} else {
2329 				fprintf(stderr, " InnoDB: Error: %lu "
2330 					"while reading FTS doc id table.\n",
2331 					error);
2332 
2333 				break;			/* Exit the loop. */
2334 			}
2335 		}
2336 	}
2337 
2338 	fts_que_graph_free(graph);
2339 
2340 	return(error);
2341 }
2342 #endif
2343 
2344 /*****************************************************************//**
2345 Retrieve the document and match the phrase tokens.
2346 @return DB_SUCCESS or error code */
2347 static MY_ATTRIBUTE((nonnull, warn_unused_result))
2348 dberr_t
2349 fts_query_match_document(
2350 /*=====================*/
2351 	ib_vector_t*	tokens,		/*!< in: phrase tokens */
2352 	fts_get_doc_t*	get_doc,	/*!< in: table and prepared statements */
2353 	fts_match_t*	match,		/*!< in: doc id and positions */
2354 	ulint		distance,	/*!< in: proximity distance */
2355 	ibool*		found)		/*!< out: TRUE if phrase found */
2356 {
2357 	dberr_t		error;
2358 	fts_phrase_t	phrase;
2359 
2360 	memset(&phrase, 0x0, sizeof(phrase));
2361 
2362 	phrase.match = match;		/* Positions to match */
2363 	phrase.tokens = tokens;		/* Tokens to match */
2364 	phrase.distance = distance;
2365 	phrase.charset = get_doc->index_cache->charset;
2366 	phrase.zip_size = dict_table_zip_size(
2367 		get_doc->index_cache->index->table);
2368 	phrase.heap = mem_heap_create(512);
2369 
2370 	*found = phrase.found = FALSE;
2371 
2372 	error = fts_doc_fetch_by_doc_id(
2373 		get_doc, match->doc_id, NULL, FTS_FETCH_DOC_BY_ID_EQUAL,
2374 		fts_query_fetch_document, &phrase);
2375 
2376 	if (error != DB_SUCCESS) {
2377 		ut_print_timestamp(stderr);
2378 		fprintf(stderr, "InnoDB: Error: (%s) matching document.\n",
2379 			ut_strerr(error));
2380 	} else {
2381 		*found = phrase.found;
2382 	}
2383 
2384 	mem_heap_free(phrase.heap);
2385 
2386 	return(error);
2387 }
2388 
2389 /*****************************************************************//**
2390 This function fetches the original documents and counts the words
2391 between the matching words to check that they are within the specified distance.
2392 @return true if the matching words are within the specified distance */
2393 static MY_ATTRIBUTE((nonnull, warn_unused_result))
2394 bool
2395 fts_query_is_in_proximity_range(
2396 /*============================*/
2397 	const fts_query_t*	query,		/*!< in:  query instance */
2398 	fts_match_t**		match,		/*!< in: matching doc ids
						and positions */
2399 	fts_proximity_t*	qualified_pos)	/*!< in: position info for
2400 						qualified ranges */
2401 {
2402 	fts_get_doc_t		get_doc;
2403 	fts_cache_t*		cache = query->index->table->fts->cache;
2404 	dberr_t			err;
2405 	fts_phrase_t		phrase;
2406 
2407 	memset(&get_doc, 0x0, sizeof(get_doc));
2408 	memset(&phrase, 0x0, sizeof(phrase));
2409 
2410 	rw_lock_x_lock(&cache->lock);
2411 	get_doc.index_cache = fts_find_index_cache(cache, query->index);
2412 	rw_lock_x_unlock(&cache->lock);
2413 	ut_a(get_doc.index_cache != NULL);
2414 
2415 	phrase.distance = query->distance;
2416 	phrase.charset = get_doc.index_cache->charset;
2417 	phrase.zip_size = dict_table_zip_size(
2418 		get_doc.index_cache->index->table);
2419 	phrase.heap = mem_heap_create(512);
2420 	phrase.proximity_pos = qualified_pos;
2421 	phrase.found = FALSE;
2422 
2423 	err = fts_doc_fetch_by_doc_id(
2424 		&get_doc, match[0]->doc_id, NULL, FTS_FETCH_DOC_BY_ID_EQUAL,
2425 		fts_query_fetch_document, &phrase);
2426 
2427 	if (err != DB_SUCCESS) {
2428 		ib_logf(IB_LOG_LEVEL_ERROR,
2429 			"Error: (%s) in verification phase of proximity "
2430 			"search", ut_strerr(err));
2431 	}
2432 
2433 	/* Free the prepared statement. */
2434 	if (get_doc.get_document_graph) {
2435 		fts_que_graph_free(get_doc.get_document_graph);
2436 		get_doc.get_document_graph = NULL;
2437 	}
2438 
2439 	mem_heap_free(phrase.heap);
2440 
2441 	return(err == DB_SUCCESS && phrase.found);
2442 }
2443 
2444 /*****************************************************************//**
2445 Iterate over the matched document ids and search for the
2446 actual phrase in the text.
2447 @return DB_SUCCESS if all OK */
2448 static MY_ATTRIBUTE((nonnull, warn_unused_result))
2449 dberr_t
2450 fts_query_search_phrase(
2451 /*====================*/
2452 	fts_query_t*		query,		/*!< in: query instance */
2453 	ib_vector_t*		orig_tokens,	/*!< in: tokens to search,
2454 						with any stopwords in the
2455 						original phrase */
2456 	ib_vector_t*		tokens)		/*!< in: tokens that do
2457 						not include stopwords and
2458 						can be used to calculate
2459 						ranking */
2460 {
2461 	ulint			i;
2462 	fts_get_doc_t		get_doc;
2463 	ulint			n_matched;
2464 	fts_cache_t*		cache = query->index->table->fts->cache;
2465 
2466 	n_matched = ib_vector_size(query->matched);
2467 
2468 	/* Setup the doc retrieval infrastructure. */
2469 	memset(&get_doc, 0x0, sizeof(get_doc));
2470 
2471 	rw_lock_x_lock(&cache->lock);
2472 
2473 	get_doc.index_cache = fts_find_index_cache(cache, query->index);
2474 
2475 	/* Must find the index cache */
2476 	ut_a(get_doc.index_cache != NULL);
2477 
2478 	rw_lock_x_unlock(&cache->lock);
2479 
2480 #ifdef FTS_INTERNAL_DIAG_PRINT
2481 	ut_print_timestamp(stderr);
2482 	fprintf(stderr, " Start phrase search\n");
2483 #endif
2484 
2485 	/* Read the document from disk and do the actual
2486 	match, matching documents will be added to the current
2487 	doc id set. */
2488 	for (i = 0; i < n_matched && query->error == DB_SUCCESS; ++i) {
2489 		fts_match_t*	match;
2490 		ibool		found = FALSE;
2491 
2492 		match = static_cast<fts_match_t*>(
2493 			ib_vector_get(query->matched, i));
2494 
2495 		/* Skip the document ids that were filtered out by
2496 		an earlier pass. */
2497 		if (match->doc_id != 0) {
2498 
2499 			query->error = fts_query_match_document(
2500 				orig_tokens, &get_doc,
2501 				match, query->distance, &found);
2502 
2503 			if (query->error == DB_SUCCESS && found) {
2504 				ulint	z;
2505 
2506 				query->error = fts_query_process_doc_id(query,
2507 							 match->doc_id, 0);
2508 				if (query->error != DB_SUCCESS) {
2509 					goto func_exit;
2510 				}
2511 
2512 				for (z = 0; z < ib_vector_size(tokens); z++) {
2513 					fts_string_t*   token;
2514 					token = static_cast<fts_string_t*>(
2515 						ib_vector_get(tokens, z));
2516 					fts_query_add_word_to_document(
2517 						query, match->doc_id, token);
2518 				}
2519 			}
2520 		}
2521 	}
2522 
2523 func_exit:
2524 	/* Free the prepared statement. */
2525 	if (get_doc.get_document_graph) {
2526 		fts_que_graph_free(get_doc.get_document_graph);
2527 		get_doc.get_document_graph = NULL;
2528 	}
2529 
2530 	return(query->error);
2531 }
2532 
2533 /*****************************************************************//**
2534 Text/Phrase search.
2535 @return DB_SUCCESS or error code */
2536 static MY_ATTRIBUTE((nonnull, warn_unused_result))
2537 dberr_t
2538 fts_query_phrase_search(
2539 /*====================*/
2540 	fts_query_t*		query,	/*!< in: query instance */
2541 	const fts_string_t*	phrase)	/*!< in: token to search */
2542 {
2543 	ib_vector_t*		tokens;
2544 	ib_vector_t*		orig_tokens;
2545 	mem_heap_t*		heap = mem_heap_create(sizeof(fts_string_t));
2546 	ulint			len = phrase->f_len;
2547 	ulint			cur_pos = 0;
2548 	ib_alloc_t*		heap_alloc;
2549 	ulint			num_token;
2550 	CHARSET_INFO*		charset;
2551 
2552 	charset = query->fts_index_table.charset;
2553 
2554 	heap_alloc = ib_heap_allocator_create(heap);
2555 
2556 	tokens = ib_vector_create(heap_alloc, sizeof(fts_string_t), 4);
2557 	orig_tokens = ib_vector_create(heap_alloc, sizeof(fts_string_t), 4);
2558 
2559 	if (query->distance != ULINT_UNDEFINED && query->distance > 0) {
2560 		query->flags = FTS_PROXIMITY;
2561 	} else {
2562 		query->flags = FTS_PHRASE;
2563 	}
2564 
2565 	/* Split the phrase into tokens. */
2566 	while (cur_pos < len) {
2567 		fts_cache_t*	cache = query->index->table->fts->cache;
2568 		ib_rbt_bound_t	parent;
2569 		ulint		offset;
2570 		ulint		cur_len;
2571 		fts_string_t	result_str;
2572 
2573 		cur_len = innobase_mysql_fts_get_token(
2574 			charset,
2575 			reinterpret_cast<const byte*>(phrase->f_str) + cur_pos,
2576 			reinterpret_cast<const byte*>(phrase->f_str) + len,
2577 			&result_str, &offset);
2578 
2579 		if (cur_len == 0) {
2580 			break;
2581 		}
2582 
2583 		cur_pos += cur_len;
2584 
2585 		if (result_str.f_n_char == 0) {
2586 			continue;
2587 		}
2588 
2589 		fts_string_t*	token = static_cast<fts_string_t*>(
2590 			ib_vector_push(tokens, NULL));
2591 
2592 		token->f_str = static_cast<byte*>(
2593 			mem_heap_alloc(heap, result_str.f_len + 1));
2594 		ut_memcpy(token->f_str, result_str.f_str, result_str.f_len);
2595 
2596 		token->f_len = result_str.f_len;
2597 		token->f_str[token->f_len] = 0;
2598 
2599 		if (cache->stopword_info.cached_stopword
2600 		    && rbt_search(cache->stopword_info.cached_stopword,
2601 			       &parent, token) != 0
2602 		    && result_str.f_n_char >= fts_min_token_size
2603 		    && result_str.f_n_char <= fts_max_token_size) {
2604 			/* Add the word to the RB tree so that we can
2605 			calculate its frequency within a document. */
2606 			fts_query_add_word_freq(query, token);
2607 		} else {
2608 			ib_vector_pop(tokens);
2609 		}
2610 
2611 		/* Store all words, including stopwords, in the
2612 		"orig_tokens" vector, but skip any leading words
2613 		that are stopwords. */
2614 		if (!ib_vector_is_empty(tokens)) {
2615 			fts_string_t*	orig_token = static_cast<fts_string_t*>(
2616 				ib_vector_push(orig_tokens, NULL));
2617 
2618 			orig_token->f_str = token->f_str;
2619 			orig_token->f_len = token->f_len;
2620 		}
2621 	}
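	/* At this point "tokens" holds the words of the phrase that are
	not stopwords and are within the token size limits (used for the
	index lookups and for ranking), while "orig_tokens" also keeps
	any embedded stopwords so that the exact phrase can still be
	matched against the document text. */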
2622 
2623 	num_token = ib_vector_size(tokens);
2624 	if (num_token > MAX_PROXIMITY_ITEM) {
2625 		query->error = DB_FTS_TOO_MANY_WORDS_IN_PHRASE;
2626 		goto func_exit;
2627 	}
2628 
2629 	ut_ad(ib_vector_size(orig_tokens) >= num_token);
2630 
2631 	/* Ignore empty strings. */
2632 	if (num_token > 0) {
2633 		fts_string_t*	token;
2634 		fts_fetch_t	fetch;
2635 		trx_t*		trx = query->trx;
2636 		fts_ast_oper_t	oper = query->oper;
2637 		que_t*		graph = NULL;
2638 		ulint		i;
2639 		dberr_t		error;
2640 
2641 		/* Create the vector for storing matching document ids
2642 		and the positions of the first token of the phrase. */
2643 		if (!query->matched) {
2644 			ib_alloc_t*	heap_alloc;
2645 
2646 			heap_alloc = ib_heap_allocator_create(heap);
2647 
2648 			if (!(query->flags & FTS_PROXIMITY)
2649 			    && !(query->flags & FTS_PHRASE)) {
2650 				query->matched = ib_vector_create(
2651 					heap_alloc, sizeof(fts_match_t),
2652 					64);
2653 			} else {
2654 				ut_a(num_token <= MAX_PROXIMITY_ITEM);
2655 				query->match_array =
2656 					(ib_vector_t**) mem_heap_alloc(
2657 						heap,
2658 						num_token *
2659 						sizeof(query->matched));
2660 
2661 				for (i = 0; i < num_token; i++) {
2662 					query->match_array[i] =
2663 					ib_vector_create(
2664 						heap_alloc, sizeof(fts_match_t),
2665 						64);
2666 				}
2667 
2668 				query->matched = query->match_array[0];
2669 			}
2670 		}
2671 
2672 		/* Setup the callback args for filtering and consolidating
2673 		the ilist. */
2674 		fetch.read_arg = query;
2675 		fetch.read_record = fts_query_index_fetch_nodes;
2676 
2677 		for (i = 0; i < num_token; i++) {
2678 			/* Search for the first word from the phrase. */
2679 			token = static_cast<fts_string_t*>(
2680 				ib_vector_get(tokens, i));
2681 
2682 			if (query->flags & FTS_PROXIMITY
2683 			    || query->flags & FTS_PHRASE) {
2684 				query->matched = query->match_array[i];
2685 			}
2686 
2687 			error = fts_index_fetch_nodes(
2688 				trx, &graph, &query->fts_index_table,
2689 				token, &fetch);
2690 
2691 			/* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */
2692 			ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS));
2693 			if (error != DB_SUCCESS) {
2694 				query->error = error;
2695 			}
2696 
2697 			fts_que_graph_free(graph);
2698 			graph = NULL;
2699 
2700 			fts_query_cache(query, token);
2701 
2702 			if (!(query->flags & FTS_PHRASE)
2703 			    && !(query->flags & FTS_PROXIMITY)) {
2704 				break;
2705 			}
2706 
2707 			/* If any of the tokens can't be found,
2708 			there is no need to continue the match. */
2709 			if (ib_vector_is_empty(query->match_array[i])
2710 			    || query->error != DB_SUCCESS) {
2711 				goto func_exit;
2712 			}
2713 		}
2714 
2715 		/* Just a single word, no need to fetch the original
2716 		documents to do phrase matching */
2717 		if (ib_vector_size(orig_tokens) == 1
2718 		    && !ib_vector_is_empty(query->match_array[0])) {
2719 			fts_match_t*    match;
2720 			ulint		n_matched;
2721 
2722 			n_matched = ib_vector_size(query->match_array[0]);
2723 
2724 			for (i = 0; i < n_matched; i++) {
2725 				match = static_cast<fts_match_t*>(
2726 					ib_vector_get(
2727 						query->match_array[0], i));
2728 
2729 				query->error = fts_query_process_doc_id(
2730 						query, match->doc_id, 0);
2731 				if (query->error != DB_SUCCESS) {
2732 					goto func_exit;
2733 				}
2734 
2735 				fts_query_add_word_to_document(
2736 					query, match->doc_id, token);
2737 			}
2738 			query->oper = oper;
2739 			goto func_exit;
2740 		}
2741 
2742 		/* If we are doing a proximity search, verify the distance
2743 		between all words, and check that they are within the specified distance. */
2744 		if (query->flags & FTS_PROXIMITY) {
2745 			fts_phrase_or_proximity_search(query, tokens);
2746 		} else {
2747 			ibool	matched;
2748 
2749 			/* Phrase Search case:
2750 			We filter out the doc ids that don't contain
2751 			all the tokens in the phrase. It's cheaper to
2752 			search the ilist than bringing the documents in
2753 			and then doing a search through the text. Isolated
2754 			testing shows this also helps in mitigating disruption
2755 			of the buffer cache. */
2756 			matched = fts_phrase_or_proximity_search(query, tokens);
2757 			query->matched = query->match_array[0];
2758 
2759 			/* Read the actual text in and search for the phrase. */
2760 			if (matched) {
2761 				ut_ad(query->error == DB_SUCCESS);
2762 				query->error = fts_query_search_phrase(
2763 					query, orig_tokens, tokens);
2764 			}
2765 		}
2766 
2767 		/* Restore original operation. */
2768 		query->oper = oper;
2769 
2770 		if (query->error != DB_SUCCESS) {
2771 			goto func_exit;
2772 		}
2773 	}
2774 
2775 func_exit:
2776 	mem_heap_free(heap);
2777 
2778 	/* Don't need it anymore. */
2779 	query->matched = NULL;
2780 
2781 	return(query->error);
2782 }
2783 
2784 /*****************************************************************//**
2785 Find the word and evaluate.
2786 @return DB_SUCCESS if all go well */
2787 static MY_ATTRIBUTE((nonnull, warn_unused_result))
2788 dberr_t
2789 fts_query_execute(
2790 /*==============*/
2791 	fts_query_t*		query,	/*!< in: query instance */
2792 	fts_string_t*		token)	/*!< in: token to search */
2793 {
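	/* Dispatch on the operator attached to the token: plain and
	rating-adjusting terms widen the current result set (union),
	an FTS_EXIST ('+') term intersects with it, and an FTS_IGNORE
	('-') term subtracts its documents from it. */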
2794 	switch (query->oper) {
2795 	case FTS_NONE:
2796 	case FTS_NEGATE:
2797 	case FTS_INCR_RATING:
2798 	case FTS_DECR_RATING:
2799 		query->error = fts_query_union(query, token);
2800 		break;
2801 
2802 	case FTS_EXIST:
2803 		query->error = fts_query_intersect(query, token);
2804 		break;
2805 
2806 	case FTS_IGNORE:
2807 		query->error = fts_query_difference(query, token);
2808 		break;
2809 
2810 	default:
2811 		ut_error;
2812 	}
2813 
2814 	return(query->error);
2815 }
2816 
2817 /*****************************************************************//**
2818 Create a wildcard string. It's the responsibility of the caller to
2819 free the byte* pointer. It's allocated using ut_malloc().
2820 @return ptr to allocated memory */
2821 static
2822 byte*
2823 fts_query_get_token(
2824 /*================*/
2825 	fts_ast_node_t*	node,		/*!< in: the current sub tree */
2826 	fts_string_t*	token)		/*!< in: token to create */
2827 {
2828 	ulint		str_len;
2829 	byte*		new_ptr = NULL;
2830 
2831 	str_len = node->term.ptr->len;
2832 
2833 	ut_a(node->type == FTS_AST_TERM);
2834 
2835 	token->f_len = str_len;
2836 	token->f_str = node->term.ptr->str;
2837 
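	/* For a trailing-wildcard term the stored string is extended
	with '%' so that it can be used as a LIKE pattern when fetching
	matching words from the FTS index, e.g. a term entered as 'abc*'
	becomes the pattern 'abc%' (illustrative term only). */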
2838 	if (node->term.wildcard) {
2839 
2840 		token->f_str = static_cast<byte*>(ut_malloc(str_len + 2));
2841 		token->f_len = str_len + 1;
2842 
2843 		memcpy(token->f_str, node->term.ptr->str, str_len);
2844 
2845 		token->f_str[str_len] = '%';
2846 		token->f_str[token->f_len] = 0;
2847 
2848 		new_ptr = token->f_str;
2849 	}
2850 
2851 	return(new_ptr);
2852 }
2853 
2854 /*****************************************************************//**
2855 Visit every node of the AST. */
2856 static
2857 dberr_t
2858 fts_query_visitor(
2859 /*==============*/
2860 	fts_ast_oper_t	oper,		/*!< in: current operator */
2861 	fts_ast_node_t*	node,		/*!< in: The root of the current subtree*/
2862 	void*		arg)		/*!< in: callback arg*/
2863 {
2864 	byte*		ptr;
2865 	fts_string_t	token;
2866 	fts_query_t*	query = static_cast<fts_query_t*>(arg);
2867 
2868 	ut_a(node);
2869 	DBUG_ENTER("fts_query_visitor");
2870 	DBUG_PRINT("fts", ("nodetype: %s", fts_ast_node_type_get(node->type)));
2871 
2872 	token.f_n_char = 0;
2873 	query->oper = oper;
2874 	query->cur_node = node;
2875 
2876 	switch (node->type) {
2877 	case FTS_AST_TEXT:
2878 		token.f_str = node->text.ptr->str;
2879 		token.f_len = node->text.ptr->len;
2880 
2881 		if (query->oper == FTS_EXIST) {
2882 			ut_ad(query->intersection == NULL);
2883 			query->intersection = rbt_create(
2884 				sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
2885 
2886 			query->total_size += SIZEOF_RBT_CREATE;
2887 		}
2888 
2889 		/* Set the current proximity distance. */
2890 		query->distance = node->text.distance;
2891 
2892 		/* Force collection of doc ids and the positions. */
2893 		query->collect_positions = TRUE;
2894 
2895 		query->error = fts_query_phrase_search(query, &token);
2896 
2897 		query->collect_positions = FALSE;
2898 
2899 		if (query->oper == FTS_EXIST) {
2900 			fts_query_free_doc_ids(query, query->doc_ids);
2901 			query->doc_ids = query->intersection;
2902 			query->intersection = NULL;
2903 		}
2904 
2905 		break;
2906 
2907 	case FTS_AST_TERM:
2908 		token.f_str = node->term.ptr->str;
2909 		token.f_len = node->term.ptr->len;
2910 
2911 		/* Add the word to our RB tree that will be used to
2912 		calculate this term's per-document frequency. */
2913 		fts_query_add_word_freq(query, &token);
2914 
2915 		ptr = fts_query_get_token(node, &token);
2916 		query->error = fts_query_execute(query, &token);
2917 
2918 		if (ptr) {
2919 			ut_free(ptr);
2920 		}
2921 		break;
2922 
2923 	case FTS_AST_SUBEXP_LIST:
2924 		query->error = fts_ast_visit_sub_exp(node, fts_query_visitor, arg);
2925 		break;
2926 
2927 	default:
2928 		ut_error;
2929 	}
2930 
2931 	if (query->oper == FTS_EXIST) {
2932 		query->multi_exist = true;
2933 	}
2934 
2935 	DBUG_RETURN(query->error);
2936 }
2937 
2938 /*****************************************************************//**
2939 Process (nested) sub-expression, create a new result set to store the
2940 sub-expression result by processing nodes under current sub-expression
2941 list. Merge the sub-expression result with that of parent expression list.
2942 @return DB_SUCCESS if all goes well */
2943 UNIV_INTERN
2944 dberr_t
2945 fts_ast_visit_sub_exp(
2946 /*==================*/
2947 	fts_ast_node_t*		node,		/*!< in,out: current root node */
2948 	fts_ast_callback	visitor,	/*!< in: callback function */
2949 	void*			arg)		/*!< in,out: arg for callback */
2950 {
2951 	fts_ast_oper_t		cur_oper;
2952 	fts_query_t*		query = static_cast<fts_query_t*>(arg);
2953 	ib_rbt_t*		parent_doc_ids;
2954 	ib_rbt_t*		subexpr_doc_ids;
2955 	dberr_t			error = DB_SUCCESS;
2956 	bool			will_be_ignored = false;
2957 	bool			multi_exist;
2958 
2959 	DBUG_ENTER("fts_ast_visit_sub_exp");
2960 
2961 	/* A sub-expression list may itself contain sub-expressions,
2962 	so we increase the sub-expression depth counter.
2963 	If this counter reaches the threshold then we abort the
2964 	search operation and report an error. */
2965 	if (query->nested_exp_count > FTS_MAX_NESTED_EXP) {
2966 		query->error = DB_FTS_TOO_MANY_NESTED_EXP;
2967 		DBUG_RETURN(query->error);
2968 	}
2969 	query->nested_exp_count++;
2970 
2971 
2972 	ut_a(node->type == FTS_AST_SUBEXP_LIST);
2973 
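	/* For example, for a boolean query such as '+word1 +(word2 word3)'
	the parenthesised sub-expression is evaluated below into its own
	temporary result set, which is then merged back into the parent
	result set according to the operator saved in cur_oper
	(illustrative query only). */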
2974 	cur_oper = query->oper;
2975 
2976 	/* Save current result set */
2977 	parent_doc_ids = query->doc_ids;
2978 
2979 	/* Create new result set to store the sub-expression result. We
2980 	will merge this result set with the parent after processing. */
2981 	query->doc_ids = rbt_create(sizeof(fts_ranking_t),
2982 				    fts_ranking_doc_id_cmp);
2983 
2984 	query->total_size += SIZEOF_RBT_CREATE;
2985 
2986 	multi_exist = query->multi_exist;
2987 	query->multi_exist = false;
2988 	/* Process nodes in current sub-expression and store its
2989 	result set in query->doc_ids we created above. */
2990 	error = fts_ast_visit(FTS_NONE, node, visitor,
2991 			      arg, &will_be_ignored);
2992 
2993 	/* Reinstate parent node state */
2994 	query->multi_exist = multi_exist;
2995 	query->oper = cur_oper;
2996 
2997 	/* Merge the sub-expression result with the parent result set. */
2998 	subexpr_doc_ids = query->doc_ids;
2999 	query->doc_ids = parent_doc_ids;
3000 	if (error == DB_SUCCESS) {
3001 		error = fts_merge_doc_ids(query, subexpr_doc_ids);
3002 	}
3003 
3004 	/* Free current result set. Result already merged into parent. */
3005 	fts_query_free_doc_ids(query, subexpr_doc_ids);
3006 
3007 	DBUG_RETURN(error);
3008 }
3009 
3010 #if 0
3011 /*****************************************************************//***
3012 Check if the doc id exists in the ilist.
3013 @return TRUE if doc id found */
3014 static
3015 ulint
3016 fts_query_find_doc_id(
3017 /*==================*/
3018 	fts_select_t*	select,		/*!< in/out: contains the doc id to
3019 					find, we update the word freq if
3020 					document found */
3021 	void*		data,		/*!< in: doc id ilist */
3022 	ulint		len)		/*!< in: doc id ilist size */
3023 {
3024 	byte*		ptr = data;
3025 	doc_id_t	doc_id = 0;
3026 	ulint		decoded = 0;
3027 
3028 	/* Decode the ilist and search for selected doc_id. We also
3029 	calculate the frequency of the word in the document if found. */
3030 	while (decoded < len && !select->found) {
3031 		ulint		freq = 0;
3032 		ulint		min_pos = 0;
3033 		ulint		last_pos = 0;
3034 		ulint		pos = fts_decode_vlc(&ptr);
3035 
3036 		/* Add the delta. */
3037 		doc_id += pos;
3038 
3039 		while (*ptr) {
3040 			++freq;
3041 			last_pos += fts_decode_vlc(&ptr);
3042 
3043 			/* Only if min_pos is not set and the current
3044 			term exists in a position greater than the
3045 			min_pos of the previous term. */
3046 			if (min_pos == 0 && last_pos > select->min_pos) {
3047 				min_pos = last_pos;
3048 			}
3049 		}
3050 
3051 		/* Skip the end of word position marker. */
3052 		++ptr;
3053 
3054 		/* Bytes decoded so far. */
3055 		decoded = ptr - (byte*) data;
3056 
3057 		/* A word may exist in the document but we only consider a
3058 		match if it exists in a position that is greater than the
3059 		position of the previous term. */
3060 		if (doc_id == select->doc_id && min_pos > 0) {
3061 			fts_doc_freq_t*	doc_freq;
3062 
3063 			/* Add the doc id to the doc freq rb tree, if
3064 			the doc id doesn't exist it will be created. */
3065 			doc_freq = fts_query_add_doc_freq(
3066 				select->word_freq->doc_freqs, doc_id);
3067 
3068 			/* Avoid duplicating the frequency tally */
3069 			if (doc_freq->freq == 0) {
3070 				doc_freq->freq = freq;
3071 			}
3072 
3073 			select->found = TRUE;
3074 			select->min_pos = min_pos;
3075 		}
3076 	}
3077 
3078 	return(select->found);
3079 }
3080 #endif
3081 
3082 /*****************************************************************//**
3083 Read and filter nodes.
3084 @return DB_SUCCESS if all go well,
3085 or return DB_FTS_EXCEED_RESULT_CACHE_LIMIT */
3086 static
3087 dberr_t
3088 fts_query_filter_doc_ids(
3089 /*=====================*/
3090 	fts_query_t*		query,		/*!< in: query instance */
3091 	const fts_string_t*	word,		/*!< in: the current word */
3092 	fts_word_freq_t*	word_freq,	/*!< in/out: word frequency */
3093 	const fts_node_t*	node,		/*!< in: current FTS node */
3094 	void*			data,		/*!< in: doc id ilist */
3095 	ulint			len,		/*!< in: doc id ilist size */
3096 	ibool			calc_doc_count)	/*!< in: whether to remember doc count */
3097 {
3098 	byte*		ptr = static_cast<byte*>(data);
3099 	doc_id_t	doc_id = 0;
3100 	ulint		decoded = 0;
3101 	ib_rbt_t*	doc_freqs = word_freq->doc_freqs;
3102 
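	/* The ilist is a stream of variable-length-coded integers: for
	each document, the delta from the previous doc id, followed by
	the deltas of the word positions within that document and a 0
	byte that terminates the position list. For example, the decoded
	values 2, 5, 12, 0, 3, 7, 0 describe doc id 2 with positions 5
	and 17, then doc id 5 with position 7 (illustrative values only). */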
3103 	/* Decode the ilist and add the doc ids to the query doc_id set. */
3104 	while (decoded < len) {
3105 		ulint		freq = 0;
3106 		fts_doc_freq_t*	doc_freq;
3107 		fts_match_t*	match = NULL;
3108 		ulint		last_pos = 0;
3109 		ulint		pos = fts_decode_vlc(&ptr);
3110 
3111 		/* Some sanity checks. */
3112 		if (doc_id == 0) {
3113 			ut_a(pos == node->first_doc_id);
3114 		}
3115 
3116 		/* Add the delta. */
3117 		doc_id += pos;
3118 
3119 		if (calc_doc_count) {
3120 			word_freq->doc_count++;
3121 		}
3122 
3123 		/* We simply collect the matching instances here. */
3124 		if (query->collect_positions) {
3125 			ib_alloc_t*	heap_alloc;
3126 
3127 			/* Create a new fts_match_t instance. */
3128 			match = static_cast<fts_match_t*>(
3129 				ib_vector_push(query->matched, NULL));
3130 
3131 			match->start = 0;
3132 			match->doc_id = doc_id;
3133 			heap_alloc = ib_vector_allocator(query->matched);
3134 
3135 			/* Allocate from the same heap as the
3136 			parent container. */
3137 			match->positions = ib_vector_create(
3138 				heap_alloc, sizeof(ulint), 64);
3139 
3140 			query->total_size += sizeof(fts_match_t)
3141 				+ sizeof(ib_vector_t)
3142 				+ sizeof(ulint) * 64;
3143 		}
3144 
3145 		/* Unpack the positions within the document. */
3146 		while (*ptr) {
3147 			last_pos += fts_decode_vlc(&ptr);
3148 
3149 			/* Collect the matching word positions, for phrase
3150 			matching later. */
3151 			if (query->collect_positions) {
3152 				ib_vector_push(match->positions, &last_pos);
3153 			}
3154 
3155 			++freq;
3156 		}
3157 
3158 		/* End of list marker. */
3159 		last_pos = (ulint) -1;
3160 
3161 		if (query->collect_positions) {
3162 			ut_a(match != NULL);
3163 			ib_vector_push(match->positions, &last_pos);
3164 		}
3165 
3166 		/* Add the doc id to the doc freq rb tree, if the doc id
3167 		doesn't exist it will be created. */
3168 		doc_freq = fts_query_add_doc_freq(query, doc_freqs, doc_id);
3169 
3170 		/* Avoid duplicating frequency tally. */
3171 		if (doc_freq->freq == 0) {
3172 			doc_freq->freq = freq;
3173 		}
3174 
3175 		/* Skip the end of word position marker. */
3176 		++ptr;
3177 
3178 		/* Bytes decoded so far */
3179 		decoded = ptr - (byte*) data;
3180 
3181 		/* We simply collect the matching documents and the
3182 		positions here and match later. */
3183 		if (!query->collect_positions) {
3184 			/* We ignore error here and will check it later */
3185 			fts_query_process_doc_id(query, doc_id, 0);
3186 
3187 			/* Add the word to the document's matched RB tree. */
3188 			fts_query_add_word_to_document(query, doc_id, word);
3189 		}
3190 	}
3191 
3192 	/* Some sanity checks. */
3193 	ut_a(doc_id == node->last_doc_id);
3194 
3195 	if (query->total_size > fts_result_cache_limit) {
3196 		return(DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
3197 	} else {
3198 		return(DB_SUCCESS);
3199 	}
3200 }
3201 
3202 /*****************************************************************//**
3203 Read the FTS INDEX row.
3204 @return DB_SUCCESS if all go well. */
3205 static
3206 dberr_t
3207 fts_query_read_node(
3208 /*================*/
3209 	fts_query_t*		query,	/*!< in: query instance */
3210 	const fts_string_t*	word,	/*!< in: current word */
3211 	que_node_t*		exp)	/*!< in: query graph node */
3212 {
3213 	int			i;
3214 	int			ret;
3215 	fts_node_t		node;
3216 	ib_rbt_bound_t		parent;
3217 	fts_word_freq_t*	word_freq;
3218 	ibool			skip = FALSE;
3219 	fts_string_t		term;
3220 	byte			buf[FTS_MAX_WORD_LEN + 1];
3221 	dberr_t			error = DB_SUCCESS;
3222 
3223 	ut_a(query->cur_node->type == FTS_AST_TERM ||
3224 	     query->cur_node->type == FTS_AST_TEXT);
3225 
3226 	memset(&node, 0, sizeof(node));
3227 	term.f_str = buf;
3228 
3229 	/* Need to consider the wildcard search case: the word frequency
3230 	is created for the search string, not the actual word, so we need
3231 	to record the frequency on behalf of the search string. */
3232 	if (query->cur_node->type == FTS_AST_TERM
3233 	    && query->cur_node->term.wildcard) {
3234 		term.f_len = query->cur_node->term.ptr->len;
3235 		ut_ad(FTS_MAX_WORD_LEN >= term.f_len);
3236 		memcpy(term.f_str, query->cur_node->term.ptr->str, term.f_len);
3237 	} else {
3238 		term.f_len = word->f_len;
3239 		ut_ad(FTS_MAX_WORD_LEN >= word->f_len);
3240 		memcpy(term.f_str, word->f_str, word->f_len);
3241 	}
3242 
3243 	/* Lookup the word in our rb tree, it must exist. */
3244 	ret = rbt_search(query->word_freqs, &parent, &term);
3245 
3246 	ut_a(ret == 0);
3247 
3248 	word_freq = rbt_value(fts_word_freq_t, parent.last);
3249 
3250 	/* Start from 1 since the first column has been read by the caller.
3251 	Also, we rely on the order of the columns projected to filter
3252 	out ilists that are out of range, and we always want to read
3253 	the doc_count irrespective of the suitability of the row. */
3254 
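	/* The columns projected by the SELECT are expected to be:
	word (already consumed by the caller), doc_count, first_doc_id,
	last_doc_id and ilist, in that order. */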
3255 	for (i = 1; exp && !skip; exp = que_node_get_next(exp), ++i) {
3256 
3257 		dfield_t*	dfield = que_node_get_val(exp);
3258 		byte*		data = static_cast<byte*>(
3259 			dfield_get_data(dfield));
3260 		ulint		len = dfield_get_len(dfield);
3261 
3262 		ut_a(len != UNIV_SQL_NULL);
3263 
3264 		/* Note: The column numbers below must match the SELECT. */
3265 
3266 		switch (i) {
3267 		case 1: /* DOC_COUNT */
3268 			word_freq->doc_count += mach_read_from_4(data);
3269 			break;
3270 
3271 		case 2: /* FIRST_DOC_ID */
3272 			node.first_doc_id = fts_read_doc_id(data);
3273 
3274 			/* Skip nodes whose doc ids are out of range. */
3275 			if (query->oper == FTS_EXIST
3276 			    && query->upper_doc_id > 0
3277 			    && node.first_doc_id > query->upper_doc_id) {
3278 				skip = TRUE;
3279 			}
3280 			break;
3281 
3282 		case 3: /* LAST_DOC_ID */
3283 			node.last_doc_id = fts_read_doc_id(data);
3284 
3285 			/* Skip nodes whose doc ids are out of range. */
3286 			if (query->oper == FTS_EXIST
3287 			    && query->lower_doc_id > 0
3288 			    && node.last_doc_id < query->lower_doc_id) {
3289 				skip = TRUE;
3290 			}
3291 			break;
3292 
3293 		case 4: /* ILIST */
3294 
3295 			error = fts_query_filter_doc_ids(
3296 					query, &word_freq->word, word_freq,
3297 					&node, data, len, FALSE);
3298 
3299 			break;
3300 
3301 		default:
3302 			ut_error;
3303 		}
3304 	}
3305 
3306 	if (!skip) {
3307 		/* Make sure all columns were read. */
3308 
3309 		ut_a(i == 5);
3310 	}
3311 
3312 	return error;
3313 }
3314 
3315 /*****************************************************************//**
3316 Callback function to fetch the rows in an FTS INDEX record.
3317 @return TRUE to continue fetching, FALSE on error */
3318 static
3319 ibool
3320 fts_query_index_fetch_nodes(
3321 /*========================*/
3322 	void*		row,		/*!< in: sel_node_t* */
3323 	void*		user_arg)	/*!< in: pointer to fts_fetch_t */
3324 {
3325 	fts_string_t	key;
3326 	sel_node_t*	sel_node = static_cast<sel_node_t*>(row);
3327 	fts_fetch_t*	fetch = static_cast<fts_fetch_t*>(user_arg);
3328 	fts_query_t*	query = static_cast<fts_query_t*>(fetch->read_arg);
3329 	que_node_t*	exp = sel_node->select_list;
3330 	dfield_t*	dfield = que_node_get_val(exp);
3331 	void*		data = dfield_get_data(dfield);
3332 	ulint		dfield_len = dfield_get_len(dfield);
3333 
3334 	key.f_str = static_cast<byte*>(data);
3335 	key.f_len = dfield_len;
3336 
3337 	ut_a(dfield_len <= FTS_MAX_WORD_LEN);
3338 
3339 	/* Note: we pass error out by 'query->error' */
3340 	query->error = fts_query_read_node(query, &key, que_node_get_next(exp));
3341 
3342 	if (query->error != DB_SUCCESS) {
3343 		ut_ad(query->error == DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
3344 		return(FALSE);
3345 	} else {
3346 		return(TRUE);
3347 	}
3348 }
3349 
3350 /*****************************************************************//**
3351 Calculate the inverse document frequency (IDF) for all the terms. */
3352 static
3353 void
3354 fts_query_calculate_idf(
3355 /*====================*/
3356 	fts_query_t*	query)	/*!< in: Query state */
3357 {
3358 	const ib_rbt_node_t*	node;
3359 	ib_uint64_t		total_docs = query->total_docs;
3360 
3361 	/* Iterate over all the matched terms and calculate their
3362 	inverse document frequency (IDF). */
3363 	for (node = rbt_first(query->word_freqs);
3364 	     node;
3365 	     node = rbt_next(query->word_freqs, node)) {
3366 
3367 		fts_word_freq_t*	word_freq;
3368 
3369 		word_freq = rbt_value(fts_word_freq_t, node);
3370 
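		/* IDF = log10(total_docs / doc_count). For example, if
		there are 1000 documents in total and a word appears in
		10 of them, its IDF is log10(1000 / 10) = 2
		(illustrative numbers only). */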
3371 		if (word_freq->doc_count > 0) {
3372 			if (total_docs == word_freq->doc_count) {
3373 				/* The query processor assumes ranking > 0
3374 				if we find a match. Since log10(1) = 0, we
3375 				cannot let IDF be zero when a word appears
3376 				in every document, so make it an arbitrary
3377 				very small positive number. */
3378 				word_freq->idf = log10(1.0001);
3379 			} else {
3380 				word_freq->idf = log10(
3381 					total_docs
3382 					/ (double) word_freq->doc_count);
3383 			}
3384 		}
3385 
3386 		if (fts_enable_diag_print) {
3387 			fprintf(stderr,"'%s' -> " UINT64PF "/" UINT64PF
3388 				" %6.5lf\n",
3389 			        word_freq->word.f_str,
3390 			        query->total_docs, word_freq->doc_count,
3391 			        word_freq->idf);
3392 		}
3393 	}
3394 }
3395 
3396 /*****************************************************************//**
3397 Calculate the ranking of the document. */
3398 static
3399 void
3400 fts_query_calculate_ranking(
3401 /*========================*/
3402 	const fts_query_t*	query,		/*!< in: query state */
3403 	fts_ranking_t*		ranking)	/*!< in: Document to rank */
3404 {
3405 	ulint	pos = 0;
3406 	fts_string_t	word;
3407 
3408 	/* At this stage, ranking->rank should not exceed the 1.0
3409 	bound */
3410 	ut_ad(ranking->rank <= 1.0 && ranking->rank >= -1.0);
3411 	ut_ad(rbt_size(query->word_map) == query->word_vector->size());
3412 
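	/* Each matched word contributes freq * idf * idf to the rank,
	i.e. the word's frequency in this document weighted by the
	square of its inverse document frequency. */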
3413 	while (fts_ranking_words_get_next(query, ranking, &pos, &word)) {
3414 		int			ret;
3415 		ib_rbt_bound_t		parent;
3416 		double			weight;
3417 		fts_doc_freq_t*		doc_freq;
3418 		fts_word_freq_t*	word_freq;
3419 
3420 		ret = rbt_search(query->word_freqs, &parent, &word);
3421 
3422 		/* It must exist. */
3423 		ut_a(ret == 0);
3424 
3425 		word_freq = rbt_value(fts_word_freq_t, parent.last);
3426 
3427 		ret = rbt_search(
3428 			word_freq->doc_freqs, &parent, &ranking->doc_id);
3429 
3430 		/* It must exist. */
3431 		ut_a(ret == 0);
3432 
3433 		doc_freq = rbt_value(fts_doc_freq_t, parent.last);
3434 
3435 		weight = (double) doc_freq->freq * word_freq->idf;
3436 
3437 		ranking->rank += (fts_rank_t) (weight * word_freq->idf);
3438 	}
3439 }
3440 
3441 /*****************************************************************//**
3442 Add ranking to the result set. */
3443 static
3444 void
3445 fts_query_add_ranking(
3446 /*==================*/
3447 	fts_query_t*		query,		/*!< in: query state */
3448 	ib_rbt_t*		ranking_tree,	/*!< in: ranking tree */
3449 	const fts_ranking_t*	new_ranking)	/*!< in: ranking of a document */
3450 {
3451 	ib_rbt_bound_t		parent;
3452 
3453 	/* Lookup the ranking in our rb tree and add if it doesn't exist. */
3454 	if (rbt_search(ranking_tree, &parent, new_ranking) == 0) {
3455 		fts_ranking_t*	ranking;
3456 
3457 		ranking = rbt_value(fts_ranking_t, parent.last);
3458 
3459 		ranking->rank += new_ranking->rank;
3460 
3461 		ut_a(ranking->words == NULL);
3462 	} else {
3463 		rbt_add_node(ranking_tree, &parent, new_ranking);
3464 
3465 		query->total_size += SIZEOF_RBT_NODE_ADD
3466 			+ sizeof(fts_ranking_t);
3467 	}
3468 }
3469 
3470 /*****************************************************************//**
3471 Retrieve the FTS Relevance Ranking result for doc with doc_id
3472 @return the relevance ranking value, 0 if no ranking value
3473 present. */
3474 float
3475 fts_retrieve_ranking(
3476 /*=================*/
3477 	fts_result_t*	result,	/*!< in: FTS result structure */
3478 	doc_id_t	doc_id)	/*!< in: doc_id of the item to retrieve */
3479 {
3480 	ib_rbt_bound_t		parent;
3481 	fts_ranking_t		new_ranking;
3482 
3483 	DBUG_ENTER("fts_retrieve_ranking");
3484 
3485 	if (!result || !result->rankings_by_id) {
3486 		DBUG_RETURN(0);
3487 	}
3488 
3489 	new_ranking.doc_id = doc_id;
3490 
3491 	/* Lookup the ranking in our rb tree */
3492 	if (rbt_search(result->rankings_by_id, &parent, &new_ranking) == 0) {
3493 		fts_ranking_t*  ranking;
3494 
3495 		ranking = rbt_value(fts_ranking_t, parent.last);
3496 
3497 		DBUG_RETURN(ranking->rank);
3498 	}
3499 
3500 	DBUG_RETURN(0);
3501 }
3502 
3503 /*****************************************************************//**
3504 Create the result and copy the data to it. */
3505 static
3506 fts_result_t*
3507 fts_query_prepare_result(
3508 /*=====================*/
3509 	fts_query_t*	query,	/*!< in: Query state */
3510 	fts_result_t*	result)	/*!< in: result; this can contain
3511 				data from a previous search on
3512 				another FTS index */
3513 {
3514 	const ib_rbt_node_t*	node;
3515 	bool			result_is_null = false;
3516 
3517 	DBUG_ENTER("fts_query_prepare_result");
3518 
3519 	if (result == NULL) {
3520 		result = static_cast<fts_result_t*>(ut_malloc(sizeof(*result)));
3521 
3522 		memset(result, 0x0, sizeof(*result));
3523 
3524 		result->rankings_by_id = rbt_create(
3525 			sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
3526 
3527 		query->total_size += sizeof(fts_result_t) + SIZEOF_RBT_CREATE;
3528 		result_is_null = true;
3529 	}
3530 
3531 	if (query->flags == FTS_OPT_RANKING) {
3532 		fts_word_freq_t*	word_freq;
3533 		ulint		size = ib_vector_size(query->deleted->doc_ids);
3534 		fts_update_t*	array =
3535 			(fts_update_t*) query->deleted->doc_ids->data;
3536 
3537 		node = rbt_first(query->word_freqs);
3538 		ut_ad(node);
3539 		word_freq = rbt_value(fts_word_freq_t, node);
3540 
3541 		for (node = rbt_first(word_freq->doc_freqs);
3542 		     node;
3543 		     node = rbt_next(word_freq->doc_freqs, node)) {
3544 			fts_doc_freq_t* doc_freq;
3545 			fts_ranking_t	ranking;
3546 
3547 			doc_freq = rbt_value(fts_doc_freq_t, node);
3548 
3549 			/* Don't put deleted docs into result */
3550 			if (fts_bsearch(array, 0, static_cast<int>(size),
3551 					doc_freq->doc_id) >= 0) {
3552 				/* one less matching doc count */
3553 				--word_freq->doc_count;
3554 				continue;
3555 			}
3556 
3557 			ranking.doc_id = doc_freq->doc_id;
3558 			ranking.rank = static_cast<fts_rank_t>(doc_freq->freq);
3559 			ranking.words = NULL;
3560 
3561 			fts_query_add_ranking(query, result->rankings_by_id,
3562 					      &ranking);
3563 
3564 			if (query->total_size > fts_result_cache_limit) {
3565 				query->error = DB_FTS_EXCEED_RESULT_CACHE_LIMIT;
3566 				fts_query_free_result(result);
3567 				DBUG_RETURN(NULL);
3568 			}
3569 		}
3570 
3571 		/* Calculate IDF only after we exclude the deleted items */
3572 		fts_query_calculate_idf(query);
3573 
3574 		node = rbt_first(query->word_freqs);
3575 		word_freq = rbt_value(fts_word_freq_t, node);
3576 
3577 		/* Calculate the ranking for each doc */
3578 		for (node = rbt_first(result->rankings_by_id);
3579 		     node != NULL;
3580 		     node = rbt_next(result->rankings_by_id, node)) {
3581 
3582 			fts_ranking_t*  ranking;
3583 
3584 			ranking = rbt_value(fts_ranking_t, node);
3585 
3586 			ranking->rank = static_cast<fts_rank_t>(
3587 				ranking->rank * word_freq->idf * word_freq->idf);
3588 		}
3589 
3590 		DBUG_RETURN(result);
3591 	}
3592 
3593 	ut_a(rbt_size(query->doc_ids) > 0);
3594 
3595 	for (node = rbt_first(query->doc_ids);
3596 	     node;
3597 	     node = rbt_next(query->doc_ids, node)) {
3598 
3599 		fts_ranking_t*	ranking;
3600 
3601 		ranking = rbt_value(fts_ranking_t, node);
3602 		fts_query_calculate_ranking(query, ranking);
3603 
3604 		// FIXME: I think we may require this information to improve the
3605 		// ranking of doc ids which have more word matches from
3606 		// different FTS indexes.
3607 
3608 		/* We don't need these anymore; free the resources. */
3609 		ranking->words = NULL;
3610 
3611 		if (!result_is_null) {
3612 			fts_query_add_ranking(query, result->rankings_by_id, ranking);
3613 
3614 			if (query->total_size > fts_result_cache_limit) {
3615 				query->error = DB_FTS_EXCEED_RESULT_CACHE_LIMIT;
3616 				fts_query_free_result(result);
3617 				DBUG_RETURN(NULL);
3618 			}
3619 		}
3620 	}
3621 
3622 	if (result_is_null) {
3623 		/* Use doc_ids directly */
3624 		rbt_free(result->rankings_by_id);
3625 		result->rankings_by_id = query->doc_ids;
3626 		query->doc_ids = NULL;
3627 	}
3628 
3629 	DBUG_RETURN(result);
3630 }
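/* Summary of the two paths above: for FTS_OPT_RANKING (a single-term
query) the result is built directly from the lone word's doc_freqs tree,
deleted documents are skipped via a binary search over the sorted
query->deleted ids, and rank = freq * idf * idf is applied in one sweep
once the IDF has been recalculated. For all other queries the per-document
ranks are accumulated from query->doc_ids through
fts_query_calculate_ranking(); and when the caller passed in no prior
result, query->doc_ids is handed over wholesale instead of being copied. */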
3631 
3632 /*****************************************************************//**
3633 Get the result of the query. Calculate the similarity coefficient. */
3634 static
3635 fts_result_t*
3636 fts_query_get_result(
3637 /*=================*/
3638 	fts_query_t*		query,	/*!< in: query instance */
3639 	fts_result_t*		result)	/*!< in: result */
3640 {
3641 	DBUG_ENTER("fts_query_get_result");
3642 
3643 	if (rbt_size(query->doc_ids) > 0 || query->flags == FTS_OPT_RANKING) {
3644 		/* Copy the doc ids to the result. */
3645 		result = fts_query_prepare_result(query, result);
3646 	} else {
3647 		/* Create an empty result instance. */
3648 		result = static_cast<fts_result_t*>(ut_malloc(sizeof(*result)));
3649 		memset(result, 0, sizeof(*result));
3650 	}
3651 
3652 	DBUG_RETURN(result);
3653 }
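/* If the query matched nothing and is not an optimized single-term query,
the caller still receives a non-NULL, zero-filled fts_result_t, so the
result can always be released uniformly with fts_query_free_result(). */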
3654 
3655 /*****************************************************************//**
3656 FTS Query free resources and reset. */
3657 static
3658 void
3659 fts_query_free(
3660 /*===========*/
3661 	fts_query_t*	query)		/*!< in: query instance to free*/
3662 {
3663 
3664 	if (query->read_nodes_graph) {
3665 		fts_que_graph_free(query->read_nodes_graph);
3666 	}
3667 
3668 	if (query->root) {
3669 		fts_ast_free_node(query->root);
3670 	}
3671 
3672 	if (query->deleted) {
3673 		fts_doc_ids_free(query->deleted);
3674 	}
3675 
3676 	if (query->intersection) {
3677 		fts_query_free_doc_ids(query, query->intersection);
3678 		query->intersection = NULL;
3679 	}
3680 
3681 	if (query->doc_ids) {
3682 		fts_query_free_doc_ids(query, query->doc_ids);
3683 	}
3684 
3685 	if (query->word_freqs) {
3686 		const ib_rbt_node_t*	node;
3687 
3688 		/* We need to free any instances of fts_doc_freq_t that we
3689 		may have allocated. */
3690 		for (node = rbt_first(query->word_freqs);
3691 		     node;
3692 		     node = rbt_next(query->word_freqs, node)) {
3693 
3694 			fts_word_freq_t*	word_freq;
3695 
3696 			word_freq = rbt_value(fts_word_freq_t, node);
3697 
3698 			/* Free this word's tree of doc frequencies. */
3699 			rbt_free(word_freq->doc_freqs);
3700 		}
3701 
3702 		rbt_free(query->word_freqs);
3703 	}
3704 
3705 	ut_a(!query->intersection);
3706 
3707 	if (query->word_map) {
3708 		rbt_free(query->word_map);
3709 	}
3710 
3711 	if (query->word_vector) {
3712 		delete query->word_vector;
3713 	}
3714 
3715 	if (query->heap) {
3716 		mem_heap_free(query->heap);
3717 	}
3718 
3719 	memset(query, 0, sizeof(*query));
3720 }
3721 
3722 /*****************************************************************//**
3723 Parse the query using flex/bison. */
3724 static
3725 fts_ast_node_t*
3726 fts_query_parse(
3727 /*============*/
3728 	fts_query_t*	query,		/*!< in: query instance */
3729 	byte*		query_str,	/*!< in: query string */
3730 	ulint		query_len)	/*!< in: query string length */
3731 {
3732 	int		error;
3733 	fts_ast_state_t state;
3734 	bool		mode = query->boolean_mode;
3735 	DBUG_ENTER("fts_query_parse");
3736 
3737 	memset(&state, 0x0, sizeof(state));
3738 
3739 	/* Set up the scanner to use; this depends on the mode flag. */
3740 	state.lexer = fts_lexer_create(mode, query_str, query_len);
3741 	state.charset = query->fts_index_table.charset;
3742 	error = fts_parse(&state);
3743 	fts_lexer_free(state.lexer);
3744 	state.lexer = NULL;
3745 
3746 	/* Error during parsing? */
3747 	if (error) {
3748 		/* Free the nodes that were allocated during parsing. */
3749 		fts_ast_state_free(&state);
3750 	} else {
3751 		query->root = state.root;
3752 	}
3753 
3754 	DBUG_RETURN(state.root);
3755 }
3756 
3757 /*******************************************************************//**
3758 FTS Query optimization
3759 Set FTS_OPT_RANKING if it is a simple term query */
3760 static
3761 void
3762 fts_query_can_optimize(
3763 /*===================*/
3764 	fts_query_t*	query,		/*!< in/out: query instance */
3765 	uint		flags)		/*!< In: FTS search mode */
3766 {
3767 	fts_ast_node_t*	node = query->root;
3768 
3769 	if (flags & FTS_EXPAND) {
3770 		return;
3771 	}
3772 
3773 	/* Check if the query has only a single term without an operator */
3774 	ut_ad(node->type == FTS_AST_LIST);
3775 	node = node->list.head;
3776 	if (node != NULL && node->type == FTS_AST_TERM && node->next == NULL) {
3777 		query->flags = FTS_OPT_RANKING;
3778 	}
3779 }
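/* Illustrative examples for the optimization above: a plain single term
such as "apple" takes the FTS_OPT_RANKING path. Queries such as
"apple banana", "+apple -pie" or any FTS_EXPAND search keep the default
flags, because the AST list then contains more than one node or an
operator node rather than a bare term. */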
3780 
3781 /*******************************************************************//**
3782 Pre-process the query string
3783 1) make it lower case
3784 2) in boolean mode, if there is a '-' or '+' that is immediately preceded
3785 and followed by a valid word, replace it with a space
3786 @return the processed string */
3787 static
3788 byte*
3789 fts_query_str_preprocess(
3790 /*=====================*/
3791 	const byte*	query_str,	/*!< in: FTS query */
3792 	ulint		query_len,	/*!< in: FTS query string len */
3793 	ulint		*result_len,	/*!< out: result string length */
3794 	CHARSET_INFO*	charset,	/*!< in: string charset */
3795 	bool		boolean_mode)	/*!< in: is boolean mode */
3796 {
3797 	ulint	cur_pos = 0;
3798 	ulint	str_len;
3799 	byte*	str_ptr;
3800 	bool	in_phrase = false;
3801 
3802 	/* Convert the query string to lower case before parsing. The
3803 	returned ut_malloc'ed buffer must be freed by the caller. */
3804 
3805 	str_len = query_len * charset->casedn_multiply + 1;
3806 	str_ptr = static_cast<byte*>(ut_malloc(str_len));
3807 
3808 	/* For binary collations, a case sensitive search is
3809 	performed. Hence don't convert to lower case. */
3810 	if (my_binary_compare(charset)) {
3811 		memcpy(str_ptr, query_str, query_len);
3812 		str_ptr[query_len]= 0;
3813 		*result_len= query_len;
3814 	} else {
3815 		*result_len = innobase_fts_casedn_str(
3816 				charset, const_cast<char*>
3817 				(reinterpret_cast<const char*>( query_str)),
3818 				query_len,
3819 				reinterpret_cast<char*>(str_ptr), str_len);
3820 	}
3821 
3822 	ut_ad(*result_len < str_len);
3823 
3824 	str_ptr[*result_len] = 0;
3825 
3826 	/* If it is boolean mode, no need to check for '-/+' */
3827 	if (!boolean_mode) {
3828 		return(str_ptr);
3829 	}
3830 
3831 	/* Otherwise, we traverse the string to find any '-'/'+' that is
3832 	immediately preceded and followed by a valid search word.
3833 	NOTE: we should not do so for CJK languages; this should
3834 	be taken care of in our CJK implementation */
3835 	while (cur_pos < *result_len) {
3836 		fts_string_t	str;
3837 		ulint		offset;
3838 		ulint		cur_len;
3839 
3840 		cur_len = innobase_mysql_fts_get_token(
3841 			charset, str_ptr + cur_pos, str_ptr + *result_len,
3842 			&str, &offset);
3843 
3844 		if (cur_len == 0 || str.f_str == NULL) {
3845 			/* No valid word found */
3846 			break;
3847 		}
3848 
3849 		/* Check if we are in a phrase, if so, no need to do
3850 		replacement of '-/+'. */
3851 		for (byte* ptr = str_ptr + cur_pos; ptr < str.f_str; ptr++) {
3852 			if ((char) (*ptr) == '"' ) {
3853 				in_phrase = !in_phrase;
3854 			}
3855 		}
3856 
3857 		/* Find those that are not leading '-'/'+' and are not in a phrase */
3858 		if (cur_pos > 0 && str.f_str - str_ptr - cur_pos == 1
3859 		    && !in_phrase) {
3860 			char*	last_op = reinterpret_cast<char*>(
3861 						str_ptr + cur_pos);
3862 
3863 			if (*last_op == '-' || *last_op == '+') {
3864 				*last_op = ' ';
3865 			}
3866 		}
3867 
3868 		cur_pos += cur_len;
3869 	}
3870 
3871 	return(str_ptr);
3872 }
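/* Worked example for the '-'/'+' handling above (illustrative, assuming a
non-binary latin charset in boolean mode): an input such as

	apple -pie semi-final

roughly becomes

	apple -pie semi final

The leading '-' before "pie" survives because it acts as a boolean
operator (the gap before the token is more than one character, or the
token starts the string), while the '-' inside "semi-final" is immediately
preceded and followed by word characters and is therefore replaced with a
space; anything inside a quoted phrase is left untouched. */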
3873 
3874 /*******************************************************************//**
3875 FTS Query entry point.
3876 @return DB_SUCCESS if successful otherwise error code */
3877 UNIV_INTERN
3878 dberr_t
3879 fts_query(
3880 /*======*/
3881 	trx_t*		trx,		/*!< in: transaction */
3882 	dict_index_t*	index,		/*!< in: The FTS index to search */
3883 	uint		flags,		/*!< in: FTS search mode */
3884 	const byte*	query_str,	/*!< in: FTS query */
3885 	ulint		query_len,	/*!< in: FTS query string len
3886 					in bytes */
3887 	fts_result_t**	result)		/*!< in/out: result doc ids */
3888 {
3889 	fts_query_t	query;
3890 	dberr_t		error = DB_SUCCESS;
3891 	byte*		lc_query_str;
3892 	ulint		result_len;
3893 	bool		boolean_mode;
3894 	trx_t*		query_trx;
3895 	CHARSET_INFO*	charset;
3896 	ulint		start_time_ms;
3897 	bool		will_be_ignored = false;
3898 
3899 	boolean_mode = flags & FTS_BOOL;
3900 
3901 	*result = NULL;
3902 	memset(&query, 0x0, sizeof(query));
3903 	query_trx = trx_allocate_for_background();
3904 	query_trx->op_info = "FTS query";
3905 
3906 	start_time_ms = ut_time_ms();
3907 
3908 	query.trx = query_trx;
3909 	query.index = index;
3910 	query.boolean_mode = boolean_mode;
3911 	query.deleted = fts_doc_ids_create();
3912 	query.cur_node = NULL;
3913 
3914 	query.fts_common_table.type = FTS_COMMON_TABLE;
3915 	query.fts_common_table.table_id = index->table->id;
3916 	query.fts_common_table.parent = index->table->name;
3917 	query.fts_common_table.table = index->table;
3918 
3919 	charset = fts_index_get_charset(index);
3920 
3921 	query.fts_index_table.type = FTS_INDEX_TABLE;
3922 	query.fts_index_table.index_id = index->id;
3923 	query.fts_index_table.table_id = index->table->id;
3924 	query.fts_index_table.parent = index->table->name;
3925 	query.fts_index_table.charset = charset;
3926 	query.fts_index_table.table = index->table;
3927 
3928 	query.word_map = rbt_create_arg_cmp(
3929 		sizeof(fts_string_t), innobase_fts_text_cmp, charset);
3930 	query.word_vector = new word_vector_t;
3931 	query.error = DB_SUCCESS;
3932 
3933 	/* Set up the RB tree that will be used to collect per-term
3934 	statistics. */
3935 	query.word_freqs = rbt_create_arg_cmp(
3936 		sizeof(fts_word_freq_t), innobase_fts_text_cmp, charset);
3937 
3938 	query.total_size += SIZEOF_RBT_CREATE;
3939 
3940 	query.total_docs = dict_table_get_n_rows(index->table);
3941 
3942 #ifdef FTS_DOC_STATS_DEBUG
3943 	if (fts_enable_diag_print) {
3944 		error = fts_get_total_word_count(
3945 			trx, query.index, &query.total_words);
3946 
3947 		if (error != DB_SUCCESS) {
3948 			goto func_exit;
3949 		}
3950 
3951 		fprintf(stderr, "Total docs: " UINT64PF " Total words: %lu\n",
3952 			query.total_docs, query.total_words);
3953 	}
3954 #endif /* FTS_DOC_STATS_DEBUG */
3955 
3956 	query.fts_common_table.suffix = "DELETED";
3957 
3958 	/* Read the deleted doc_ids, we need these for filtering. */
3959 	error = fts_table_fetch_doc_ids(
3960 		NULL, &query.fts_common_table, query.deleted);
3961 
3962 	if (error != DB_SUCCESS) {
3963 		goto func_exit;
3964 	}
3965 
3966 	query.fts_common_table.suffix = "DELETED_CACHE";
3967 
3968 	error = fts_table_fetch_doc_ids(
3969 		NULL, &query.fts_common_table, query.deleted);
3970 
3971 	if (error != DB_SUCCESS) {
3972 		goto func_exit;
3973 	}
3974 
3975 	/* Get the deleted doc ids that are in the cache. */
3976 	fts_cache_append_deleted_doc_ids(
3977 		index->table->fts->cache, query.deleted->doc_ids);
3978 	DEBUG_SYNC_C("fts_deleted_doc_ids_append");
3979 
3980 	/* Sort the vector so that we can do a binary search over the ids. */
3981 	ib_vector_sort(query.deleted->doc_ids, fts_update_doc_id_cmp);
3982 
3983 #if 0
3984 	/* Convert the query string to lower case before parsing. We own
3985 	the ut_malloc'ed result and so remember to free it before return. */
3986 
3987 	lc_query_str_len = query_len * charset->casedn_multiply + 1;
3988 	lc_query_str = static_cast<byte*>(ut_malloc(lc_query_str_len));
3989 
3990 	result_len = innobase_fts_casedn_str(
3991 		charset, (char*) query_str, query_len,
3992 		(char*) lc_query_str, lc_query_str_len);
3993 
3994 	ut_ad(result_len < lc_query_str_len);
3995 
3996 	lc_query_str[result_len] = 0;
3997 
3998 #endif
3999 
4000 	lc_query_str = fts_query_str_preprocess(
4001 		query_str, query_len, &result_len, charset, boolean_mode);
4002 
4003 	query.heap = mem_heap_create(128);
4004 
4005 	/* Create the rb tree for the doc id (current) set. */
4006 	query.doc_ids = rbt_create(
4007 		sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
4008 
4009 	query.total_size += SIZEOF_RBT_CREATE;
4010 
4011 	/* Parse the input query string. */
4012 	if (fts_query_parse(&query, lc_query_str, result_len)) {
4013 		fts_ast_node_t*	ast = query.root;
4014 
4015 		/* Optimize query to check if it's a single term */
4016 		fts_query_can_optimize(&query, flags);
4017 
4018 		DBUG_EXECUTE_IF("fts_instrument_result_cache_limit",
4019 			        fts_result_cache_limit = 2048;
4020 		);
4021 
4022 		/* Traverse the Abstract Syntax Tree (AST) and execute
4023 		the query. */
4024 		query.error = fts_ast_visit(
4025 			FTS_NONE, ast, fts_query_visitor,
4026 			&query, &will_be_ignored);
4027 
4028 		/* If query expansion is requested, extend the search
4029 		with first search pass result */
4030 		if (query.error == DB_SUCCESS && (flags & FTS_EXPAND)) {
4031 			query.error = fts_expand_query(index, &query);
4032 		}
4033 
4034 		/* Calculate the inverse document frequency of the terms. */
4035 		if (query.error == DB_SUCCESS
4036 		    && query.flags != FTS_OPT_RANKING) {
4037 			fts_query_calculate_idf(&query);
4038 		}
4039 
4040 		/* Copy the result from the query state, so that we can
4041 		return it to the caller. */
4042 		if (query.error == DB_SUCCESS) {
4043 			*result = fts_query_get_result(&query, *result);
4044 		}
4045 
4046 		error = query.error;
4047 	} else {
4048 		/* still return an empty result set */
4049 		*result = static_cast<fts_result_t*>(
4050 			ut_malloc(sizeof(**result)));
4051 		memset(*result, 0, sizeof(**result));
4052 	}
4053 
4054 	ut_free(lc_query_str);
4055 
4056 	if (fts_enable_diag_print && (*result)) {
4057 		ulint	diff_time = ut_time_ms() - start_time_ms;
4058 		fprintf(stderr, "FTS Search Processing time: %ld secs:"
4059 				" %ld millisec: row(s) %d \n",
4060 			diff_time / 1000, diff_time % 1000,
4061 			(*result)->rankings_by_id
4062 				? (int) rbt_size((*result)->rankings_by_id)
4063 				: -1);
4064 
4065 		/* Log memory consumption & result size */
4066 		ib_logf(IB_LOG_LEVEL_INFO,
4067 			"Full Search Memory: "
4068 			"%lu (bytes),  Row: %lu .",
4069 			query.total_size,
4070 			(*result)->rankings_by_id
4071 				?  rbt_size((*result)->rankings_by_id)
4072 				: 0);
4073 	}
4074 
4075 func_exit:
4076 	fts_query_free(&query);
4077 
4078 	trx_free_for_background(query_trx);
4079 
4080 	return(error);
4081 }
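/* Call-flow sketch for fts_query() (illustrative; the handler-level caller
is not part of this file): a background transaction is allocated, the
DELETED / DELETED_CACHE doc ids are fetched and sorted, the query string is
case-folded and pre-processed, parsed into an AST, optionally flagged as
FTS_OPT_RANKING, executed via fts_ast_visit(), optionally expanded when
FTS_EXPAND is set, and the ranked doc ids are returned through *result.
A caller might use it roughly like this:

	fts_result_t*	result = NULL;
	dberr_t		err;

	err = fts_query(trx, index, FTS_BOOL, query_str, query_len, &result);

	if (err == DB_SUCCESS && result->rankings_by_id != NULL) {
		fts_query_sort_result_on_rank(result);
	}

	fts_query_free_result(result);
*/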
4082 
4083 /*****************************************************************//**
4084 FTS Query free result, returned by fts_query(). */
4085 
4086 void
4087 fts_query_free_result(
4088 /*==================*/
4089 	fts_result_t*	result)		/*!< in: result instance to free.*/
4090 {
4091 	if (result) {
4092 		if (result->rankings_by_id != NULL) {
4093 			rbt_free(result->rankings_by_id);
4094 			result->rankings_by_id = NULL;
4095 		}
4096 		if (result->rankings_by_rank != NULL) {
4097 			rbt_free(result->rankings_by_rank);
4098 			result->rankings_by_rank = NULL;
4099 		}
4100 
4101 		ut_free(result);
4102 		result = NULL;
4103 	}
4104 }
4105 
4106 /*****************************************************************//**
4107 FTS Query sort result, returned by fts_query() on fts_ranking_t::rank. */
4108 
4109 void
4110 fts_query_sort_result_on_rank(
4111 /*==========================*/
4112 	fts_result_t*	result)		/*!< out: result instance to sort.*/
4113 {
4114 	const ib_rbt_node_t*	node;
4115 	ib_rbt_t*		ranked;
4116 
4117 	ut_a(result->rankings_by_id != NULL);
4118 	if (result->rankings_by_rank) {
4119 		rbt_free(result->rankings_by_rank);
4120 	}
4121 
4122 	ranked = rbt_create(sizeof(fts_ranking_t), fts_query_compare_rank);
4123 
4124 	/* Copy each ranking from the tree ordered by doc id into
4125 	the tree ordered by rank. */
4126 	for (node = rbt_first(result->rankings_by_id);
4127 	     node;
4128 	     node = rbt_next(result->rankings_by_id, node)) {
4129 
4130 		fts_ranking_t*	ranking;
4131 
4132 		ranking = rbt_value(fts_ranking_t, node);
4133 
4134 		ut_a(ranking->words == NULL);
4135 
4136 		rbt_insert(ranked, ranking, ranking);
4137 	}
4138 
4139 	/* Reset the current node too. */
4140 	result->current = NULL;
4141 	result->rankings_by_rank = ranked;
4142 }
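/* After this call result->rankings_by_rank holds the same rankings keyed
by fts_query_compare_rank, i.e. ordered by rank instead of by doc id, and
result->current is reset so a caller can begin a fresh walk over the
rank-ordered tree; rankings_by_id itself is left untouched. */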
4143 
4144 #ifdef UNIV_DEBUG
4145 /*******************************************************************//**
4146 A debug function to print result doc_id set. */
4147 static
4148 void
4149 fts_print_doc_id(
4150 /*=============*/
4151 	fts_query_t*	query)	/*!< in : tree that stores doc_ids.*/
4152 {
4153 	const ib_rbt_node_t*	node;
4154 
4155 	/* Iterate each member of the doc_id set */
4156 	for (node = rbt_first(query->doc_ids);
4157 	     node;
4158 	     node = rbt_next(query->doc_ids, node)) {
4159 		fts_ranking_t*	ranking;
4160 		ranking = rbt_value(fts_ranking_t, node);
4161 
4162 		ib_logf(IB_LOG_LEVEL_INFO, "doc_ids info, doc_id: %ld \n",
4163 			(ulint) ranking->doc_id);
4164 
4165 		ulint		pos = 0;
4166 		fts_string_t	word;
4167 
4168 		while (fts_ranking_words_get_next(query, ranking, &pos, &word)) {
4169 			ib_logf(IB_LOG_LEVEL_INFO, "doc_ids info, value: %s \n", word.f_str);
4170 		}
4171 	}
4172 }
4173 #endif
4174 
4175 /*************************************************************//**
4176 This function implements a simple "blind" query expansion search:
4177 words in documents found in the first search pass will be used as
4178 search arguments to search the documents again, thus "expanding"
4179 the search result set.
4180 @return DB_SUCCESS if success, otherwise the error code */
4181 static MY_ATTRIBUTE((nonnull, warn_unused_result))
4182 dberr_t
4183 fts_expand_query(
4184 /*=============*/
4185 	dict_index_t*	index,		/*!< in: FTS index to search */
4186 	fts_query_t*	query)		/*!< in: FTS query instance */
4187 {
4188 	const ib_rbt_node_t*	node;
4189 	const ib_rbt_node_t*	token_node;
4190 	fts_doc_t		result_doc;
4191 	dberr_t			error = DB_SUCCESS;
4192 	const fts_index_cache_t*index_cache;
4193 
4194 	/* If no doc is found in first search pass, return */
4195 	if (!rbt_size(query->doc_ids)) {
4196 		return(error);
4197 	}
4198 
4199 	/* Init "result_doc", to hold words from the first search pass */
4200 	fts_doc_init(&result_doc);
4201 
4202 	rw_lock_x_lock(&index->table->fts->cache->lock);
4203 	index_cache = fts_find_index_cache(index->table->fts->cache, index);
4204 	rw_lock_x_unlock(&index->table->fts->cache->lock);
4205 
4206 	ut_a(index_cache);
4207 
4208 	result_doc.tokens = rbt_create_arg_cmp(
4209 		sizeof(fts_token_t), innobase_fts_text_cmp,
4210 		index_cache->charset);
4211 
4212 	result_doc.charset = index_cache->charset;
4213 
4214 	query->total_size += SIZEOF_RBT_CREATE;
4215 #ifdef UNIV_DEBUG
4216 	fts_print_doc_id(query);
4217 #endif
4218 
4219 	for (node = rbt_first(query->doc_ids);
4220 	     node;
4221 	     node = rbt_next(query->doc_ids, node)) {
4222 
4223 		fts_ranking_t*	ranking;
4224 		ulint		pos;
4225 		fts_string_t	word;
4226 		ulint		prev_token_size;
4227 		ulint		estimate_size;
4228 
4229 		prev_token_size = rbt_size(result_doc.tokens);
4230 
4231 		ranking = rbt_value(fts_ranking_t, node);
4232 
4233 		/* Fetch the documents with the doc_id from the
4234 		result of the first search pass. Since we do not
4235 		store a document-to-word mapping, we need to
4236 		fetch the original documents and parse them.
4237 		Future optimization could be done here if we
4238 		support some forms of document-to-word mapping */
4239 		fts_doc_fetch_by_doc_id(NULL, ranking->doc_id, index,
4240 					FTS_FETCH_DOC_BY_ID_EQUAL,
4241 					fts_query_expansion_fetch_doc,
4242 					&result_doc);
4243 
4244 		/* Remove words that have already been searched in the
4245 		first pass */
4246 		pos = 0;
4247 		while (fts_ranking_words_get_next(query, ranking, &pos,
4248 		       &word)) {
4249 			ibool		ret;
4250 
4251 			ret = rbt_delete(result_doc.tokens, &word);
4252 
4253 			/* The word must exist in the doc we found */
4254 			if (!ret) {
4255 				ib_logf(IB_LOG_LEVEL_ERROR, "Did not "
4256 					"find word %s in doc %ld for query "
4257 					"expansion search.\n", word.f_str,
4258 					(ulint) ranking->doc_id);
4259 			}
4260 		}
4261 
4262 		/* Estimate memory used, see fts_process_token and fts_token_t.
4263 		   We ignore token size here. */
4264 		estimate_size = (rbt_size(result_doc.tokens) - prev_token_size)
4265 			* (SIZEOF_RBT_NODE_ADD + sizeof(fts_token_t)
4266 			+ sizeof(ib_vector_t) + sizeof(ulint) * 32);
4267 		query->total_size += estimate_size;
4268 
4269 		if (query->total_size > fts_result_cache_limit) {
4270 			error = DB_FTS_EXCEED_RESULT_CACHE_LIMIT;
4271 			goto	func_exit;
4272 		}
4273 	}
4274 
4275 	/* Search the table the second time with expanded search list */
4276 	for (token_node = rbt_first(result_doc.tokens);
4277 	     token_node;
4278 	     token_node = rbt_next(result_doc.tokens, token_node)) {
4279 		fts_token_t*	mytoken;
4280 		mytoken = rbt_value(fts_token_t, token_node);
4281 
4282 		ut_ad(mytoken->text.f_str[mytoken->text.f_len] == 0);
4283 		fts_query_add_word_freq(query, &mytoken->text);
4284 		error = fts_query_union(query, &mytoken->text);
4285 
4286 		if (error != DB_SUCCESS) {
4287 			break;
4288 		}
4289 	}
4290 
4291 func_exit:
4292 	fts_doc_free(&result_doc);
4293 
4294 	return(error);
4295 }
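/* Expansion flow in brief: every document matched in the first pass is
re-fetched and re-tokenized into result_doc.tokens, the words that were
already part of the original query are removed again, and each remaining
token is fed back through fts_query_union() as an additional OR-ed search
term. The per-token memory estimate (SIZEOF_RBT_NODE_ADD +
sizeof(fts_token_t) + sizeof(ib_vector_t) + 32 * sizeof(ulint)) is only an
approximation, as noted above, since the token text itself is ignored. */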
4296 /*************************************************************//**
4297 This function finds documents that contain all words in a
4298 phrase or proximity search. For a proximity search, it also verifies
4299 that the words are close enough to each other, within the specified distance.
4300 This function is called for phrase and proximity search.
4301 @return TRUE if documents are found, FALSE if otherwise */
4302 static
4303 ibool
4304 fts_phrase_or_proximity_search(
4305 /*===========================*/
4306 	fts_query_t*	query,		/*!< in/out:  query instance.
4307 					query->doc_ids might be instantiated
4308 					with qualified doc IDs */
4309 	ib_vector_t*	tokens)		/*!< in: Tokens contain words */
4310 {
4311 	ulint		n_matched;
4312 	ulint		i;
4313 	ibool		matched = FALSE;
4314 	ulint		num_token = ib_vector_size(tokens);
4315 	fts_match_t*	match[MAX_PROXIMITY_ITEM];
4316 	ibool		end_list = FALSE;
4317 
4318 	/* Number of matched documents for the first token */
4319 	n_matched = ib_vector_size(query->match_array[0]);
4320 
4321 	/* We have a match list for each word; we shall
4322 	walk through the lists and find common documents that
4323 	contain all the matching words. */
4324 	for (i = 0; i < n_matched; i++) {
4325 		ulint		j;
4326 		ulint		k = 0;
4327 		fts_proximity_t	qualified_pos;
4328 
4329 		match[0] = static_cast<fts_match_t*>(
4330 			ib_vector_get(query->match_array[0], i));
4331 
4332 		/* For the remaining match lists for the tokens (words), we
4333 		try to see if there is a document with the same
4334 		doc id */
4335 		for (j = 1; j < num_token; j++) {
4336 			match[j] = static_cast<fts_match_t*>(
4337 				ib_vector_get(query->match_array[j], k));
4338 
4339 			while (match[j]->doc_id < match[0]->doc_id
4340 			       && k < ib_vector_size(query->match_array[j])) {
4341 				 match[j] = static_cast<fts_match_t*>(
4342 					ib_vector_get(
4343 						query->match_array[j], k));
4344 				k++;
4345 			}
4346 
4347 			if (match[j]->doc_id > match[0]->doc_id) {
4348 				/* no match */
4349 				if (query->flags & FTS_PHRASE) {
4350 					match[0]->doc_id = 0;
4351 				}
4352 				break;
4353 			}
4354 
4355 			if (k == ib_vector_size(query->match_array[j])) {
4356 				end_list = TRUE;
4357 
4358 				if (match[j]->doc_id != match[0]->doc_id) {
4359 					/* no match */
4360 					if (query->flags & FTS_PHRASE) {
4361 						ulint	s;
4362 
4363 						match[0]->doc_id = 0;
4364 
4365 						for (s = i + 1; s < n_matched;
4366 						     s++) {
4367 							match[0] = static_cast<
4368 							fts_match_t*>(
4369 							ib_vector_get(
4370 							query->match_array[0],
4371 							s));
4372 							match[0]->doc_id = 0;
4373 						}
4374 					}
4375 
4376 					goto func_exit;
4377 				}
4378 			}
4379 
4380 			/* FIXME: A better solution would be a counter array
4381 			to remember each run's last position, so we don't
4382 			reset it here every time */
4383 			k = 0;
4384 		}
4385 
4386 		if (j != num_token) {
4387 			continue;
4388 		}
4389 
4390 		/* For this matching doc, we need to further
4391 		verify whether the words in the doc are close
4392 		to each other, and within the distance specified
4393 		in the proximity search */
4394 		if (query->flags & FTS_PHRASE) {
4395 			matched = TRUE;
4396 		} else if (fts_proximity_get_positions(
4397 			match, num_token, ULINT_MAX, &qualified_pos)) {
4398 
4399 			/* Fetch the original documents and count the
4400 			words in between the matching words to see whether
4401 			they are within the specified distance */
4402 			if (fts_query_is_in_proximity_range(
4403 				query, match, &qualified_pos)) {
4404 				/* If so, mark we find a matching doc */
4405 				query->error = fts_query_process_doc_id(
4406 					query, match[0]->doc_id, 0);
4407 				if (query->error != DB_SUCCESS) {
4408 					matched = FALSE;
4409 					goto func_exit;
4410 				}
4411 
4412 				matched = TRUE;
4413 				for (ulint z = 0; z < num_token; z++) {
4414 					fts_string_t*	token;
4415 					token = static_cast<fts_string_t*>(
4416 						ib_vector_get(tokens, z));
4417 					fts_query_add_word_to_document(
4418 						query, match[0]->doc_id, token);
4419 				}
4420 			}
4421 		}
4422 
4423 		if (end_list) {
4424 			break;
4425 		}
4426 	}
4427 
4428 func_exit:
4429 	return(matched);
4430 }
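/* The loop above walks the per-word match arrays
(query->match_array[0..num_token-1]), each of which is ordered by doc id:
for every candidate doc id taken from the first word's list, each other
list is scanned from its start until it reaches, passes, or runs out
before that doc id (the FIXME above notes that remembering the last scan
position per list would avoid the rescans). Only when all num_token lists
contain the same doc id is the phrase/proximity verification run for that
document. */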
4431 
4432 /*************************************************************//**
4433 This function checks whether words in result documents are close to
4434 each other (within proximity range as specified by "distance").
4435 If "distance" is ULINT_MAX, then it will find all combinations of
4436 positions of matching words and store min and max positions
4437 in the "qualified_pos" for later verification.
4438 @return true if words are close to each other, false if otherwise */
4439 static
4440 bool
4441 fts_proximity_get_positions(
4442 /*========================*/
4443 	fts_match_t**		match,		/*!< in: query instance */
4444 	ulint			num_match,	/*!< in: number of matching
4445 						items */
4446 	ulint			distance,	/*!< in: distance value
4447 						for proximity search */
4448 	fts_proximity_t*	qualified_pos)	/*!< out: the position info
4449 						records ranges containing
4450 						all matching words. */
4451 {
4452 	ulint	i;
4453 	ulint	idx[MAX_PROXIMITY_ITEM];
4454 	ulint	num_pos[MAX_PROXIMITY_ITEM];
4455 	ulint	min_idx;
4456 
4457 	qualified_pos->n_pos = 0;
4458 
4459 	ut_a(num_match <= MAX_PROXIMITY_ITEM);
4460 
4461 	/* Each word could appear multiple times in a doc. So
4462 	we need to walk through each word's position list, and find
4463 	the closest distance between different words to see if
4464 	they are within the proximity distance. */
4465 
4466 	/* Assuming each word's position list is sorted, we
4467 	just walk through all the words' lists,
4468 	similar to the merge phase of a merge sort */
4469 	for (i = 0; i < num_match; i++) {
4470 		/* idx is the current position we are checking
4471 		for a particular word */
4472 		idx[i] = 0;
4473 
4474 		/* Number of positions for this word */
4475 		num_pos[i] = ib_vector_size(match[i]->positions);
4476 	}
4477 
4478 	/* Start with the first word */
4479 	min_idx = 0;
4480 
4481 	while (idx[min_idx] < num_pos[min_idx]) {
4482 		ulint	position[MAX_PROXIMITY_ITEM];
4483 		ulint	min_pos = ULINT_MAX;
4484 		ulint	max_pos = 0;
4485 
4486 		/* Check positions in each word position list, and
4487 		record the max/min position */
4488 		for (i = 0; i < num_match; i++) {
4489 			position[i] = *(ulint*) ib_vector_get_const(
4490 				match[i]->positions, idx[i]);
4491 
4492 			if (position[i] == ULINT_UNDEFINED) {
4493 				break;
4494 			}
4495 
4496 			if (position[i] < min_pos) {
4497 				min_pos = position[i];
4498 				min_idx = i;
4499 			}
4500 
4501 			if (position[i] > max_pos) {
4502 				max_pos = position[i];
4503 			}
4504 		}
4505 
4506 		/* If max and min position are within range, we
4507 		find a good match */
4508 		if (max_pos - min_pos <= distance
4509 		    && (i >= num_match || position[i] != ULINT_UNDEFINED)) {
4510 			/* The charset may have variable character
4511 			length encoding; record the min_pos and
4512 			max_pos, as we will need to verify the actual
4513 			number of characters later */
4514 			qualified_pos->min_pos.push_back(min_pos);
4515 			qualified_pos->max_pos.push_back(max_pos);
4516 			qualified_pos->n_pos++;
4517 		}
4518 
4519 		/* Otherwise, move to the next position in the
4520 		list for the word with the smallest position */
4521 		idx[min_idx]++;
4522 	}
4523 
4524 	return(qualified_pos->n_pos != 0);
4525 }
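/* Worked example for the merge-style walk above (illustrative): with two
words whose sorted position lists are {5, 40} and {8, 100}, the loop looks
at (5, 8) with range 3, advances the list holding the smaller position to
get (40, 8) with range 32, then (40, 100) with range 60, and stops once
the list holding the current minimum is exhausted. With distance == 10
only the first combination is recorded in qualified_pos; with
distance == ULINT_MAX all three are. */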
4526