1 /*****************************************************************************
2 
3 Copyright (c) 2007, 2019, Oracle and/or its affiliates. All Rights Reserved.
4 
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License, version 2.0,
7 as published by the Free Software Foundation.
8 
9 This program is also distributed with certain software (including
10 but not limited to OpenSSL) that is licensed under separate terms,
11 as designated in a particular file or component or in included license
12 documentation.  The authors of MySQL hereby grant you an additional
13 permission to link the program and your derivative works with the
14 separately licensed software that they have included with MySQL.
15 
16 This program is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19 GNU General Public License, version 2.0, for more details.
20 
21 You should have received a copy of the GNU General Public License along with
22 this program; if not, write to the Free Software Foundation, Inc.,
23 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
24 
25 *****************************************************************************/
26 
27 /******************************************************************//**
28 @file include/fts0types.h
29 Full text search types file
30 
31 Created 2007-03-27 Sunny Bains
32 *******************************************************/
33 
34 #ifndef INNOBASE_FTS0TYPES_H
35 #define INNOBASE_FTS0TYPES_H
36 
37 #include "univ.i"
38 #include "fts0fts.h"
39 #include "fut0fut.h"
40 #include "pars0pars.h"
41 #include "que0types.h"
42 #include "ut0byte.h"
43 #include "ut0rbt.h"
44 
45 /** Types used within FTS. */
46 struct fts_que_t;
47 struct fts_node_t;
48 
49 /** Callbacks used within FTS. */
50 typedef pars_user_func_cb_t fts_sql_callback;
51 typedef void (*fts_filter)(void*, fts_node_t*, void*, ulint len);
52 
53 /** Statistics relevant to a particular document, used during retrieval. */
54 struct fts_doc_stats_t {
55 	doc_id_t	doc_id;		/*!< Document id */
56 	ulint		word_count;	/*!< Total words in the document */
57 };
58 
59 /** It's main purpose is to store the SQL prepared statements that
60 are required to retrieve a document from the database. */
61 struct fts_get_doc_t {
62 	fts_index_cache_t*
63 			index_cache;	/*!< The index cache instance */
64 
65 					/*!< Parsed sql statement */
66 	que_t*		get_document_graph;
67 	fts_cache_t*	cache;		/*!< The parent cache */
68 };
69 
70 /** Since we can have multiple FTS indexes on a table, we keep a
71 per index cache of words etc. */
72 struct fts_index_cache_t {
73 	dict_index_t*	index;		/*!< The FTS index instance */
74 
75 	ib_rbt_t*	words;		/*!< Nodes; indexed by fts_string_t*,
76 					cells are fts_tokenizer_word_t*.*/
77 
78 	ib_vector_t*	doc_stats;	/*!< Array of the fts_doc_stats_t
79 					contained in the memory buffer.
80 					Must be in sorted order (ascending).
81 					The  ideal choice is an rb tree but
82 					the rb tree imposes a space overhead
83 					that we can do without */
84 
85 	que_t**		ins_graph;	/*!< Insert query graphs */
86 
87 	que_t**		sel_graph;	/*!< Select query graphs */
88 	CHARSET_INFO*	charset;	/*!< charset */
89 };
90 
91 /** For supporting the tracking of updates on multiple FTS indexes we need
92 to track which FTS indexes need to be updated. For INSERT and DELETE we
93 update all fts indexes. */
94 struct fts_update_t {
95 	doc_id_t	doc_id;		/*!< The doc id affected */
96 
97 	ib_vector_t*	fts_indexes;	/*!< The FTS indexes that need to be
98 					updated. A NULL value means all
99 					indexes need to be updated.  This
100 					vector is not allocated on the heap
101 					and so must be freed explicitly,
102 					when we are done with it */
103 };
104 
105 /** Stop word control infotmation. */
106 struct fts_stopword_t {
107 	ulint		status;		/*!< Status of the stopword tree */
108 	ib_alloc_t*	heap;		/*!< The memory allocator to use */
109 	ib_rbt_t*	cached_stopword;/*!< This stores all active stopwords */
110 	CHARSET_INFO*	charset;	/*!< charset for stopword */
111 };
112 
113 /** The SYNC state of the cache. There is one instance of this struct
114 associated with each ADD thread. */
115 struct fts_sync_t {
116 	trx_t*		trx;		/*!< The transaction used for SYNCing
117 					the cache to disk */
118 	dict_table_t*	table;		/*!< Table with FTS index(es) */
119 	ulint		max_cache_size;	/*!< Max size in bytes of the cache */
120 	ibool		cache_full;	/*!< flag, when true it indicates that
121 					we need to sync the cache to disk */
122 	ulint		lower_index;	/*!< the start index of the doc id
123 					vector from where to start adding
124 					documents to the FTS cache */
125 	ulint		upper_index;	/*!< max index of the doc id vector to
126 					add to the FTS cache */
127 	ibool		interrupted;	/*!< TRUE if SYNC was interrupted */
128 	doc_id_t	min_doc_id;	/*!< The smallest doc id added to the
129 					cache. It should equal to
130 					doc_ids[lower_index] */
131 	doc_id_t	max_doc_id;	/*!< The doc id at which the cache was
132 					noted as being full, we use this to
133 					set the upper_limit field */
134 	ib_time_monotonic_t	start_time;
135 	/*!< SYNC start time */
136 	bool		in_progress;	/*!< flag whether sync is in progress.*/
137 	bool		unlock_cache;	/*!< flag whether unlock cache when
138 					write fts node */
139 	os_event_t	event;		/*!< sync finish event */
140 };
141 
142 /** The cache for the FTS system. It is a memory-based inverted index
143 that new entries are added to, until it grows over the configured maximum
144 size, at which time its contents are written to the INDEX table. */
145 struct fts_cache_t {
146 	rw_lock_t	lock;		/*!< lock protecting all access to the
147 					memory buffer. FIXME: this needs to
148 					be our new upgrade-capable rw-lock */
149 
150 	rw_lock_t	init_lock;	/*!< lock used for the cache
151 					intialization, it has different
152 					SYNC level as above cache lock */
153 
154 	ib_mutex_t	optimize_lock;	/*!< Lock for OPTIMIZE */
155 
156 	ib_mutex_t	deleted_lock;	/*!< Lock covering deleted_doc_ids */
157 
158 	ib_mutex_t	doc_id_lock;	/*!< Lock covering Doc ID */
159 
160 	ib_vector_t*	deleted_doc_ids;/*!< Array of deleted doc ids, each
161 					element is of type fts_update_t */
162 
163 	ib_vector_t*	indexes;	/*!< We store the stats and inverted
164 					index for the individual FTS indexes
165 					in this vector. Each element is
166 					an instance of fts_index_cache_t */
167 
168 	ib_vector_t*	get_docs;	/*!< information required to read
169 					the document from the table. Each
170 					element is of type fts_doc_t */
171 
172 	ulint		total_size;	/*!< total size consumed by the ilist
173 					field of all nodes. SYNC is run
174 					whenever this gets too big */
175 	fts_sync_t*	sync;		/*!< sync structure to sync data to
176 					disk */
177 	ib_alloc_t*	sync_heap;	/*!< The heap allocator, for indexes
178 					and deleted_doc_ids, ie. transient
179 					objects, they are recreated after
180 					a SYNC is completed */
181 
182 	ib_alloc_t*	self_heap;	/*!< This heap is the heap out of
183 					which an instance of the cache itself
184 					was created. Objects created using
185 					this heap will last for the lifetime
186 					of the cache */
187 
188 	doc_id_t	next_doc_id;	/*!< Next doc id */
189 
190 	doc_id_t	synced_doc_id;	/*!< Doc ID sync-ed to CONFIG table */
191 
192 	doc_id_t	first_doc_id;	/*!< first doc id since this table
193 					was opened */
194 
195 	ulint		deleted;	/*!< Number of doc ids deleted since
196 					last optimized. This variable is
197 					covered by deleted_lock */
198 
199 	ulint		added;		/*!< Number of doc ids added since last
200 					optimized. This variable is covered by
201 					the deleted lock */
202 
203 	fts_stopword_t	stopword_info;	/*!< Cached stopwords for the FTS */
204 	mem_heap_t*	cache_heap;	/*!< Cache Heap */
205 };
206 
207 /** Columns of the FTS auxiliary INDEX table */
208 struct fts_node_t {
209 	doc_id_t	first_doc_id;	/*!< First document id in ilist. */
210 
211 	doc_id_t	last_doc_id;	/*!< Last document id in ilist. */
212 
213 	byte*		ilist;		/*!< Binary list of documents & word
214 					positions the token appears in.
215 					TODO: For now, these are simply
216 					ut_malloc'd, but if testing shows
217 					that they waste memory unacceptably, a
218 					special memory allocator will have
219 					to be written */
220 
221 	ulint		doc_count;	/*!< Number of doc ids in ilist */
222 
223 	ulint		ilist_size;	/*!< Used size of ilist in bytes. */
224 
225 	ulint		ilist_size_alloc;
226 					/*!< Allocated size of ilist in
227 					bytes */
228 	bool		synced;		/*!< flag whether the node is synced */
229 };
230 
231 /** A tokenizer word. Contains information about one word. */
232 struct fts_tokenizer_word_t {
233 	fts_string_t	text;		/*!< Token text. */
234 
235 	ib_vector_t*	nodes;		/*!< Word node ilists, each element is
236 					of type fts_node_t */
237 };
238 
239 /** Word text plus it's array of nodes as on disk in FTS index */
240 struct fts_word_t {
241 	fts_string_t	text;		/*!< Word value in UTF-8 */
242 	ib_vector_t*	nodes;		/*!< Nodes read from disk */
243 
244 	ib_alloc_t*	heap_alloc;	/*!< For handling all allocations */
245 };
246 
247 /** Callback for reading and filtering nodes that are read from FTS index */
248 struct fts_fetch_t {
249 	void*		read_arg;	/*!< Arg for the sql_callback */
250 
251 	fts_sql_callback
252 			read_record;	/*!< Callback for reading index
253 					record */
254 	ulint		total_memory;	/*!< Total memory used */
255 };
256 
257 /** For horizontally splitting an FTS auxiliary index */
258 struct fts_index_selector_t {
259 	ulint		value;		/*!< Character value at which
260 					to split */
261 
262 	const char*	suffix;		/*!< FTS aux index suffix */
263 };
264 
265 /** This type represents a single document. */
266 struct fts_doc_t {
267 	fts_string_t	text;		/*!< document text */
268 
269 	ibool		found;		/*!< TRUE if the document was found
270 					successfully in the database */
271 
272 	ib_rbt_t*	tokens;		/*!< This is filled when the document
273 					is tokenized. Tokens; indexed by
274 					fts_string_t*, cells are of type
275 					fts_token_t* */
276 
277 	ib_alloc_t*	self_heap;	/*!< An instance of this type is
278 					allocated from this heap along
279 					with any objects that have the
280 					same lifespan, most notably
281 					the vector of token positions */
282 	CHARSET_INFO*	charset;	/*!< Document's charset info */
283 
284 	st_mysql_ftparser* parser;	/*!< fts plugin parser */
285 
286 	bool		is_ngram;	/*!< Whether it is a ngram parser */
287 
288 	ib_rbt_t*	stopwords;	/*!< Stopwords */
289 };
290 
291 /** A token and its positions within a document. */
292 struct fts_token_t {
293 	fts_string_t	text;		/*!< token text */
294 
295 	ib_vector_t*	positions;	/*!< an array of the positions the
296 					token is found in; each item is
297 					actually an ulint. */
298 };
299 
300 /** It's defined in fts/fts0fts.c */
301 extern const fts_index_selector_t fts_index_selector[];
302 
303 /******************************************************************//**
304 Compare two fts_trx_row_t instances doc_ids. */
305 UNIV_INLINE
306 int
307 fts_trx_row_doc_id_cmp(
308 /*===================*/
309 						/*!< out:
310 						< 0 if n1 < n2,
311 						0 if n1 == n2,
312 						> 0 if n1 > n2 */
313 	const void*	p1,			/*!< in: id1 */
314 	const void*	p2);			/*!< in: id2 */
315 
316 /******************************************************************//**
317 Compare two fts_ranking_t instances doc_ids. */
318 UNIV_INLINE
319 int
320 fts_ranking_doc_id_cmp(
321 /*===================*/
322 						/*!< out:
323 						< 0 if n1 < n2,
324 						0 if n1 == n2,
325 						> 0 if n1 > n2 */
326 	const void*	p1,			/*!< in: id1 */
327 	const void*	p2);			/*!< in: id2 */
328 
329 /******************************************************************//**
330 Compare two fts_update_t instances doc_ids. */
331 UNIV_INLINE
332 int
333 fts_update_doc_id_cmp(
334 /*==================*/
335 						/*!< out:
336 						< 0 if n1 < n2,
337 						0 if n1 == n2,
338 						> 0 if n1 > n2 */
339 	const void*	p1,			/*!< in: id1 */
340 	const void*	p2);			/*!< in: id2 */
341 
342 /******************************************************************//**
343 Decode and return the integer that was encoded using our VLC scheme.*/
344 UNIV_INLINE
345 ulint
346 fts_decode_vlc(
347 /*===========*/
348 			/*!< out: value decoded */
349 	byte**	ptr);	/*!< in: ptr to decode from, this ptr is
350 			incremented by the number of bytes decoded */
351 
352 /******************************************************************//**
353 Duplicate a string. */
354 UNIV_INLINE
355 void
356 fts_string_dup(
357 /*===========*/
358 						/*!< out:
359 						< 0 if n1 < n2,
360 						0 if n1 == n2,
361 						> 0 if n1 > n2 */
362 	fts_string_t*		dst,		/*!< in: dup to here */
363 	const fts_string_t*	src,		/*!< in: src string */
364 	mem_heap_t*		heap);		/*!< in: heap to use */
365 
366 /******************************************************************//**
367 Return length of val if it were encoded using our VLC scheme. */
368 UNIV_INLINE
369 ulint
370 fts_get_encoded_len(
371 /*================*/
372 						/*!< out: length of value
373 						 encoded, in bytes */
374 	ulint		val);			/*!< in: value to encode */
375 
376 /******************************************************************//**
377 Encode an integer using our VLC scheme and return the length in bytes. */
378 UNIV_INLINE
379 ulint
380 fts_encode_int(
381 /*===========*/
382 						/*!< out: length of value
383 						encoded, in bytes */
384 	ulint		val,			/*!< in: value to encode */
385 	byte*		buf);			/*!< in: buffer, must have
386 						enough space */
387 
388 /******************************************************************//**
389 Get the selected FTS aux INDEX suffix. */
390 UNIV_INLINE
391 const char*
392 fts_get_suffix(
393 /*===========*/
394 	ulint		selected);		/*!< in: selected index */
395 
396 /** Select the FTS auxiliary index for the given character.
397 @param[in]	cs	charset
398 @param[in]	str	string
399 @param[in]	len	string length in bytes
400 @return the index to use for the string */
401 UNIV_INLINE
402 ulint
403 fts_select_index(
404 	const CHARSET_INFO*	cs,
405 	const byte*		str,
406 	ulint			len);
407 
408 #ifndef UNIV_NONINL
409 #include "fts0types.ic"
410 #include "fts0vlc.ic"
411 #endif
412 
413 #endif /* INNOBASE_FTS0TYPES_H */
414