1 /*****************************************************************************
2 
3 Copyright (c) 2007, 2016, Oracle and/or its affiliates. All Rights Reserved.
4 
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License, version 2.0,
7 as published by the Free Software Foundation.
8 
9 This program is also distributed with certain software (including
10 but not limited to OpenSSL) that is licensed under separate terms,
11 as designated in a particular file or component or in included license
12 documentation.  The authors of MySQL hereby grant you an additional
13 permission to link the program and your derivative works with the
14 separately licensed software that they have included with MySQL.
15 
16 This program is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19 GNU General Public License, version 2.0, for more details.
20 
21 You should have received a copy of the GNU General Public License along with
22 this program; if not, write to the Free Software Foundation, Inc.,
23 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
24 
25 *****************************************************************************/
26 
27 /******************************************************************//**
28 @file include/fts0types.h
29 Full text search types file
30 
31 Created 2007-03-27 Sunny Bains
32 *******************************************************/
33 
34 #ifndef INNOBASE_FTS0TYPES_H
35 #define INNOBASE_FTS0TYPES_H
36 
37 #include "que0types.h"
38 #include "ut0byte.h"
39 #include "fut0fut.h"
40 #include "ut0rbt.h"
41 #include "fts0fts.h"
42 
43 /** Types used within FTS. */
44 struct fts_que_t;
45 struct fts_node_t;
46 struct fts_utf8_str_t;
47 
48 /** Callbacks used within FTS. */
49 typedef pars_user_func_cb_t fts_sql_callback;
50 typedef void (*fts_filter)(void*, fts_node_t*, void*, ulint len);
51 
52 /** Statistics relevant to a particular document, used during retrieval. */
53 struct fts_doc_stats_t {
54 	doc_id_t	doc_id;		/*!< Document id */
55 	ulint		word_count;	/*!< Total words in the document */
56 };
57 
58 /** It's main purpose is to store the SQL prepared statements that
59 are required to retrieve a document from the database. */
60 struct fts_get_doc_t {
61 	fts_index_cache_t*
62 			index_cache;	/*!< The index cache instance */
63 
64 					/*!< Parsed sql statement */
65 	que_t*		get_document_graph;
66 	fts_cache_t*	cache;		/*!< The parent cache */
67 };
68 
69 /** Since we can have multiple FTS indexes on a table, we keep a
70 per index cache of words etc. */
71 struct fts_index_cache_t {
72 	dict_index_t*	index;		/*!< The FTS index instance */
73 
74 	ib_rbt_t*	words;		/*!< Nodes; indexed by fts_string_t*,
75 					cells are fts_tokenizer_word_t*.*/
76 
77 	ib_vector_t*	doc_stats;	/*!< Array of the fts_doc_stats_t
78 					contained in the memory buffer.
79 					Must be in sorted order (ascending).
80 					The  ideal choice is an rb tree but
81 					the rb tree imposes a space overhead
82 					that we can do without */
83 
84 	que_t**		ins_graph;	/*!< Insert query graphs */
85 
86 	que_t**		sel_graph;	/*!< Select query graphs */
87 	CHARSET_INFO*	charset;	/*!< charset */
88 };
89 
90 /** For supporting the tracking of updates on multiple FTS indexes we need
91 to track which FTS indexes need to be updated. For INSERT and DELETE we
92 update all fts indexes. */
93 struct fts_update_t {
94 	doc_id_t	doc_id;		/*!< The doc id affected */
95 
96 	ib_vector_t*	fts_indexes;	/*!< The FTS indexes that need to be
97 					updated. A NULL value means all
98 					indexes need to be updated.  This
99 					vector is not allocated on the heap
100 					and so must be freed explicitly,
101 					when we are done with it */
102 };
103 
104 /** Stop word control infotmation. */
105 struct fts_stopword_t {
106 	ulint		status;		/*!< Status of the stopword tree */
107 	ib_alloc_t*	heap;		/*!< The memory allocator to use */
108 	ib_rbt_t*	cached_stopword;/*!< This stores all active stopwords */
109 	CHARSET_INFO*	charset;	/*!< charset for stopword */
110 };
111 
112 /** The SYNC state of the cache. There is one instance of this struct
113 associated with each ADD thread. */
114 struct fts_sync_t {
115 	trx_t*		trx;		/*!< The transaction used for SYNCing
116 					the cache to disk */
117 	dict_table_t*	table;		/*!< Table with FTS index(es) */
118 	ulint		max_cache_size;	/*!< Max size in bytes of the cache */
119 	ibool		cache_full;	/*!< flag, when true it indicates that
120 					we need to sync the cache to disk */
121 	ulint		lower_index;	/*!< the start index of the doc id
122 					vector from where to start adding
123 					documents to the FTS cache */
124 	ulint		upper_index;	/*!< max index of the doc id vector to
125 					add to the FTS cache */
126 	ibool		interrupted;	/*!< TRUE if SYNC was interrupted */
127 	doc_id_t	min_doc_id;	/*!< The smallest doc id added to the
128 					cache. It should equal to
129 					doc_ids[lower_index] */
130 	doc_id_t	max_doc_id;	/*!< The doc id at which the cache was
131 					noted as being full, we use this to
132 					set the upper_limit field */
133 	ib_time_t	start_time;	/*!< SYNC start time */
134 	bool		in_progress;	/*!< flag whether sync is in progress.*/
135 	bool		unlock_cache;	/*!< flag whether unlock cache when
136 					write fts node */
137 	os_event_t	event;		/*!< sync finish event */
138 };
139 
140 /** The cache for the FTS system. It is a memory-based inverted index
141 that new entries are added to, until it grows over the configured maximum
142 size, at which time its contents are written to the INDEX table. */
143 struct fts_cache_t {
144 	rw_lock_t	lock;		/*!< lock protecting all access to the
145 					memory buffer. FIXME: this needs to
146 					be our new upgrade-capable rw-lock */
147 
148 	rw_lock_t	init_lock;	/*!< lock used for the cache
149 					intialization, it has different
150 					SYNC level as above cache lock */
151 
152 	ib_mutex_t	optimize_lock;	/*!< Lock for OPTIMIZE */
153 
154 	ib_mutex_t	deleted_lock;	/*!< Lock covering deleted_doc_ids */
155 
156 	ib_mutex_t	doc_id_lock;	/*!< Lock covering Doc ID */
157 
158 	ib_vector_t*	deleted_doc_ids;/*!< Array of deleted doc ids, each
159 					element is of type fts_update_t */
160 
161 	ib_vector_t*	indexes;	/*!< We store the stats and inverted
162 					index for the individual FTS indexes
163 					in this vector. Each element is
164 					an instance of fts_index_cache_t */
165 
166 	ib_vector_t*	get_docs;	/*!< information required to read
167 					the document from the table. Each
168 					element is of type fts_doc_t */
169 
170 	ulint		total_size;	/*!< total size consumed by the ilist
171 					field of all nodes. SYNC is run
172 					whenever this gets too big */
173 	fts_sync_t*	sync;		/*!< sync structure to sync data to
174 					disk */
175 	ib_alloc_t*	sync_heap;	/*!< The heap allocator, for indexes
176 					and deleted_doc_ids, ie. transient
177 					objects, they are recreated after
178 					a SYNC is completed */
179 
180 	ib_alloc_t*	self_heap;	/*!< This heap is the heap out of
181 					which an instance of the cache itself
182 					was created. Objects created using
183 					this heap will last for the lifetime
184 					of the cache */
185 
186 	doc_id_t	next_doc_id;	/*!< Next doc id */
187 
188 	doc_id_t	synced_doc_id;	/*!< Doc ID sync-ed to CONFIG table */
189 
190 	doc_id_t	first_doc_id;	/*!< first doc id since this table
191 					was opened */
192 
193 	ulint		deleted;	/*!< Number of doc ids deleted since
194 					last optimized. This variable is
195 					covered by deleted_lock */
196 
197 	ulint		added;		/*!< Number of doc ids added since last
198 					optimized. This variable is covered by
199 					the deleted lock */
200 
201 	fts_stopword_t	stopword_info;	/*!< Cached stopwords for the FTS */
202 	mem_heap_t*	cache_heap;	/*!< Cache Heap */
203 };
204 
205 /** Columns of the FTS auxiliary INDEX table */
206 struct fts_node_t {
207 	doc_id_t	first_doc_id;	/*!< First document id in ilist. */
208 
209 	doc_id_t	last_doc_id;	/*!< Last document id in ilist. */
210 
211 	byte*		ilist;		/*!< Binary list of documents & word
212 					positions the token appears in.
213 					TODO: For now, these are simply
214 					ut_malloc'd, but if testing shows
215 					that they waste memory unacceptably, a
216 					special memory allocator will have
217 					to be written */
218 
219 	ulint		doc_count;	/*!< Number of doc ids in ilist */
220 
221 	ulint		ilist_size;	/*!< Used size of ilist in bytes. */
222 
223 	ulint		ilist_size_alloc;
224 					/*!< Allocated size of ilist in
225 					bytes */
226 	bool		synced;		/*!< flag whether the node is synced */
227 };
228 
229 /** A tokenizer word. Contains information about one word. */
230 struct fts_tokenizer_word_t {
231 	fts_string_t	text;		/*!< Token text. */
232 
233 	ib_vector_t*	nodes;		/*!< Word node ilists, each element is
234 					of type fts_node_t */
235 };
236 
237 /** Word text plus it's array of nodes as on disk in FTS index */
238 struct fts_word_t {
239 	fts_string_t	text;		/*!< Word value in UTF-8 */
240 	ib_vector_t*	nodes;		/*!< Nodes read from disk */
241 
242 	ib_alloc_t*	heap_alloc;	/*!< For handling all allocations */
243 };
244 
245 /** Callback for reading and filtering nodes that are read from FTS index */
246 struct fts_fetch_t {
247 	void*		read_arg;	/*!< Arg for the sql_callback */
248 
249 	fts_sql_callback
250 			read_record;	/*!< Callback for reading index
251 					record */
252 	ulint		total_memory;	/*!< Total memory used */
253 };
254 
255 /** For horizontally splitting an FTS auxiliary index */
256 struct fts_index_selector_t {
257 	ulint		value;		/*!< Character value at which
258 					to split */
259 
260 	const char*	suffix;		/*!< FTS aux index suffix */
261 };
262 
263 /** This type represents a single document. */
264 struct fts_doc_t {
265 	fts_string_t	text;		/*!< document text */
266 
267 	ibool		found;		/*!< TRUE if the document was found
268 					successfully in the database */
269 
270 	ib_rbt_t*	tokens;		/*!< This is filled when the document
271 					is tokenized. Tokens; indexed by
272 					fts_string_t*, cells are of type
273 					fts_token_t* */
274 
275 	ib_alloc_t*	self_heap;	/*!< An instance of this type is
276 					allocated from this heap along
277 					with any objects that have the
278 					same lifespan, most notably
279 					the vector of token positions */
280 	CHARSET_INFO*	charset;	/*!< Document's charset info */
281 };
282 
283 /** A token and its positions within a document. */
284 struct fts_token_t {
285 	fts_string_t	text;		/*!< token text */
286 
287 	ib_vector_t*	positions;	/*!< an array of the positions the
288 					token is found in; each item is
289 					actually an ulint. */
290 };
291 
292 /** It's defined in fts/fts0fts.c */
293 extern const fts_index_selector_t fts_index_selector[];
294 
295 /******************************************************************//**
296 Compare two UTF-8 strings. */
297 UNIV_INLINE
298 int
299 fts_utf8_string_cmp(
300 /*================*/
301 						/*!< out:
302 						< 0 if n1 < n2,
303 						0 if n1 == n2,
304 						> 0 if n1 > n2 */
305 	const void*	p1,			/*!< in: key */
306 	const void*	p2);			/*!< in: node */
307 
308 /******************************************************************//**
309 Compare two UTF-8 strings, and return match (0) if
310 passed in "key" value equals or is the prefix of the "node" value. */
311 UNIV_INLINE
312 int
313 fts_utf8_string_cmp_prefix(
314 /*=======================*/
315 						/*!< out:
316 						< 0 if n1 < n2,
317 						0 if n1 == n2,
318 						> 0 if n1 > n2 */
319 	const void*	p1,			/*!< in: key */
320 	const void*	p2);			/*!< in: node */
321 
322 /******************************************************************//**
323 Compare two fts_trx_row_t instances doc_ids. */
324 UNIV_INLINE
325 int
326 fts_trx_row_doc_id_cmp(
327 /*===================*/
328 						/*!< out:
329 						< 0 if n1 < n2,
330 						0 if n1 == n2,
331 						> 0 if n1 > n2 */
332 	const void*	p1,			/*!< in: id1 */
333 	const void*	p2);			/*!< in: id2 */
334 
335 /******************************************************************//**
336 Compare two fts_ranking_t instances doc_ids. */
337 UNIV_INLINE
338 int
339 fts_ranking_doc_id_cmp(
340 /*===================*/
341 						/*!< out:
342 						< 0 if n1 < n2,
343 						0 if n1 == n2,
344 						> 0 if n1 > n2 */
345 	const void*	p1,			/*!< in: id1 */
346 	const void*	p2);			/*!< in: id2 */
347 
348 /******************************************************************//**
349 Compare two fts_update_t instances doc_ids. */
350 UNIV_INLINE
351 int
352 fts_update_doc_id_cmp(
353 /*==================*/
354 						/*!< out:
355 						< 0 if n1 < n2,
356 						0 if n1 == n2,
357 						> 0 if n1 > n2 */
358 	const void*	p1,			/*!< in: id1 */
359 	const void*	p2);			/*!< in: id2 */
360 
361 /******************************************************************//**
362 Decode and return the integer that was encoded using our VLC scheme.*/
363 UNIV_INLINE
364 ulint
365 fts_decode_vlc(
366 /*===========*/
367 			/*!< out: value decoded */
368 	byte**	ptr);	/*!< in: ptr to decode from, this ptr is
369 			incremented by the number of bytes decoded */
370 
371 /******************************************************************//**
372 Duplicate an UTF-8 string. */
373 UNIV_INLINE
374 void
375 fts_utf8_string_dup(
376 /*================*/
377 						/*!< out:
378 						< 0 if n1 < n2,
379 						0 if n1 == n2,
380 						> 0 if n1 > n2 */
381 	fts_string_t*		dst,		/*!< in: dup to here */
382 	const fts_string_t*	src,		/*!< in: src string */
383 	mem_heap_t*		heap);		/*!< in: heap to use */
384 
385 /******************************************************************//**
386 Return length of val if it were encoded using our VLC scheme. */
387 UNIV_INLINE
388 ulint
389 fts_get_encoded_len(
390 /*================*/
391 						/*!< out: length of value
392 						 encoded, in bytes */
393 	ulint		val);			/*!< in: value to encode */
394 
395 /******************************************************************//**
396 Encode an integer using our VLC scheme and return the length in bytes. */
397 UNIV_INLINE
398 ulint
399 fts_encode_int(
400 /*===========*/
401 						/*!< out: length of value
402 						encoded, in bytes */
403 	ulint		val,			/*!< in: value to encode */
404 	byte*		buf);			/*!< in: buffer, must have
405 						enough space */
406 
407 /******************************************************************//**
408 Decode a UTF-8 character.
409 
410 http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf:
411 
412  Scalar Value              1st Byte 2nd Byte 3rd Byte 4th Byte
413 00000000 0xxxxxxx          0xxxxxxx
414 00000yyy yyxxxxxx          110yyyyy 10xxxxxx
415 zzzzyyyy yyxxxxxx          1110zzzz 10yyyyyy 10xxxxxx
416 000uuuzz zzzzyyyy yyxxxxxx 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx
417 
418 This function decodes UTF-8 sequences up to 6 bytes (31 bits).
419 
420 On error *ptr will point to the first byte that was not correctly
421 decoded. This will hopefully help in resyncing the input. */
422 UNIV_INLINE
423 ulint
424 fts_utf8_decode(
425 /*============*/
426 						/*!< out: UTF8_ERROR if *ptr
427 						did not point to a valid
428 						UTF-8 sequence, or the
429 						Unicode code point. */
430 	const byte**	ptr);			/*!< in/out: pointer to
431 						UTF-8 string. The
432 						pointer is advanced to
433 						the start of the next
434 						character. */
435 
436 /******************************************************************//**
437 Lowercase an UTF-8 string. */
438 UNIV_INLINE
439 void
440 fts_utf8_tolower(
441 /*=============*/
442 	fts_string_t*	str);			/*!< in: string */
443 
444 /******************************************************************//**
445 Get the selected FTS aux INDEX suffix. */
446 UNIV_INLINE
447 const char*
448 fts_get_suffix(
449 /*===========*/
450 	ulint		selected);		/*!< in: selected index */
451 
452 /********************************************************************
453 Get the number of index selectors. */
454 UNIV_INLINE
455 ulint
456 fts_get_n_selectors(void);
457 /*=====================*/
458 
459 /******************************************************************//**
460 Select the FTS auxiliary index for the given string.
461 @return the index to use for the string */
462 UNIV_INLINE
463 ulint
464 fts_select_index(
465 /*=============*/
466 	const CHARSET_INFO*	cs,		/*!< Charset */
467 	const byte*		str,		/*!< in: word string */
468 	ulint			len);		/*!< in: string length */
469 
470 /********************************************************************
471 Select the next FTS auxiliary index for the given character.
472 @return the next index to use for character */
473 UNIV_INLINE
474 ulint
475 fts_select_next_index(
476 /*==================*/
477 	const CHARSET_INFO*	cs,		/*!< Charset */
478 	const byte*		str,		/*!< in: string */
479 	ulint			len);		/*!< in: string length */
480 
481 #ifndef UNIV_NONINL
482 #include "fts0types.ic"
483 #include "fts0vlc.ic"
484 #endif
485 
486 #endif /* INNOBASE_FTS0TYPES_H */
487