1 /*****************************************************************************
2 
3 Copyright (c) 2007, 2019, Oracle and/or its affiliates. All Rights Reserved.
4 
5 This program is free software; you can redistribute it and/or modify it under
6 the terms of the GNU General Public License, version 2.0, as published by the
7 Free Software Foundation.
8 
9 This program is also distributed with certain software (including but not
10 limited to OpenSSL) that is licensed under separate terms, as designated in a
11 particular file or component or in included license documentation. The authors
12 of MySQL hereby grant you an additional permission to link the program and
13 your derivative works with the separately licensed software that they have
14 included with MySQL.
15 
16 This program is distributed in the hope that it will be useful, but WITHOUT
17 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
18 FOR A PARTICULAR PURPOSE. See the GNU General Public License, version 2.0,
19 for more details.
20 
21 You should have received a copy of the GNU General Public License along with
22 this program; if not, write to the Free Software Foundation, Inc.,
23 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA
24 
25 *****************************************************************************/
26 
27 /** @file include/fts0types.h
28  Full text search types file
29 
30  Created 2007-03-27 Sunny Bains
31  *******************************************************/
32 
33 #ifndef INNOBASE_FTS0TYPES_H
34 #define INNOBASE_FTS0TYPES_H
35 
36 #include "fts0fts.h"
37 #include "fut0fut.h"
38 #include "pars0pars.h"
39 #include "que0types.h"
40 #include "univ.i"
41 #include "ut0byte.h"
42 #include "ut0rbt.h"
43 
44 /** Types used within FTS. */
45 struct fts_que_t;
46 struct fts_node_t;
47 
48 /** Callbacks used within FTS. */
49 typedef pars_user_func_cb_t fts_sql_callback;
50 typedef void (*fts_filter)(void *, fts_node_t *, void *, ulint len);
51 
52 /** Statistics relevant to a particular document, used during retrieval. */
53 struct fts_doc_stats_t {
54   doc_id_t doc_id;  /*!< Document id */
55   ulint word_count; /*!< Total words in the document */
56 };
57 
58 /** It's main purpose is to store the SQL prepared statements that
59 are required to retrieve a document from the database. */
60 struct fts_get_doc_t {
61   fts_index_cache_t *index_cache; /*!< The index cache instance */
62 
63   /*!< Parsed sql statement */
64   que_t *get_document_graph;
65   fts_cache_t *cache; /*!< The parent cache */
66 };
67 
68 /** Since we can have multiple FTS indexes on a table, we keep a
69 per index cache of words etc. */
70 struct fts_index_cache_t {
71   dict_index_t *index; /*!< The FTS index instance */
72 
73   ib_rbt_t *words; /*!< Nodes; indexed by fts_string_t*,
74                    cells are fts_tokenizer_word_t*.*/
75 
76   ib_vector_t *doc_stats; /*!< Array of the fts_doc_stats_t
77                           contained in the memory buffer.
78                           Must be in sorted order (ascending).
79                           The  ideal choice is an rb tree but
80                           the rb tree imposes a space overhead
81                           that we can do without */
82 
83   que_t **ins_graph; /*!< Insert query graphs */
84 
85   que_t **sel_graph;     /*!< Select query graphs */
86   CHARSET_INFO *charset; /*!< charset */
87 };
88 
89 /** For supporting the tracking of updates on multiple FTS indexes we need
90 to track which FTS indexes need to be updated. For INSERT and DELETE we
91 update all fts indexes. */
92 struct fts_update_t {
93   doc_id_t doc_id; /*!< The doc id affected */
94 
95   ib_vector_t *fts_indexes; /*!< The FTS indexes that need to be
96                             updated. A NULL value means all
97                             indexes need to be updated.  This
98                             vector is not allocated on the heap
99                             and so must be freed explicitly,
100                             when we are done with it */
101 };
102 
103 /** Stop word control infotmation. */
104 struct fts_stopword_t {
105   ulint status;              /*!< Status of the stopword tree */
106   ib_alloc_t *heap;          /*!< The memory allocator to use */
107   ib_rbt_t *cached_stopword; /*!< This stores all active stopwords */
108   CHARSET_INFO *charset;     /*!< charset for stopword */
109 };
110 
111 /** The SYNC state of the cache. There is one instance of this struct
112 associated with each ADD thread. */
113 struct fts_sync_t {
114   trx_t *trx;           /*!< The transaction used for SYNCing
115                         the cache to disk */
116   dict_table_t *table;  /*!< Table with FTS index(es) */
117   ulint max_cache_size; /*!< Max size in bytes of the cache */
118   ibool cache_full;     /*!< flag, when true it indicates that
119                         we need to sync the cache to disk */
120   ulint lower_index;    /*!< the start index of the doc id
121                         vector from where to start adding
122                         documents to the FTS cache */
123   ulint upper_index;    /*!< max index of the doc id vector to
124                         add to the FTS cache */
125   ibool interrupted;    /*!< TRUE if SYNC was interrupted */
126   doc_id_t min_doc_id;  /*!< The smallest doc id added to the
127                         cache. It should equal to
128                         doc_ids[lower_index] */
129   doc_id_t max_doc_id;  /*!< The doc id at which the cache was
130                         noted as being full, we use this to
131                         set the upper_limit field */
132   ib_time_monotonic_t start_time;
133   /*!< SYNC start time */
134   bool in_progress;  /*!< flag whether sync is in progress.*/
135   bool unlock_cache; /*!< flag whether unlock cache when
136                      write fts node */
137   os_event_t event;  /*!< sync finish event */
138 };
139 
140 /** The cache for the FTS system. It is a memory-based inverted index
141 that new entries are added to, until it grows over the configured maximum
142 size, at which time its contents are written to the INDEX table. */
143 struct fts_cache_t {
144 #ifndef UNIV_HOTBACKUP
145   rw_lock_t lock; /*!< lock protecting all access to the
146                   memory buffer. FIXME: this needs to
147                   be our new upgrade-capable rw-lock */
148 
149   rw_lock_t init_lock; /*!< lock used for the cache
150                        intialization, it has different
151                        SYNC level as above cache lock */
152 #endif                 /* !UNIV_HOTBACKUP */
153 
154   ib_mutex_t optimize_lock; /*!< Lock for OPTIMIZE */
155 
156   ib_mutex_t deleted_lock; /*!< Lock covering deleted_doc_ids */
157 
158   ib_mutex_t doc_id_lock; /*!< Lock covering Doc ID */
159 
160   ib_vector_t *deleted_doc_ids; /*!< Array of deleted doc ids, each
161                                 element is of type fts_update_t */
162 
163   ib_vector_t *indexes; /*!< We store the stats and inverted
164                         index for the individual FTS indexes
165                         in this vector. Each element is
166                         an instance of fts_index_cache_t */
167 
168   ib_vector_t *get_docs; /*!< information required to read
169                          the document from the table. Each
170                          element is of type fts_doc_t */
171 
172   ulint total_size;      /*!< total size consumed by the ilist
173                          field of all nodes. SYNC is run
174                          whenever this gets too big */
175   fts_sync_t *sync;      /*!< sync structure to sync data to
176                          disk */
177   ib_alloc_t *sync_heap; /*!< The heap allocator, for indexes
178                          and deleted_doc_ids, ie. transient
179                          objects, they are recreated after
180                          a SYNC is completed */
181 
182   ib_alloc_t *self_heap; /*!< This heap is the heap out of
183                          which an instance of the cache itself
184                          was created. Objects created using
185                          this heap will last for the lifetime
186                          of the cache */
187 
188   doc_id_t next_doc_id; /*!< Next doc id */
189 
190   doc_id_t synced_doc_id; /*!< Doc ID sync-ed to CONFIG table */
191 
192   doc_id_t first_doc_id; /*!< first doc id since this table
193                          was opened */
194 
195   ulint deleted; /*!< Number of doc ids deleted since
196                  last optimized. This variable is
197                  covered by deleted_lock */
198 
199   ulint added; /*!< Number of doc ids added since last
200                optimized. This variable is covered by
201                the deleted lock */
202 
203   fts_stopword_t stopword_info; /*!< Cached stopwords for the FTS */
204   mem_heap_t *cache_heap;       /*!< Cache Heap */
205 };
206 
207 /** Columns of the FTS auxiliary INDEX table */
208 struct fts_node_t {
209   doc_id_t first_doc_id; /*!< First document id in ilist. */
210 
211   doc_id_t last_doc_id; /*!< Last document id in ilist. */
212 
213   byte *ilist; /*!< Binary list of documents & word
214                positions the token appears in.
215                TODO: For now, these are simply
216                ut_malloc'd, but if testing shows
217                that they waste memory unacceptably, a
218                special memory allocator will have
219                to be written */
220 
221   ulint doc_count; /*!< Number of doc ids in ilist */
222 
223   ulint ilist_size; /*!< Used size of ilist in bytes. */
224 
225   ulint ilist_size_alloc;
226   /*!< Allocated size of ilist in
227   bytes */
228   bool synced; /*!< flag whether the node is synced */
229 };
230 
231 /** A tokenizer word. Contains information about one word. */
232 struct fts_tokenizer_word_t {
233   fts_string_t text; /*!< Token text. */
234 
235   ib_vector_t *nodes; /*!< Word node ilists, each element is
236                       of type fts_node_t */
237 };
238 
239 /** Word text plus it's array of nodes as on disk in FTS index */
240 struct fts_word_t {
241   fts_string_t text;  /*!< Word value in UTF-8 */
242   ib_vector_t *nodes; /*!< Nodes read from disk */
243 
244   ib_alloc_t *heap_alloc; /*!< For handling all allocations */
245 };
246 
247 /** Callback for reading and filtering nodes that are read from FTS index */
248 struct fts_fetch_t {
249   void *read_arg; /*!< Arg for the sql_callback */
250 
251   fts_sql_callback read_record; /*!< Callback for reading index
252                                 record */
253   ulint total_memory;           /*!< Total memory used */
254 };
255 
256 /** For horizontally splitting an FTS auxiliary index */
257 struct fts_index_selector_t {
258   ulint value; /*!< Character value at which
259                to split */
260 
261   const char *suffix; /*!< FTS aux index suffix */
262 };
263 
264 /** This type represents a single document. */
265 struct fts_doc_t {
266   fts_string_t text; /*!< document text */
267 
268   ibool found; /*!< TRUE if the document was found
269                successfully in the database */
270 
271   ib_rbt_t *tokens; /*!< This is filled when the document
272                     is tokenized. Tokens; indexed by
273                     fts_string_t*, cells are of type
274                     fts_token_t* */
275 
276   ib_alloc_t *self_heap; /*!< An instance of this type is
277                          allocated from this heap along
278                          with any objects that have the
279                          same lifespan, most notably
280                          the vector of token positions */
281   CHARSET_INFO *charset; /*!< Document's charset info */
282 
283   st_mysql_ftparser *parser; /*!< fts plugin parser */
284 
285   bool is_ngram; /*!< Whether it is a ngram parser */
286 
287   ib_rbt_t *stopwords; /*!< Stopwords */
288 };
289 
290 /** A token and its positions within a document. */
291 struct fts_token_t {
292   fts_string_t text; /*!< token text */
293 
294   ib_vector_t *positions; /*!< an array of the positions the
295                           token is found in; each item is
296                           actually an ulint. */
297 };
298 
299 /** It's defined in fts/fts0fts.c */
300 extern const fts_index_selector_t fts_index_selector[];
301 
302 /** It's defined in fts/fts0fts.c */
303 extern const fts_index_selector_t fts_index_selector_5_7[];
304 
305 /** Compare two fts_trx_row_t instances doc_ids.
306 @param[in]	p1	id1
307 @param[in]	p2	id2
308 @return < 0 if n1 < n2, < 0 if n1 < n2, > 0 if n1 > n2 */
309 UNIV_INLINE
310 int fts_trx_row_doc_id_cmp(const void *p1, const void *p2);
311 
312 /** Compare two fts_ranking_t instances doc_ids.
313 @param[in]	p1	id1
314 @param[in]	p2	id2
315 @return < 0 if n1 < n2, < 0 if n1 < n2, > 0 if n1 > n2 */
316 UNIV_INLINE
317 int fts_ranking_doc_id_cmp(const void *p1, const void *p2);
318 
319 /** Compare two fts_update_t instances doc_ids.
320 @param[in]	p1	id1
321 @param[in]	p2	id2
322 @return < 0 if n1 < n2, < 0 if n1 < n2, > 0 if n1 > n2 */
323 UNIV_INLINE
324 int fts_update_doc_id_cmp(const void *p1, const void *p2);
325 
326 /** Decode and return the integer that was encoded using our VLC scheme.*/
327 UNIV_INLINE
328 ulint fts_decode_vlc(
329     /*!< out: value decoded */
330     byte **ptr); /*!< in: ptr to decode from, this ptr is
331                  incremented by the number of bytes decoded */
332 
333 /** Duplicate a string.
334 @param[in]	dst	dup to here
335 @param[in]	src	src string
336 @param[in]	heap	heap to use
337 */
338 UNIV_INLINE
339 void fts_string_dup(fts_string_t *dst, const fts_string_t *src,
340                     mem_heap_t *heap);
341 
342 /** Return length of val if it were encoded using our VLC scheme. */
343 UNIV_INLINE
344 ulint fts_get_encoded_len(
345     /*!< out: length of value
346      encoded, in bytes */
347     ulint val); /*!< in: value to encode */
348 
349 /** Encode an integer using our VLC scheme and return the length in bytes.
350 @param[in]	val	value to encode
351 @param[in]	buf	buffer, must have enough space
352 @return length of value encoded, in bytes */
353 UNIV_INLINE
354 ulint fts_encode_int(ulint val, byte *buf);
355 
356 /** Get the selected FTS aux INDEX suffix. */
357 UNIV_INLINE
358 const char *fts_get_suffix(ulint selected); /*!< in: selected index */
359 
360 /** Return the selected FTS aux index suffix in 5.7 compatible format
361 @param[in]	selected	selected index
362 @return the suffix name */
363 UNIV_INLINE
364 const char *fts_get_suffix_5_7(ulint selected);
365 
366 /** Select the FTS auxiliary index for the given character.
367 @param[in]	cs	charset
368 @param[in]	str	string
369 @param[in]	len	string length in bytes
370 @return the index to use for the string */
371 UNIV_INLINE
372 ulint fts_select_index(const CHARSET_INFO *cs, const byte *str, ulint len);
373 
374 #include "fts0types.ic"
375 #include "fts0vlc.ic"
376 
377 #endif /* INNOBASE_FTS0TYPES_H */
378