1 /***************************************************************************** 2 3 Copyright (c) 2007, 2019, Oracle and/or its affiliates. All Rights Reserved. 4 5 This program is free software; you can redistribute it and/or modify it under 6 the terms of the GNU General Public License, version 2.0, as published by the 7 Free Software Foundation. 8 9 This program is also distributed with certain software (including but not 10 limited to OpenSSL) that is licensed under separate terms, as designated in a 11 particular file or component or in included license documentation. The authors 12 of MySQL hereby grant you an additional permission to link the program and 13 your derivative works with the separately licensed software that they have 14 included with MySQL. 15 16 This program is distributed in the hope that it will be useful, but WITHOUT 17 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS 18 FOR A PARTICULAR PURPOSE. See the GNU General Public License, version 2.0, 19 for more details. 20 21 You should have received a copy of the GNU General Public License along with 22 this program; if not, write to the Free Software Foundation, Inc., 23 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 24 25 *****************************************************************************/ 26 27 /** @file include/fts0types.h 28 Full text search types file 29 30 Created 2007-03-27 Sunny Bains 31 *******************************************************/ 32 33 #ifndef INNOBASE_FTS0TYPES_H 34 #define INNOBASE_FTS0TYPES_H 35 36 #include "fts0fts.h" 37 #include "fut0fut.h" 38 #include "pars0pars.h" 39 #include "que0types.h" 40 #include "univ.i" 41 #include "ut0byte.h" 42 #include "ut0rbt.h" 43 44 /** Types used within FTS. */ 45 struct fts_que_t; 46 struct fts_node_t; 47 48 /** Callbacks used within FTS. */ 49 typedef pars_user_func_cb_t fts_sql_callback; 50 typedef void (*fts_filter)(void *, fts_node_t *, void *, ulint len); 51 52 /** Statistics relevant to a particular document, used during retrieval. */ 53 struct fts_doc_stats_t { 54 doc_id_t doc_id; /*!< Document id */ 55 ulint word_count; /*!< Total words in the document */ 56 }; 57 58 /** It's main purpose is to store the SQL prepared statements that 59 are required to retrieve a document from the database. */ 60 struct fts_get_doc_t { 61 fts_index_cache_t *index_cache; /*!< The index cache instance */ 62 63 /*!< Parsed sql statement */ 64 que_t *get_document_graph; 65 fts_cache_t *cache; /*!< The parent cache */ 66 }; 67 68 /** Since we can have multiple FTS indexes on a table, we keep a 69 per index cache of words etc. */ 70 struct fts_index_cache_t { 71 dict_index_t *index; /*!< The FTS index instance */ 72 73 ib_rbt_t *words; /*!< Nodes; indexed by fts_string_t*, 74 cells are fts_tokenizer_word_t*.*/ 75 76 ib_vector_t *doc_stats; /*!< Array of the fts_doc_stats_t 77 contained in the memory buffer. 78 Must be in sorted order (ascending). 79 The ideal choice is an rb tree but 80 the rb tree imposes a space overhead 81 that we can do without */ 82 83 que_t **ins_graph; /*!< Insert query graphs */ 84 85 que_t **sel_graph; /*!< Select query graphs */ 86 CHARSET_INFO *charset; /*!< charset */ 87 }; 88 89 /** For supporting the tracking of updates on multiple FTS indexes we need 90 to track which FTS indexes need to be updated. For INSERT and DELETE we 91 update all fts indexes. */ 92 struct fts_update_t { 93 doc_id_t doc_id; /*!< The doc id affected */ 94 95 ib_vector_t *fts_indexes; /*!< The FTS indexes that need to be 96 updated. A NULL value means all 97 indexes need to be updated. This 98 vector is not allocated on the heap 99 and so must be freed explicitly, 100 when we are done with it */ 101 }; 102 103 /** Stop word control infotmation. */ 104 struct fts_stopword_t { 105 ulint status; /*!< Status of the stopword tree */ 106 ib_alloc_t *heap; /*!< The memory allocator to use */ 107 ib_rbt_t *cached_stopword; /*!< This stores all active stopwords */ 108 CHARSET_INFO *charset; /*!< charset for stopword */ 109 }; 110 111 /** The SYNC state of the cache. There is one instance of this struct 112 associated with each ADD thread. */ 113 struct fts_sync_t { 114 trx_t *trx; /*!< The transaction used for SYNCing 115 the cache to disk */ 116 dict_table_t *table; /*!< Table with FTS index(es) */ 117 ulint max_cache_size; /*!< Max size in bytes of the cache */ 118 ibool cache_full; /*!< flag, when true it indicates that 119 we need to sync the cache to disk */ 120 ulint lower_index; /*!< the start index of the doc id 121 vector from where to start adding 122 documents to the FTS cache */ 123 ulint upper_index; /*!< max index of the doc id vector to 124 add to the FTS cache */ 125 ibool interrupted; /*!< TRUE if SYNC was interrupted */ 126 doc_id_t min_doc_id; /*!< The smallest doc id added to the 127 cache. It should equal to 128 doc_ids[lower_index] */ 129 doc_id_t max_doc_id; /*!< The doc id at which the cache was 130 noted as being full, we use this to 131 set the upper_limit field */ 132 ib_time_monotonic_t start_time; 133 /*!< SYNC start time */ 134 bool in_progress; /*!< flag whether sync is in progress.*/ 135 bool unlock_cache; /*!< flag whether unlock cache when 136 write fts node */ 137 os_event_t event; /*!< sync finish event */ 138 }; 139 140 /** The cache for the FTS system. It is a memory-based inverted index 141 that new entries are added to, until it grows over the configured maximum 142 size, at which time its contents are written to the INDEX table. */ 143 struct fts_cache_t { 144 #ifndef UNIV_HOTBACKUP 145 rw_lock_t lock; /*!< lock protecting all access to the 146 memory buffer. FIXME: this needs to 147 be our new upgrade-capable rw-lock */ 148 149 rw_lock_t init_lock; /*!< lock used for the cache 150 intialization, it has different 151 SYNC level as above cache lock */ 152 #endif /* !UNIV_HOTBACKUP */ 153 154 ib_mutex_t optimize_lock; /*!< Lock for OPTIMIZE */ 155 156 ib_mutex_t deleted_lock; /*!< Lock covering deleted_doc_ids */ 157 158 ib_mutex_t doc_id_lock; /*!< Lock covering Doc ID */ 159 160 ib_vector_t *deleted_doc_ids; /*!< Array of deleted doc ids, each 161 element is of type fts_update_t */ 162 163 ib_vector_t *indexes; /*!< We store the stats and inverted 164 index for the individual FTS indexes 165 in this vector. Each element is 166 an instance of fts_index_cache_t */ 167 168 ib_vector_t *get_docs; /*!< information required to read 169 the document from the table. Each 170 element is of type fts_doc_t */ 171 172 ulint total_size; /*!< total size consumed by the ilist 173 field of all nodes. SYNC is run 174 whenever this gets too big */ 175 fts_sync_t *sync; /*!< sync structure to sync data to 176 disk */ 177 ib_alloc_t *sync_heap; /*!< The heap allocator, for indexes 178 and deleted_doc_ids, ie. transient 179 objects, they are recreated after 180 a SYNC is completed */ 181 182 ib_alloc_t *self_heap; /*!< This heap is the heap out of 183 which an instance of the cache itself 184 was created. Objects created using 185 this heap will last for the lifetime 186 of the cache */ 187 188 doc_id_t next_doc_id; /*!< Next doc id */ 189 190 doc_id_t synced_doc_id; /*!< Doc ID sync-ed to CONFIG table */ 191 192 doc_id_t first_doc_id; /*!< first doc id since this table 193 was opened */ 194 195 ulint deleted; /*!< Number of doc ids deleted since 196 last optimized. This variable is 197 covered by deleted_lock */ 198 199 ulint added; /*!< Number of doc ids added since last 200 optimized. This variable is covered by 201 the deleted lock */ 202 203 fts_stopword_t stopword_info; /*!< Cached stopwords for the FTS */ 204 mem_heap_t *cache_heap; /*!< Cache Heap */ 205 }; 206 207 /** Columns of the FTS auxiliary INDEX table */ 208 struct fts_node_t { 209 doc_id_t first_doc_id; /*!< First document id in ilist. */ 210 211 doc_id_t last_doc_id; /*!< Last document id in ilist. */ 212 213 byte *ilist; /*!< Binary list of documents & word 214 positions the token appears in. 215 TODO: For now, these are simply 216 ut_malloc'd, but if testing shows 217 that they waste memory unacceptably, a 218 special memory allocator will have 219 to be written */ 220 221 ulint doc_count; /*!< Number of doc ids in ilist */ 222 223 ulint ilist_size; /*!< Used size of ilist in bytes. */ 224 225 ulint ilist_size_alloc; 226 /*!< Allocated size of ilist in 227 bytes */ 228 bool synced; /*!< flag whether the node is synced */ 229 }; 230 231 /** A tokenizer word. Contains information about one word. */ 232 struct fts_tokenizer_word_t { 233 fts_string_t text; /*!< Token text. */ 234 235 ib_vector_t *nodes; /*!< Word node ilists, each element is 236 of type fts_node_t */ 237 }; 238 239 /** Word text plus it's array of nodes as on disk in FTS index */ 240 struct fts_word_t { 241 fts_string_t text; /*!< Word value in UTF-8 */ 242 ib_vector_t *nodes; /*!< Nodes read from disk */ 243 244 ib_alloc_t *heap_alloc; /*!< For handling all allocations */ 245 }; 246 247 /** Callback for reading and filtering nodes that are read from FTS index */ 248 struct fts_fetch_t { 249 void *read_arg; /*!< Arg for the sql_callback */ 250 251 fts_sql_callback read_record; /*!< Callback for reading index 252 record */ 253 ulint total_memory; /*!< Total memory used */ 254 }; 255 256 /** For horizontally splitting an FTS auxiliary index */ 257 struct fts_index_selector_t { 258 ulint value; /*!< Character value at which 259 to split */ 260 261 const char *suffix; /*!< FTS aux index suffix */ 262 }; 263 264 /** This type represents a single document. */ 265 struct fts_doc_t { 266 fts_string_t text; /*!< document text */ 267 268 ibool found; /*!< TRUE if the document was found 269 successfully in the database */ 270 271 ib_rbt_t *tokens; /*!< This is filled when the document 272 is tokenized. Tokens; indexed by 273 fts_string_t*, cells are of type 274 fts_token_t* */ 275 276 ib_alloc_t *self_heap; /*!< An instance of this type is 277 allocated from this heap along 278 with any objects that have the 279 same lifespan, most notably 280 the vector of token positions */ 281 CHARSET_INFO *charset; /*!< Document's charset info */ 282 283 st_mysql_ftparser *parser; /*!< fts plugin parser */ 284 285 bool is_ngram; /*!< Whether it is a ngram parser */ 286 287 ib_rbt_t *stopwords; /*!< Stopwords */ 288 }; 289 290 /** A token and its positions within a document. */ 291 struct fts_token_t { 292 fts_string_t text; /*!< token text */ 293 294 ib_vector_t *positions; /*!< an array of the positions the 295 token is found in; each item is 296 actually an ulint. */ 297 }; 298 299 /** It's defined in fts/fts0fts.c */ 300 extern const fts_index_selector_t fts_index_selector[]; 301 302 /** It's defined in fts/fts0fts.c */ 303 extern const fts_index_selector_t fts_index_selector_5_7[]; 304 305 /** Compare two fts_trx_row_t instances doc_ids. 306 @param[in] p1 id1 307 @param[in] p2 id2 308 @return < 0 if n1 < n2, < 0 if n1 < n2, > 0 if n1 > n2 */ 309 UNIV_INLINE 310 int fts_trx_row_doc_id_cmp(const void *p1, const void *p2); 311 312 /** Compare two fts_ranking_t instances doc_ids. 313 @param[in] p1 id1 314 @param[in] p2 id2 315 @return < 0 if n1 < n2, < 0 if n1 < n2, > 0 if n1 > n2 */ 316 UNIV_INLINE 317 int fts_ranking_doc_id_cmp(const void *p1, const void *p2); 318 319 /** Compare two fts_update_t instances doc_ids. 320 @param[in] p1 id1 321 @param[in] p2 id2 322 @return < 0 if n1 < n2, < 0 if n1 < n2, > 0 if n1 > n2 */ 323 UNIV_INLINE 324 int fts_update_doc_id_cmp(const void *p1, const void *p2); 325 326 /** Decode and return the integer that was encoded using our VLC scheme.*/ 327 UNIV_INLINE 328 ulint fts_decode_vlc( 329 /*!< out: value decoded */ 330 byte **ptr); /*!< in: ptr to decode from, this ptr is 331 incremented by the number of bytes decoded */ 332 333 /** Duplicate a string. 334 @param[in] dst dup to here 335 @param[in] src src string 336 @param[in] heap heap to use 337 */ 338 UNIV_INLINE 339 void fts_string_dup(fts_string_t *dst, const fts_string_t *src, 340 mem_heap_t *heap); 341 342 /** Return length of val if it were encoded using our VLC scheme. */ 343 UNIV_INLINE 344 ulint fts_get_encoded_len( 345 /*!< out: length of value 346 encoded, in bytes */ 347 ulint val); /*!< in: value to encode */ 348 349 /** Encode an integer using our VLC scheme and return the length in bytes. 350 @param[in] val value to encode 351 @param[in] buf buffer, must have enough space 352 @return length of value encoded, in bytes */ 353 UNIV_INLINE 354 ulint fts_encode_int(ulint val, byte *buf); 355 356 /** Get the selected FTS aux INDEX suffix. */ 357 UNIV_INLINE 358 const char *fts_get_suffix(ulint selected); /*!< in: selected index */ 359 360 /** Return the selected FTS aux index suffix in 5.7 compatible format 361 @param[in] selected selected index 362 @return the suffix name */ 363 UNIV_INLINE 364 const char *fts_get_suffix_5_7(ulint selected); 365 366 /** Select the FTS auxiliary index for the given character. 367 @param[in] cs charset 368 @param[in] str string 369 @param[in] len string length in bytes 370 @return the index to use for the string */ 371 UNIV_INLINE 372 ulint fts_select_index(const CHARSET_INFO *cs, const byte *str, ulint len); 373 374 #include "fts0types.ic" 375 #include "fts0vlc.ic" 376 377 #endif /* INNOBASE_FTS0TYPES_H */ 378