1 /***************************************************************************** 2 3 Copyright (c) 2007, 2019, Oracle and/or its affiliates. All Rights Reserved. 4 5 This program is free software; you can redistribute it and/or modify 6 it under the terms of the GNU General Public License, version 2.0, 7 as published by the Free Software Foundation. 8 9 This program is also distributed with certain software (including 10 but not limited to OpenSSL) that is licensed under separate terms, 11 as designated in a particular file or component or in included license 12 documentation. The authors of MySQL hereby grant you an additional 13 permission to link the program and your derivative works with the 14 separately licensed software that they have included with MySQL. 15 16 This program is distributed in the hope that it will be useful, 17 but WITHOUT ANY WARRANTY; without even the implied warranty of 18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 GNU General Public License, version 2.0, for more details. 20 21 You should have received a copy of the GNU General Public License along with 22 this program; if not, write to the Free Software Foundation, Inc., 23 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA 24 25 *****************************************************************************/ 26 27 /******************************************************************//** 28 @file include/fts0types.h 29 Full text search types file 30 31 Created 2007-03-27 Sunny Bains 32 *******************************************************/ 33 34 #ifndef INNOBASE_FTS0TYPES_H 35 #define INNOBASE_FTS0TYPES_H 36 37 #include "univ.i" 38 #include "fts0fts.h" 39 #include "fut0fut.h" 40 #include "pars0pars.h" 41 #include "que0types.h" 42 #include "ut0byte.h" 43 #include "ut0rbt.h" 44 45 /** Types used within FTS. */ 46 struct fts_que_t; 47 struct fts_node_t; 48 49 /** Callbacks used within FTS. */ 50 typedef pars_user_func_cb_t fts_sql_callback; 51 typedef void (*fts_filter)(void*, fts_node_t*, void*, ulint len); 52 53 /** Statistics relevant to a particular document, used during retrieval. */ 54 struct fts_doc_stats_t { 55 doc_id_t doc_id; /*!< Document id */ 56 ulint word_count; /*!< Total words in the document */ 57 }; 58 59 /** It's main purpose is to store the SQL prepared statements that 60 are required to retrieve a document from the database. */ 61 struct fts_get_doc_t { 62 fts_index_cache_t* 63 index_cache; /*!< The index cache instance */ 64 65 /*!< Parsed sql statement */ 66 que_t* get_document_graph; 67 fts_cache_t* cache; /*!< The parent cache */ 68 }; 69 70 /** Since we can have multiple FTS indexes on a table, we keep a 71 per index cache of words etc. */ 72 struct fts_index_cache_t { 73 dict_index_t* index; /*!< The FTS index instance */ 74 75 ib_rbt_t* words; /*!< Nodes; indexed by fts_string_t*, 76 cells are fts_tokenizer_word_t*.*/ 77 78 ib_vector_t* doc_stats; /*!< Array of the fts_doc_stats_t 79 contained in the memory buffer. 80 Must be in sorted order (ascending). 81 The ideal choice is an rb tree but 82 the rb tree imposes a space overhead 83 that we can do without */ 84 85 que_t** ins_graph; /*!< Insert query graphs */ 86 87 que_t** sel_graph; /*!< Select query graphs */ 88 CHARSET_INFO* charset; /*!< charset */ 89 }; 90 91 /** For supporting the tracking of updates on multiple FTS indexes we need 92 to track which FTS indexes need to be updated. For INSERT and DELETE we 93 update all fts indexes. */ 94 struct fts_update_t { 95 doc_id_t doc_id; /*!< The doc id affected */ 96 97 ib_vector_t* fts_indexes; /*!< The FTS indexes that need to be 98 updated. A NULL value means all 99 indexes need to be updated. This 100 vector is not allocated on the heap 101 and so must be freed explicitly, 102 when we are done with it */ 103 }; 104 105 /** Stop word control infotmation. */ 106 struct fts_stopword_t { 107 ulint status; /*!< Status of the stopword tree */ 108 ib_alloc_t* heap; /*!< The memory allocator to use */ 109 ib_rbt_t* cached_stopword;/*!< This stores all active stopwords */ 110 CHARSET_INFO* charset; /*!< charset for stopword */ 111 }; 112 113 /** The SYNC state of the cache. There is one instance of this struct 114 associated with each ADD thread. */ 115 struct fts_sync_t { 116 trx_t* trx; /*!< The transaction used for SYNCing 117 the cache to disk */ 118 dict_table_t* table; /*!< Table with FTS index(es) */ 119 ulint max_cache_size; /*!< Max size in bytes of the cache */ 120 ibool cache_full; /*!< flag, when true it indicates that 121 we need to sync the cache to disk */ 122 ulint lower_index; /*!< the start index of the doc id 123 vector from where to start adding 124 documents to the FTS cache */ 125 ulint upper_index; /*!< max index of the doc id vector to 126 add to the FTS cache */ 127 ibool interrupted; /*!< TRUE if SYNC was interrupted */ 128 doc_id_t min_doc_id; /*!< The smallest doc id added to the 129 cache. It should equal to 130 doc_ids[lower_index] */ 131 doc_id_t max_doc_id; /*!< The doc id at which the cache was 132 noted as being full, we use this to 133 set the upper_limit field */ 134 ib_time_monotonic_t start_time; 135 /*!< SYNC start time */ 136 bool in_progress; /*!< flag whether sync is in progress.*/ 137 bool unlock_cache; /*!< flag whether unlock cache when 138 write fts node */ 139 os_event_t event; /*!< sync finish event */ 140 }; 141 142 /** The cache for the FTS system. It is a memory-based inverted index 143 that new entries are added to, until it grows over the configured maximum 144 size, at which time its contents are written to the INDEX table. */ 145 struct fts_cache_t { 146 rw_lock_t lock; /*!< lock protecting all access to the 147 memory buffer. FIXME: this needs to 148 be our new upgrade-capable rw-lock */ 149 150 rw_lock_t init_lock; /*!< lock used for the cache 151 intialization, it has different 152 SYNC level as above cache lock */ 153 154 ib_mutex_t optimize_lock; /*!< Lock for OPTIMIZE */ 155 156 ib_mutex_t deleted_lock; /*!< Lock covering deleted_doc_ids */ 157 158 ib_mutex_t doc_id_lock; /*!< Lock covering Doc ID */ 159 160 ib_vector_t* deleted_doc_ids;/*!< Array of deleted doc ids, each 161 element is of type fts_update_t */ 162 163 ib_vector_t* indexes; /*!< We store the stats and inverted 164 index for the individual FTS indexes 165 in this vector. Each element is 166 an instance of fts_index_cache_t */ 167 168 ib_vector_t* get_docs; /*!< information required to read 169 the document from the table. Each 170 element is of type fts_doc_t */ 171 172 ulint total_size; /*!< total size consumed by the ilist 173 field of all nodes. SYNC is run 174 whenever this gets too big */ 175 fts_sync_t* sync; /*!< sync structure to sync data to 176 disk */ 177 ib_alloc_t* sync_heap; /*!< The heap allocator, for indexes 178 and deleted_doc_ids, ie. transient 179 objects, they are recreated after 180 a SYNC is completed */ 181 182 ib_alloc_t* self_heap; /*!< This heap is the heap out of 183 which an instance of the cache itself 184 was created. Objects created using 185 this heap will last for the lifetime 186 of the cache */ 187 188 doc_id_t next_doc_id; /*!< Next doc id */ 189 190 doc_id_t synced_doc_id; /*!< Doc ID sync-ed to CONFIG table */ 191 192 doc_id_t first_doc_id; /*!< first doc id since this table 193 was opened */ 194 195 ulint deleted; /*!< Number of doc ids deleted since 196 last optimized. This variable is 197 covered by deleted_lock */ 198 199 ulint added; /*!< Number of doc ids added since last 200 optimized. This variable is covered by 201 the deleted lock */ 202 203 fts_stopword_t stopword_info; /*!< Cached stopwords for the FTS */ 204 mem_heap_t* cache_heap; /*!< Cache Heap */ 205 }; 206 207 /** Columns of the FTS auxiliary INDEX table */ 208 struct fts_node_t { 209 doc_id_t first_doc_id; /*!< First document id in ilist. */ 210 211 doc_id_t last_doc_id; /*!< Last document id in ilist. */ 212 213 byte* ilist; /*!< Binary list of documents & word 214 positions the token appears in. 215 TODO: For now, these are simply 216 ut_malloc'd, but if testing shows 217 that they waste memory unacceptably, a 218 special memory allocator will have 219 to be written */ 220 221 ulint doc_count; /*!< Number of doc ids in ilist */ 222 223 ulint ilist_size; /*!< Used size of ilist in bytes. */ 224 225 ulint ilist_size_alloc; 226 /*!< Allocated size of ilist in 227 bytes */ 228 bool synced; /*!< flag whether the node is synced */ 229 }; 230 231 /** A tokenizer word. Contains information about one word. */ 232 struct fts_tokenizer_word_t { 233 fts_string_t text; /*!< Token text. */ 234 235 ib_vector_t* nodes; /*!< Word node ilists, each element is 236 of type fts_node_t */ 237 }; 238 239 /** Word text plus it's array of nodes as on disk in FTS index */ 240 struct fts_word_t { 241 fts_string_t text; /*!< Word value in UTF-8 */ 242 ib_vector_t* nodes; /*!< Nodes read from disk */ 243 244 ib_alloc_t* heap_alloc; /*!< For handling all allocations */ 245 }; 246 247 /** Callback for reading and filtering nodes that are read from FTS index */ 248 struct fts_fetch_t { 249 void* read_arg; /*!< Arg for the sql_callback */ 250 251 fts_sql_callback 252 read_record; /*!< Callback for reading index 253 record */ 254 ulint total_memory; /*!< Total memory used */ 255 }; 256 257 /** For horizontally splitting an FTS auxiliary index */ 258 struct fts_index_selector_t { 259 ulint value; /*!< Character value at which 260 to split */ 261 262 const char* suffix; /*!< FTS aux index suffix */ 263 }; 264 265 /** This type represents a single document. */ 266 struct fts_doc_t { 267 fts_string_t text; /*!< document text */ 268 269 ibool found; /*!< TRUE if the document was found 270 successfully in the database */ 271 272 ib_rbt_t* tokens; /*!< This is filled when the document 273 is tokenized. Tokens; indexed by 274 fts_string_t*, cells are of type 275 fts_token_t* */ 276 277 ib_alloc_t* self_heap; /*!< An instance of this type is 278 allocated from this heap along 279 with any objects that have the 280 same lifespan, most notably 281 the vector of token positions */ 282 CHARSET_INFO* charset; /*!< Document's charset info */ 283 284 st_mysql_ftparser* parser; /*!< fts plugin parser */ 285 286 bool is_ngram; /*!< Whether it is a ngram parser */ 287 288 ib_rbt_t* stopwords; /*!< Stopwords */ 289 }; 290 291 /** A token and its positions within a document. */ 292 struct fts_token_t { 293 fts_string_t text; /*!< token text */ 294 295 ib_vector_t* positions; /*!< an array of the positions the 296 token is found in; each item is 297 actually an ulint. */ 298 }; 299 300 /** It's defined in fts/fts0fts.c */ 301 extern const fts_index_selector_t fts_index_selector[]; 302 303 /******************************************************************//** 304 Compare two fts_trx_row_t instances doc_ids. */ 305 UNIV_INLINE 306 int 307 fts_trx_row_doc_id_cmp( 308 /*===================*/ 309 /*!< out: 310 < 0 if n1 < n2, 311 0 if n1 == n2, 312 > 0 if n1 > n2 */ 313 const void* p1, /*!< in: id1 */ 314 const void* p2); /*!< in: id2 */ 315 316 /******************************************************************//** 317 Compare two fts_ranking_t instances doc_ids. */ 318 UNIV_INLINE 319 int 320 fts_ranking_doc_id_cmp( 321 /*===================*/ 322 /*!< out: 323 < 0 if n1 < n2, 324 0 if n1 == n2, 325 > 0 if n1 > n2 */ 326 const void* p1, /*!< in: id1 */ 327 const void* p2); /*!< in: id2 */ 328 329 /******************************************************************//** 330 Compare two fts_update_t instances doc_ids. */ 331 UNIV_INLINE 332 int 333 fts_update_doc_id_cmp( 334 /*==================*/ 335 /*!< out: 336 < 0 if n1 < n2, 337 0 if n1 == n2, 338 > 0 if n1 > n2 */ 339 const void* p1, /*!< in: id1 */ 340 const void* p2); /*!< in: id2 */ 341 342 /******************************************************************//** 343 Decode and return the integer that was encoded using our VLC scheme.*/ 344 UNIV_INLINE 345 ulint 346 fts_decode_vlc( 347 /*===========*/ 348 /*!< out: value decoded */ 349 byte** ptr); /*!< in: ptr to decode from, this ptr is 350 incremented by the number of bytes decoded */ 351 352 /******************************************************************//** 353 Duplicate a string. */ 354 UNIV_INLINE 355 void 356 fts_string_dup( 357 /*===========*/ 358 /*!< out: 359 < 0 if n1 < n2, 360 0 if n1 == n2, 361 > 0 if n1 > n2 */ 362 fts_string_t* dst, /*!< in: dup to here */ 363 const fts_string_t* src, /*!< in: src string */ 364 mem_heap_t* heap); /*!< in: heap to use */ 365 366 /******************************************************************//** 367 Return length of val if it were encoded using our VLC scheme. */ 368 UNIV_INLINE 369 ulint 370 fts_get_encoded_len( 371 /*================*/ 372 /*!< out: length of value 373 encoded, in bytes */ 374 ulint val); /*!< in: value to encode */ 375 376 /******************************************************************//** 377 Encode an integer using our VLC scheme and return the length in bytes. */ 378 UNIV_INLINE 379 ulint 380 fts_encode_int( 381 /*===========*/ 382 /*!< out: length of value 383 encoded, in bytes */ 384 ulint val, /*!< in: value to encode */ 385 byte* buf); /*!< in: buffer, must have 386 enough space */ 387 388 /******************************************************************//** 389 Get the selected FTS aux INDEX suffix. */ 390 UNIV_INLINE 391 const char* 392 fts_get_suffix( 393 /*===========*/ 394 ulint selected); /*!< in: selected index */ 395 396 /** Select the FTS auxiliary index for the given character. 397 @param[in] cs charset 398 @param[in] str string 399 @param[in] len string length in bytes 400 @return the index to use for the string */ 401 UNIV_INLINE 402 ulint 403 fts_select_index( 404 const CHARSET_INFO* cs, 405 const byte* str, 406 ulint len); 407 408 #ifndef UNIV_NONINL 409 #include "fts0types.ic" 410 #include "fts0vlc.ic" 411 #endif 412 413 #endif /* INNOBASE_FTS0TYPES_H */ 414