1 /***************************************************************************** 2 3 Copyright (c) 2007, 2016, Oracle and/or its affiliates. All Rights Reserved. 4 5 This program is free software; you can redistribute it and/or modify 6 it under the terms of the GNU General Public License, version 2.0, 7 as published by the Free Software Foundation. 8 9 This program is also distributed with certain software (including 10 but not limited to OpenSSL) that is licensed under separate terms, 11 as designated in a particular file or component or in included license 12 documentation. The authors of MySQL hereby grant you an additional 13 permission to link the program and your derivative works with the 14 separately licensed software that they have included with MySQL. 15 16 This program is distributed in the hope that it will be useful, 17 but WITHOUT ANY WARRANTY; without even the implied warranty of 18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 GNU General Public License, version 2.0, for more details. 20 21 You should have received a copy of the GNU General Public License along with 22 this program; if not, write to the Free Software Foundation, Inc., 23 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA 24 25 *****************************************************************************/ 26 27 /******************************************************************//** 28 @file include/fts0types.h 29 Full text search types file 30 31 Created 2007-03-27 Sunny Bains 32 *******************************************************/ 33 34 #ifndef INNOBASE_FTS0TYPES_H 35 #define INNOBASE_FTS0TYPES_H 36 37 #include "que0types.h" 38 #include "ut0byte.h" 39 #include "fut0fut.h" 40 #include "ut0rbt.h" 41 #include "fts0fts.h" 42 43 /** Types used within FTS. */ 44 struct fts_que_t; 45 struct fts_node_t; 46 struct fts_utf8_str_t; 47 48 /** Callbacks used within FTS. */ 49 typedef pars_user_func_cb_t fts_sql_callback; 50 typedef void (*fts_filter)(void*, fts_node_t*, void*, ulint len); 51 52 /** Statistics relevant to a particular document, used during retrieval. */ 53 struct fts_doc_stats_t { 54 doc_id_t doc_id; /*!< Document id */ 55 ulint word_count; /*!< Total words in the document */ 56 }; 57 58 /** It's main purpose is to store the SQL prepared statements that 59 are required to retrieve a document from the database. */ 60 struct fts_get_doc_t { 61 fts_index_cache_t* 62 index_cache; /*!< The index cache instance */ 63 64 /*!< Parsed sql statement */ 65 que_t* get_document_graph; 66 fts_cache_t* cache; /*!< The parent cache */ 67 }; 68 69 /** Since we can have multiple FTS indexes on a table, we keep a 70 per index cache of words etc. */ 71 struct fts_index_cache_t { 72 dict_index_t* index; /*!< The FTS index instance */ 73 74 ib_rbt_t* words; /*!< Nodes; indexed by fts_string_t*, 75 cells are fts_tokenizer_word_t*.*/ 76 77 ib_vector_t* doc_stats; /*!< Array of the fts_doc_stats_t 78 contained in the memory buffer. 79 Must be in sorted order (ascending). 80 The ideal choice is an rb tree but 81 the rb tree imposes a space overhead 82 that we can do without */ 83 84 que_t** ins_graph; /*!< Insert query graphs */ 85 86 que_t** sel_graph; /*!< Select query graphs */ 87 CHARSET_INFO* charset; /*!< charset */ 88 }; 89 90 /** For supporting the tracking of updates on multiple FTS indexes we need 91 to track which FTS indexes need to be updated. For INSERT and DELETE we 92 update all fts indexes. */ 93 struct fts_update_t { 94 doc_id_t doc_id; /*!< The doc id affected */ 95 96 ib_vector_t* fts_indexes; /*!< The FTS indexes that need to be 97 updated. A NULL value means all 98 indexes need to be updated. This 99 vector is not allocated on the heap 100 and so must be freed explicitly, 101 when we are done with it */ 102 }; 103 104 /** Stop word control infotmation. */ 105 struct fts_stopword_t { 106 ulint status; /*!< Status of the stopword tree */ 107 ib_alloc_t* heap; /*!< The memory allocator to use */ 108 ib_rbt_t* cached_stopword;/*!< This stores all active stopwords */ 109 CHARSET_INFO* charset; /*!< charset for stopword */ 110 }; 111 112 /** The SYNC state of the cache. There is one instance of this struct 113 associated with each ADD thread. */ 114 struct fts_sync_t { 115 trx_t* trx; /*!< The transaction used for SYNCing 116 the cache to disk */ 117 dict_table_t* table; /*!< Table with FTS index(es) */ 118 ulint max_cache_size; /*!< Max size in bytes of the cache */ 119 ibool cache_full; /*!< flag, when true it indicates that 120 we need to sync the cache to disk */ 121 ulint lower_index; /*!< the start index of the doc id 122 vector from where to start adding 123 documents to the FTS cache */ 124 ulint upper_index; /*!< max index of the doc id vector to 125 add to the FTS cache */ 126 ibool interrupted; /*!< TRUE if SYNC was interrupted */ 127 doc_id_t min_doc_id; /*!< The smallest doc id added to the 128 cache. It should equal to 129 doc_ids[lower_index] */ 130 doc_id_t max_doc_id; /*!< The doc id at which the cache was 131 noted as being full, we use this to 132 set the upper_limit field */ 133 ib_time_t start_time; /*!< SYNC start time */ 134 bool in_progress; /*!< flag whether sync is in progress.*/ 135 bool unlock_cache; /*!< flag whether unlock cache when 136 write fts node */ 137 os_event_t event; /*!< sync finish event */ 138 }; 139 140 /** The cache for the FTS system. It is a memory-based inverted index 141 that new entries are added to, until it grows over the configured maximum 142 size, at which time its contents are written to the INDEX table. */ 143 struct fts_cache_t { 144 rw_lock_t lock; /*!< lock protecting all access to the 145 memory buffer. FIXME: this needs to 146 be our new upgrade-capable rw-lock */ 147 148 rw_lock_t init_lock; /*!< lock used for the cache 149 intialization, it has different 150 SYNC level as above cache lock */ 151 152 ib_mutex_t optimize_lock; /*!< Lock for OPTIMIZE */ 153 154 ib_mutex_t deleted_lock; /*!< Lock covering deleted_doc_ids */ 155 156 ib_mutex_t doc_id_lock; /*!< Lock covering Doc ID */ 157 158 ib_vector_t* deleted_doc_ids;/*!< Array of deleted doc ids, each 159 element is of type fts_update_t */ 160 161 ib_vector_t* indexes; /*!< We store the stats and inverted 162 index for the individual FTS indexes 163 in this vector. Each element is 164 an instance of fts_index_cache_t */ 165 166 ib_vector_t* get_docs; /*!< information required to read 167 the document from the table. Each 168 element is of type fts_doc_t */ 169 170 ulint total_size; /*!< total size consumed by the ilist 171 field of all nodes. SYNC is run 172 whenever this gets too big */ 173 fts_sync_t* sync; /*!< sync structure to sync data to 174 disk */ 175 ib_alloc_t* sync_heap; /*!< The heap allocator, for indexes 176 and deleted_doc_ids, ie. transient 177 objects, they are recreated after 178 a SYNC is completed */ 179 180 ib_alloc_t* self_heap; /*!< This heap is the heap out of 181 which an instance of the cache itself 182 was created. Objects created using 183 this heap will last for the lifetime 184 of the cache */ 185 186 doc_id_t next_doc_id; /*!< Next doc id */ 187 188 doc_id_t synced_doc_id; /*!< Doc ID sync-ed to CONFIG table */ 189 190 doc_id_t first_doc_id; /*!< first doc id since this table 191 was opened */ 192 193 ulint deleted; /*!< Number of doc ids deleted since 194 last optimized. This variable is 195 covered by deleted_lock */ 196 197 ulint added; /*!< Number of doc ids added since last 198 optimized. This variable is covered by 199 the deleted lock */ 200 201 fts_stopword_t stopword_info; /*!< Cached stopwords for the FTS */ 202 mem_heap_t* cache_heap; /*!< Cache Heap */ 203 }; 204 205 /** Columns of the FTS auxiliary INDEX table */ 206 struct fts_node_t { 207 doc_id_t first_doc_id; /*!< First document id in ilist. */ 208 209 doc_id_t last_doc_id; /*!< Last document id in ilist. */ 210 211 byte* ilist; /*!< Binary list of documents & word 212 positions the token appears in. 213 TODO: For now, these are simply 214 ut_malloc'd, but if testing shows 215 that they waste memory unacceptably, a 216 special memory allocator will have 217 to be written */ 218 219 ulint doc_count; /*!< Number of doc ids in ilist */ 220 221 ulint ilist_size; /*!< Used size of ilist in bytes. */ 222 223 ulint ilist_size_alloc; 224 /*!< Allocated size of ilist in 225 bytes */ 226 bool synced; /*!< flag whether the node is synced */ 227 }; 228 229 /** A tokenizer word. Contains information about one word. */ 230 struct fts_tokenizer_word_t { 231 fts_string_t text; /*!< Token text. */ 232 233 ib_vector_t* nodes; /*!< Word node ilists, each element is 234 of type fts_node_t */ 235 }; 236 237 /** Word text plus it's array of nodes as on disk in FTS index */ 238 struct fts_word_t { 239 fts_string_t text; /*!< Word value in UTF-8 */ 240 ib_vector_t* nodes; /*!< Nodes read from disk */ 241 242 ib_alloc_t* heap_alloc; /*!< For handling all allocations */ 243 }; 244 245 /** Callback for reading and filtering nodes that are read from FTS index */ 246 struct fts_fetch_t { 247 void* read_arg; /*!< Arg for the sql_callback */ 248 249 fts_sql_callback 250 read_record; /*!< Callback for reading index 251 record */ 252 ulint total_memory; /*!< Total memory used */ 253 }; 254 255 /** For horizontally splitting an FTS auxiliary index */ 256 struct fts_index_selector_t { 257 ulint value; /*!< Character value at which 258 to split */ 259 260 const char* suffix; /*!< FTS aux index suffix */ 261 }; 262 263 /** This type represents a single document. */ 264 struct fts_doc_t { 265 fts_string_t text; /*!< document text */ 266 267 ibool found; /*!< TRUE if the document was found 268 successfully in the database */ 269 270 ib_rbt_t* tokens; /*!< This is filled when the document 271 is tokenized. Tokens; indexed by 272 fts_string_t*, cells are of type 273 fts_token_t* */ 274 275 ib_alloc_t* self_heap; /*!< An instance of this type is 276 allocated from this heap along 277 with any objects that have the 278 same lifespan, most notably 279 the vector of token positions */ 280 CHARSET_INFO* charset; /*!< Document's charset info */ 281 }; 282 283 /** A token and its positions within a document. */ 284 struct fts_token_t { 285 fts_string_t text; /*!< token text */ 286 287 ib_vector_t* positions; /*!< an array of the positions the 288 token is found in; each item is 289 actually an ulint. */ 290 }; 291 292 /** It's defined in fts/fts0fts.c */ 293 extern const fts_index_selector_t fts_index_selector[]; 294 295 /******************************************************************//** 296 Compare two UTF-8 strings. */ 297 UNIV_INLINE 298 int 299 fts_utf8_string_cmp( 300 /*================*/ 301 /*!< out: 302 < 0 if n1 < n2, 303 0 if n1 == n2, 304 > 0 if n1 > n2 */ 305 const void* p1, /*!< in: key */ 306 const void* p2); /*!< in: node */ 307 308 /******************************************************************//** 309 Compare two UTF-8 strings, and return match (0) if 310 passed in "key" value equals or is the prefix of the "node" value. */ 311 UNIV_INLINE 312 int 313 fts_utf8_string_cmp_prefix( 314 /*=======================*/ 315 /*!< out: 316 < 0 if n1 < n2, 317 0 if n1 == n2, 318 > 0 if n1 > n2 */ 319 const void* p1, /*!< in: key */ 320 const void* p2); /*!< in: node */ 321 322 /******************************************************************//** 323 Compare two fts_trx_row_t instances doc_ids. */ 324 UNIV_INLINE 325 int 326 fts_trx_row_doc_id_cmp( 327 /*===================*/ 328 /*!< out: 329 < 0 if n1 < n2, 330 0 if n1 == n2, 331 > 0 if n1 > n2 */ 332 const void* p1, /*!< in: id1 */ 333 const void* p2); /*!< in: id2 */ 334 335 /******************************************************************//** 336 Compare two fts_ranking_t instances doc_ids. */ 337 UNIV_INLINE 338 int 339 fts_ranking_doc_id_cmp( 340 /*===================*/ 341 /*!< out: 342 < 0 if n1 < n2, 343 0 if n1 == n2, 344 > 0 if n1 > n2 */ 345 const void* p1, /*!< in: id1 */ 346 const void* p2); /*!< in: id2 */ 347 348 /******************************************************************//** 349 Compare two fts_update_t instances doc_ids. */ 350 UNIV_INLINE 351 int 352 fts_update_doc_id_cmp( 353 /*==================*/ 354 /*!< out: 355 < 0 if n1 < n2, 356 0 if n1 == n2, 357 > 0 if n1 > n2 */ 358 const void* p1, /*!< in: id1 */ 359 const void* p2); /*!< in: id2 */ 360 361 /******************************************************************//** 362 Decode and return the integer that was encoded using our VLC scheme.*/ 363 UNIV_INLINE 364 ulint 365 fts_decode_vlc( 366 /*===========*/ 367 /*!< out: value decoded */ 368 byte** ptr); /*!< in: ptr to decode from, this ptr is 369 incremented by the number of bytes decoded */ 370 371 /******************************************************************//** 372 Duplicate an UTF-8 string. */ 373 UNIV_INLINE 374 void 375 fts_utf8_string_dup( 376 /*================*/ 377 /*!< out: 378 < 0 if n1 < n2, 379 0 if n1 == n2, 380 > 0 if n1 > n2 */ 381 fts_string_t* dst, /*!< in: dup to here */ 382 const fts_string_t* src, /*!< in: src string */ 383 mem_heap_t* heap); /*!< in: heap to use */ 384 385 /******************************************************************//** 386 Return length of val if it were encoded using our VLC scheme. */ 387 UNIV_INLINE 388 ulint 389 fts_get_encoded_len( 390 /*================*/ 391 /*!< out: length of value 392 encoded, in bytes */ 393 ulint val); /*!< in: value to encode */ 394 395 /******************************************************************//** 396 Encode an integer using our VLC scheme and return the length in bytes. */ 397 UNIV_INLINE 398 ulint 399 fts_encode_int( 400 /*===========*/ 401 /*!< out: length of value 402 encoded, in bytes */ 403 ulint val, /*!< in: value to encode */ 404 byte* buf); /*!< in: buffer, must have 405 enough space */ 406 407 /******************************************************************//** 408 Decode a UTF-8 character. 409 410 http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf: 411 412 Scalar Value 1st Byte 2nd Byte 3rd Byte 4th Byte 413 00000000 0xxxxxxx 0xxxxxxx 414 00000yyy yyxxxxxx 110yyyyy 10xxxxxx 415 zzzzyyyy yyxxxxxx 1110zzzz 10yyyyyy 10xxxxxx 416 000uuuzz zzzzyyyy yyxxxxxx 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx 417 418 This function decodes UTF-8 sequences up to 6 bytes (31 bits). 419 420 On error *ptr will point to the first byte that was not correctly 421 decoded. This will hopefully help in resyncing the input. */ 422 UNIV_INLINE 423 ulint 424 fts_utf8_decode( 425 /*============*/ 426 /*!< out: UTF8_ERROR if *ptr 427 did not point to a valid 428 UTF-8 sequence, or the 429 Unicode code point. */ 430 const byte** ptr); /*!< in/out: pointer to 431 UTF-8 string. The 432 pointer is advanced to 433 the start of the next 434 character. */ 435 436 /******************************************************************//** 437 Lowercase an UTF-8 string. */ 438 UNIV_INLINE 439 void 440 fts_utf8_tolower( 441 /*=============*/ 442 fts_string_t* str); /*!< in: string */ 443 444 /******************************************************************//** 445 Get the selected FTS aux INDEX suffix. */ 446 UNIV_INLINE 447 const char* 448 fts_get_suffix( 449 /*===========*/ 450 ulint selected); /*!< in: selected index */ 451 452 /******************************************************************** 453 Get the number of index selectors. */ 454 UNIV_INLINE 455 ulint 456 fts_get_n_selectors(void); 457 /*=====================*/ 458 459 /******************************************************************//** 460 Select the FTS auxiliary index for the given string. 461 @return the index to use for the string */ 462 UNIV_INLINE 463 ulint 464 fts_select_index( 465 /*=============*/ 466 const CHARSET_INFO* cs, /*!< Charset */ 467 const byte* str, /*!< in: word string */ 468 ulint len); /*!< in: string length */ 469 470 /******************************************************************** 471 Select the next FTS auxiliary index for the given character. 472 @return the next index to use for character */ 473 UNIV_INLINE 474 ulint 475 fts_select_next_index( 476 /*==================*/ 477 const CHARSET_INFO* cs, /*!< Charset */ 478 const byte* str, /*!< in: string */ 479 ulint len); /*!< in: string length */ 480 481 #ifndef UNIV_NONINL 482 #include "fts0types.ic" 483 #include "fts0vlc.ic" 484 #endif 485 486 #endif /* INNOBASE_FTS0TYPES_H */ 487