1/***************************************************************************** 2 3Copyright (c) 2007, 2011, Oracle and/or its affiliates. All Rights Reserved. 4 5This program is free software; you can redistribute it and/or modify 6it under the terms of the GNU General Public License, version 2.0, 7as published by the Free Software Foundation. 8 9This program is also distributed with certain software (including 10but not limited to OpenSSL) that is licensed under separate terms, 11as designated in a particular file or component or in included license 12documentation. The authors of MySQL hereby grant you an additional 13permission to link the program and your derivative works with the 14separately licensed software that they have included with MySQL. 15 16This program is distributed in the hope that it will be useful, 17but WITHOUT ANY WARRANTY; without even the implied warranty of 18MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19GNU General Public License, version 2.0, for more details. 20 21You should have received a copy of the GNU General Public License along with 22this program; if not, write to the Free Software Foundation, Inc., 2351 Franklin Street, Suite 500, Boston, MA 02110-1335 USA 24 25*****************************************************************************/ 26 27/******************************************************************//** 28@file include/fts0types.ic 29Full text search types. 30 31Created 2007-03-27 Sunny Bains 32*******************************************************/ 33 34#ifndef INNOBASE_FTS0TYPES_IC 35#define INNOBASE_FTS0TYPES_IC 36 37#include <ctype.h> 38 39#include "rem0cmp.h" 40#include "ha_prototypes.h" 41 42extern const ulint UTF8_ERROR; 43 44/* Determine if a UTF-8 continuation byte is valid. */ 45#define fts_utf8_is_valid(b) (((b) & 0xC0) == 0x80) 46 47/******************************************************************//** 48Duplicate an UTF-8 string. 49@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */ 50UNIV_INLINE 51void 52fts_utf8_string_dup( 53/*================*/ 54 fts_string_t* dst, /*!< in: dup to here */ 55 const fts_string_t* src, /*!< in: src string */ 56 mem_heap_t* heap) /*!< in: heap to use */ 57{ 58 dst->f_str = (byte*)mem_heap_alloc(heap, src->f_len + 1); 59 memcpy(dst->f_str, src->f_str, src->f_len); 60 61 dst->f_len = src->f_len; 62 dst->f_str[src->f_len] = 0; 63 dst->f_n_char = src->f_n_char; 64} 65 66/******************************************************************//** 67Compare two fts_trx_row_t doc_ids. 68@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */ 69UNIV_INLINE 70int 71fts_trx_row_doc_id_cmp( 72/*===================*/ 73 const void* p1, /*!< in: id1 */ 74 const void* p2) /*!< in: id2 */ 75{ 76 const fts_trx_row_t* tr1 = (const fts_trx_row_t*) p1; 77 const fts_trx_row_t* tr2 = (const fts_trx_row_t*) p2; 78 79 return((int)(tr1->doc_id - tr2->doc_id)); 80} 81 82/******************************************************************//** 83Compare two fts_ranking_t doc_ids. 84@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */ 85UNIV_INLINE 86int 87fts_ranking_doc_id_cmp( 88/*===================*/ 89 const void* p1, /*!< in: id1 */ 90 const void* p2) /*!< in: id2 */ 91{ 92 const fts_ranking_t* rk1 = (const fts_ranking_t*) p1; 93 const fts_ranking_t* rk2 = (const fts_ranking_t*) p2; 94 95 return((int)(rk1->doc_id - rk2->doc_id)); 96} 97 98/******************************************************************//** 99Compare two fts_update_t doc_ids. 100@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */ 101UNIV_INLINE 102int 103fts_update_doc_id_cmp( 104/*==================*/ 105 const void* p1, /*!< in: id1 */ 106 const void* p2) /*!< in: id2 */ 107{ 108 const fts_update_t* up1 = (const fts_update_t*) p1; 109 const fts_update_t* up2 = (const fts_update_t*) p2; 110 111 return((int)(up1->doc_id - up2->doc_id)); 112} 113 114 115/******************************************************************//** 116Lowercase an UTF-8 string. */ 117UNIV_INLINE 118void 119fts_utf8_tolower( 120/*=============*/ 121 fts_string_t* str) /*!< in: string */ 122{ 123 innobase_casedn_str((char*) str->f_str); 124} 125 126/******************************************************************//** 127Compare two UTF-8 strings. 128@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */ 129UNIV_INLINE 130int 131fts_utf8_string_cmp( 132/*================*/ 133 const void* p1, /*!< in: key */ 134 const void* p2) /*!< in: node */ 135{ 136 const fts_string_t* s1 = (const fts_string_t*) p1; 137 const fts_string_t* s2 = (const fts_string_t*) p2; 138 139 return(cmp_data_data_slow_varchar( 140 s1->f_str, s1->f_len, s2->f_str, s2->f_len)); 141} 142 143/******************************************************************//** 144Compare two UTF-8 strings, and return match (0) if 145passed in "key" value equals or is the prefix of the "node" value. 146@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */ 147UNIV_INLINE 148int 149fts_utf8_string_cmp_prefix( 150/*=======================*/ 151 const void* p1, /*!< in: key */ 152 const void* p2) /*!< in: node */ 153{ 154 int result; 155 ulint len; 156 157 const fts_string_t* s1 = (const fts_string_t*) p1; 158 const fts_string_t* s2 = (const fts_string_t*) p2; 159 160 len = ut_min(s1->f_len, s2->f_len); 161 162 result = cmp_data_data_slow_varchar(s1->f_str, len, s2->f_str, len); 163 164 if (result) { 165 return(result); 166 } 167 168 if (s1->f_len > s2->f_len) { 169 return(1); 170 } 171 172 return(0); 173} 174 175/******************************************************************//** 176Decode a UTF-8 character. 177 178http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf: 179 180 Scalar Value 1st Byte 2nd Byte 3rd Byte 4th Byte 18100000000 0xxxxxxx 0xxxxxxx 18200000yyy yyxxxxxx 110yyyyy 10xxxxxx 183zzzzyyyy yyxxxxxx 1110zzzz 10yyyyyy 10xxxxxx 184000uuuzz zzzzyyyy yyxxxxxx 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx 185 186This function decodes UTF-8 sequences up to 6 bytes (31 bits). 187 188On error *ptr will point to the first byte that was not correctly 189decoded. This will hopefully help in resyncing the input. 190@return UTF8_ERROR if *ptr did not point to a valid 191UTF-8 sequence, or the Unicode code point. */ 192UNIV_INLINE 193ulint 194fts_utf8_decode( 195/*============*/ 196 const byte** ptr) /*!< in/out: pointer to 197 UTF-8 string. The 198 pointer is advanced to 199 the start of the next 200 character. */ 201{ 202 const byte* p = *ptr; 203 ulint ch = *p++; 204#ifdef UNIV_DEBUG 205 ulint min_ch; 206#endif /* UNIV_DEBUG */ 207 208 if (UNIV_LIKELY(ch < 0x80)) { 209 /* 0xxxxxxx */ 210 } else if (UNIV_UNLIKELY(ch < 0xC0)) { 211 /* A continuation byte cannot start a code. */ 212 goto err_exit; 213 } else if (ch < 0xE0) { 214 /* 110yyyyy 10xxxxxx */ 215 ch &= 0x1F; 216 ut_d(min_ch = 0x80); 217 goto get1; 218 } else if (ch < 0xF0) { 219 /* 1110zzzz 10yyyyyy 10xxxxxx */ 220 ch &= 0x0F; 221 ut_d(min_ch = 0x800); 222 goto get2; 223 } else if (ch < 0xF8) { 224 /* 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx */ 225 ch &= 0x07; 226 ut_d(min_ch = 0x10000); 227 goto get3; 228 } else if (ch < 0xFC) { 229 /* 111110tt 10uuuuuu 10zzzzzz 10yyyyyy 10xxxxxx */ 230 ch &= 0x03; 231 ut_d(min_ch = 0x200000); 232 goto get4; 233 } else if (ch < 0xFE) { 234 /* 1111110s 10tttttt 10uuuuuu 10zzzzzz 10yyyyyy 10xxxxxx */ 235 ut_d(min_ch = 0x4000000); 236 if (!fts_utf8_is_valid(*p)) { 237 goto err_exit; 238 } 239 ch <<= 6; 240 ch |= (*p++) & 0x3F; 241get4: 242 if (!fts_utf8_is_valid(*p)) { 243 goto err_exit; 244 } 245 ch <<= 6; 246 ch |= (*p++) & 0x3F; 247get3: 248 if (!fts_utf8_is_valid(*p)) { 249 goto err_exit; 250 } 251 ch <<= 6; 252 ch |= (*p++) & 0x3F; 253get2: 254 if (!fts_utf8_is_valid(*p)) { 255 goto err_exit; 256 } 257 ch <<= 6; 258 ch |= (*p++) & 0x3F; 259get1: 260 if (!fts_utf8_is_valid(*p)) { 261 goto err_exit; 262 } 263 ch <<= 6; 264 ch |= (*p++) & 0x3F; 265 266 /* The following is needed in the 6-byte case 267 when ulint is wider than 32 bits. */ 268 ch &= 0xFFFFFFFF; 269 270 /* The code positions U+D800 to U+DFFF (UTF-16 surrogate pairs) 271 and U+FFFE and U+FFFF cannot occur in valid UTF-8. */ 272 273 if ( (ch >= 0xD800 && ch <= 0xDFFF) 274#ifdef UNIV_DEBUG 275 || ch < min_ch 276#endif /* UNIV_DEBUG */ 277 || ch == 0xFFFE || ch == 0xFFFF) { 278 279 ch = UTF8_ERROR; 280 } 281 } else { 282err_exit: 283 ch = UTF8_ERROR; 284 } 285 286 *ptr = p; 287 288 return(ch); 289} 290 291/******************************************************************//** 292Get the first character's code position for FTS index partition */ 293extern 294ulint 295innobase_strnxfrm( 296/*==============*/ 297 const CHARSET_INFO* cs, /*!< in: Character set */ 298 const uchar* p2, /*!< in: string */ 299 const ulint len2); /*!< in: string length */ 300 301/******************************************************************//** 302Select the FTS auxiliary index for the given character. 303@return the index to use for the string */ 304UNIV_INLINE 305ulint 306fts_select_index( 307/*=============*/ 308 const CHARSET_INFO* cs, /*!< in: Charset */ 309 const byte* str, /*!< in: string */ 310 ulint len) /*!< in: string length */ 311{ 312 ulint selected = 0; 313 ulint value = innobase_strnxfrm(cs, str, len); 314 315 while (fts_index_selector[selected].value != 0) { 316 317 if (fts_index_selector[selected].value == value) { 318 319 return(selected); 320 321 } else if (fts_index_selector[selected].value > value) { 322 323 return(selected > 0 ? selected - 1 : 0); 324 } 325 326 ++selected; 327 } 328 329 ut_ad(selected > 1); 330 331 return(selected - 1); 332} 333 334/******************************************************************//** 335Select the next FTS auxiliary index for the given character. 336@return the next index to use for character */ 337UNIV_INLINE 338ulint 339fts_select_next_index( 340/*==================*/ 341 const CHARSET_INFO* cs, /*!< in: Charset */ 342 const byte* str, /*!< in: string */ 343 ulint len) /*!< in: string length */ 344{ 345 ulint selected = 0; 346 ulint value = innobase_strnxfrm(cs, str, len); 347 348 while (fts_index_selector[selected].value != 0) { 349 350 if (fts_index_selector[selected].value == value) { 351 352 return(selected + 1); 353 354 } else if (fts_index_selector[selected].value > value) { 355 356 return(selected); 357 } 358 359 ++selected; 360 } 361 362 ut_ad(selected > 0); 363 364 return((ulint) selected); 365} 366 367/******************************************************************//** 368Return the selected FTS aux index suffix. */ 369UNIV_INLINE 370const char* 371fts_get_suffix( 372/*===========*/ 373 ulint selected) /*!< in: selected index */ 374{ 375 return(fts_index_selector[selected].suffix); 376} 377 378/******************************************************************//** 379Get the number of index selectors. 380@return The number of selectors */ 381UNIV_INLINE 382ulint 383fts_get_n_selectors(void) 384/*=====================*/ 385{ 386 ulint i = 0; 387 388 // FIXME: This is a hack 389 while (fts_index_selector[i].value != 0) { 390 ++i; 391 } 392 393 return(i); 394} 395 396#endif /* INNOBASE_FTS0TYPES_IC */ 397