1/*****************************************************************************
2
3Copyright (c) 2007, 2011, Oracle and/or its affiliates. All Rights Reserved.
4
5This program is free software; you can redistribute it and/or modify
6it under the terms of the GNU General Public License, version 2.0,
7as published by the Free Software Foundation.
8
9This program is also distributed with certain software (including
10but not limited to OpenSSL) that is licensed under separate terms,
11as designated in a particular file or component or in included license
12documentation.  The authors of MySQL hereby grant you an additional
13permission to link the program and your derivative works with the
14separately licensed software that they have included with MySQL.
15
16This program is distributed in the hope that it will be useful,
17but WITHOUT ANY WARRANTY; without even the implied warranty of
18MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19GNU General Public License, version 2.0, for more details.
20
21You should have received a copy of the GNU General Public License along with
22this program; if not, write to the Free Software Foundation, Inc.,
2351 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
24
25*****************************************************************************/
26
27/******************************************************************//**
28@file include/fts0types.ic
29Full text search types.
30
31Created 2007-03-27 Sunny Bains
32*******************************************************/
33
34#ifndef INNOBASE_FTS0TYPES_IC
35#define INNOBASE_FTS0TYPES_IC
36
37#include <ctype.h>
38
39#include "rem0cmp.h"
40#include "ha_prototypes.h"
41
42extern const ulint UTF8_ERROR;
43
/* Check whether byte b has the UTF-8 continuation-byte form 10xxxxxx:
flipping bit 7 must leave both of the two top bits clear. */
#define fts_utf8_is_valid(b) ((((b) ^ 0x80) & 0xC0) == 0)
46
47/******************************************************************//**
48Duplicate an UTF-8 string.
49@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
50UNIV_INLINE
51void
52fts_utf8_string_dup(
53/*================*/
54	fts_string_t*		dst,		/*!< in: dup to here */
55	const fts_string_t*	src,		/*!< in: src string */
56	mem_heap_t*		heap)		/*!< in: heap to use */
57{
58	dst->f_str = (byte*)mem_heap_alloc(heap, src->f_len + 1);
59	memcpy(dst->f_str, src->f_str, src->f_len);
60
61	dst->f_len = src->f_len;
62	dst->f_str[src->f_len] = 0;
63	dst->f_n_char = src->f_n_char;
64}
65
66/******************************************************************//**
67Compare two fts_trx_row_t doc_ids.
68@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
69UNIV_INLINE
70int
71fts_trx_row_doc_id_cmp(
72/*===================*/
73	const void*	p1,			/*!< in: id1 */
74	const void*	p2)			/*!< in: id2 */
75{
76	const fts_trx_row_t*	tr1 = (const fts_trx_row_t*) p1;
77	const fts_trx_row_t*	tr2 = (const fts_trx_row_t*) p2;
78
79	return((int)(tr1->doc_id - tr2->doc_id));
80}
81
82/******************************************************************//**
83Compare two fts_ranking_t doc_ids.
84@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
85UNIV_INLINE
86int
87fts_ranking_doc_id_cmp(
88/*===================*/
89	const void*	p1,			/*!< in: id1 */
90	const void*	p2)			/*!< in: id2 */
91{
92	const fts_ranking_t*	rk1 = (const fts_ranking_t*) p1;
93	const fts_ranking_t*	rk2 = (const fts_ranking_t*) p2;
94
95	return((int)(rk1->doc_id - rk2->doc_id));
96}
97
98/******************************************************************//**
99Compare two fts_update_t doc_ids.
100@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
101UNIV_INLINE
102int
103fts_update_doc_id_cmp(
104/*==================*/
105	const void*	p1,			/*!< in: id1 */
106	const void*	p2)			/*!< in: id2 */
107{
108	const fts_update_t*	up1 = (const fts_update_t*) p1;
109	const fts_update_t*	up2 = (const fts_update_t*) p2;
110
111	return((int)(up1->doc_id - up2->doc_id));
112}
113
114
115/******************************************************************//**
116Lowercase an UTF-8 string. */
117UNIV_INLINE
118void
119fts_utf8_tolower(
120/*=============*/
121	fts_string_t*	str)			/*!< in: string */
122{
123	innobase_casedn_str((char*) str->f_str);
124}
125
126/******************************************************************//**
127Compare two UTF-8 strings.
128@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
129UNIV_INLINE
130int
131fts_utf8_string_cmp(
132/*================*/
133	const void*	p1,			/*!< in: key */
134	const void*	p2)			/*!< in: node */
135{
136	const fts_string_t* s1 = (const fts_string_t*) p1;
137	const fts_string_t* s2 = (const fts_string_t*) p2;
138
139	return(cmp_data_data_slow_varchar(
140		s1->f_str, s1->f_len, s2->f_str, s2->f_len));
141}
142
143/******************************************************************//**
144Compare two UTF-8 strings, and return match (0) if
145passed in "key" value equals or is the prefix of the "node" value.
146@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
147UNIV_INLINE
148int
149fts_utf8_string_cmp_prefix(
150/*=======================*/
151	const void*	p1,			/*!< in: key */
152	const void*	p2)			/*!< in: node */
153{
154	int	result;
155	ulint	len;
156
157	const fts_string_t* s1 = (const fts_string_t*) p1;
158	const fts_string_t* s2 = (const fts_string_t*) p2;
159
160	len = ut_min(s1->f_len, s2->f_len);
161
162	result = cmp_data_data_slow_varchar(s1->f_str, len, s2->f_str, len);
163
164	if (result) {
165		return(result);
166	}
167
168	if (s1->f_len > s2->f_len) {
169		return(1);
170	}
171
172	return(0);
173}
174
175/******************************************************************//**
176Decode a UTF-8 character.
177
178http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf:
179
180 Scalar Value              1st Byte 2nd Byte 3rd Byte 4th Byte
18100000000 0xxxxxxx          0xxxxxxx
18200000yyy yyxxxxxx          110yyyyy 10xxxxxx
183zzzzyyyy yyxxxxxx          1110zzzz 10yyyyyy 10xxxxxx
184000uuuzz zzzzyyyy yyxxxxxx 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx
185
186This function decodes UTF-8 sequences up to 6 bytes (31 bits).
187
188On error *ptr will point to the first byte that was not correctly
189decoded. This will hopefully help in resyncing the input.
190@return UTF8_ERROR if *ptr did not point to a valid
191UTF-8 sequence, or the Unicode code point. */
192UNIV_INLINE
193ulint
194fts_utf8_decode(
195/*============*/
196	const byte**	ptr)			/*!< in/out: pointer to
197						UTF-8 string. The
198						pointer is advanced to
199						the start of the next
200						character. */
201{
202	const byte*	p = *ptr;
203	ulint		ch = *p++;
204#ifdef UNIV_DEBUG
205	ulint		min_ch;
206#endif /* UNIV_DEBUG */
207
208	if (UNIV_LIKELY(ch < 0x80)) {
209		/* 0xxxxxxx */
210	} else if (UNIV_UNLIKELY(ch < 0xC0)) {
211		/* A continuation byte cannot start a code. */
212		goto err_exit;
213	} else if (ch < 0xE0) {
214		/* 110yyyyy 10xxxxxx */
215		ch &= 0x1F;
216		ut_d(min_ch = 0x80);
217		goto get1;
218	} else if (ch < 0xF0) {
219		/* 1110zzzz 10yyyyyy 10xxxxxx */
220		ch &= 0x0F;
221		ut_d(min_ch = 0x800);
222		goto get2;
223	} else if (ch < 0xF8) {
224		/* 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx */
225		ch &= 0x07;
226		ut_d(min_ch = 0x10000);
227		goto get3;
228	} else if (ch < 0xFC) {
229		/* 111110tt 10uuuuuu 10zzzzzz 10yyyyyy 10xxxxxx */
230		ch &= 0x03;
231		ut_d(min_ch = 0x200000);
232		goto get4;
233	} else if (ch < 0xFE) {
234		/* 1111110s 10tttttt 10uuuuuu 10zzzzzz 10yyyyyy 10xxxxxx */
235		ut_d(min_ch = 0x4000000);
236		if (!fts_utf8_is_valid(*p)) {
237			goto err_exit;
238		}
239		ch <<= 6;
240		ch |= (*p++) & 0x3F;
241get4:
242		if (!fts_utf8_is_valid(*p)) {
243			goto err_exit;
244		}
245		ch <<= 6;
246		ch |= (*p++) & 0x3F;
247get3:
248		if (!fts_utf8_is_valid(*p)) {
249			goto err_exit;
250		}
251		ch <<= 6;
252		ch |= (*p++) & 0x3F;
253get2:
254		if (!fts_utf8_is_valid(*p)) {
255			goto err_exit;
256		}
257		ch <<= 6;
258		ch |= (*p++) & 0x3F;
259get1:
260		if (!fts_utf8_is_valid(*p)) {
261			goto err_exit;
262		}
263		ch <<= 6;
264		ch |= (*p++) & 0x3F;
265
266		/* The following is needed in the 6-byte case
267		when ulint is wider than 32 bits. */
268		ch &= 0xFFFFFFFF;
269
270		/* The code positions U+D800 to U+DFFF (UTF-16 surrogate pairs)
271		and U+FFFE and U+FFFF cannot occur in valid UTF-8. */
272
273		if ( (ch >= 0xD800 && ch <= 0xDFFF)
274#ifdef UNIV_DEBUG
275		     || ch < min_ch
276#endif /* UNIV_DEBUG */
277		     || ch == 0xFFFE || ch == 0xFFFF) {
278
279			ch = UTF8_ERROR;
280		}
281	} else {
282err_exit:
283		ch = UTF8_ERROR;
284	}
285
286	*ptr = p;
287
288	return(ch);
289}
290
291/******************************************************************//**
292Get the first character's code position for FTS index partition */
293extern
294ulint
295innobase_strnxfrm(
296/*==============*/
297        const CHARSET_INFO*	cs,	/*!< in: Character set */
298        const uchar*		p2,	/*!< in: string */
299        const ulint		len2);	/*!< in: string length */
300
301/******************************************************************//**
302Select the FTS auxiliary index for the given character.
303@return the index to use for the string */
304UNIV_INLINE
305ulint
306fts_select_index(
307/*=============*/
308	const CHARSET_INFO*	cs,	/*!< in: Charset */
309	const byte*		str,	/*!< in: string */
310	ulint			len)	/*!< in: string length */
311{
312	ulint			selected = 0;
313	ulint			value = innobase_strnxfrm(cs, str, len);
314
315	while (fts_index_selector[selected].value != 0) {
316
317		if (fts_index_selector[selected].value == value) {
318
319			return(selected);
320
321		} else if (fts_index_selector[selected].value > value) {
322
323			return(selected > 0 ? selected - 1 : 0);
324		}
325
326		++selected;
327	}
328
329	ut_ad(selected > 1);
330
331	return(selected - 1);
332}
333
334/******************************************************************//**
335Select the next FTS auxiliary index for the given character.
336@return the next index to use for character */
337UNIV_INLINE
338ulint
339fts_select_next_index(
340/*==================*/
341	const CHARSET_INFO*	cs,	/*!< in: Charset */
342	const byte*		str,	/*!< in: string */
343	ulint			len)	/*!< in: string length */
344{
345	ulint		selected = 0;
346	ulint		value = innobase_strnxfrm(cs, str, len);
347
348	while (fts_index_selector[selected].value != 0) {
349
350		if (fts_index_selector[selected].value == value) {
351
352			return(selected + 1);
353
354		} else if (fts_index_selector[selected].value > value) {
355
356			return(selected);
357		}
358
359		++selected;
360	}
361
362	ut_ad(selected > 0);
363
364	return((ulint) selected);
365}
366
367/******************************************************************//**
368Return the selected FTS aux index suffix. */
369UNIV_INLINE
370const char*
371fts_get_suffix(
372/*===========*/
373	ulint		selected)	/*!< in: selected index */
374{
375	return(fts_index_selector[selected].suffix);
376}
377
378/******************************************************************//**
379Get the number of index selectors.
380@return The number of selectors */
381UNIV_INLINE
382ulint
383fts_get_n_selectors(void)
384/*=====================*/
385{
386	ulint	i = 0;
387
388	// FIXME: This is a hack
389	while (fts_index_selector[i].value != 0) {
390		++i;
391	}
392
393	return(i);
394}
395
396#endif /* INNOBASE_FTS0TYPES_IC */
397