1 /*****************************************************************************
2 
3 Copyright (c) 2010, 2018, Oracle and/or its affiliates. All Rights Reserved.
4 
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License, version 2.0,
7 as published by the Free Software Foundation.
8 
9 This program is also distributed with certain software (including
10 but not limited to OpenSSL) that is licensed under separate terms,
11 as designated in a particular file or component or in included license
12 documentation.  The authors of MySQL hereby grant you an additional
13 permission to link the program and your derivative works with the
14 separately licensed software that they have included with MySQL.
15 
16 This program is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19 GNU General Public License, version 2.0, for more details.
20 
21 You should have received a copy of the GNU General Public License along with
22 this program; if not, write to the Free Software Foundation, Inc.,
23 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
24 
25 *****************************************************************************/
26 
27 /**************************************************//**
28 @file include/row0ftsort.h
29 Create Full Text Index with (parallel) merge sort
30 
31 Created 10/13/2010 Jimmy Yang
32 *******************************************************/
33 
34 #ifndef row0ftsort_h
35 #define row0ftsort_h
36 
37 #include "univ.i"
38 #include "data0data.h"
39 #include "dict0types.h"
40 #include "row0mysql.h"
41 #include "fts0fts.h"
42 #include "fts0types.h"
43 #include "fts0priv.h"
44 #include "row0merge.h"
45 #include "btr0bulk.h"
46 
47 /** This structure defineds information the scan thread will fetch
48 and put to the linked list for parallel tokenization/sort threads
49 to process */
50 typedef struct fts_doc_item     fts_doc_item_t;
51 
52 /** Information about temporary files used in merge sort */
53 struct fts_doc_item {
54 	dfield_t*	field;		/*!< field contains document string */
55 	doc_id_t	doc_id;		/*!< document ID */
56 	UT_LIST_NODE_T(fts_doc_item_t)	doc_list;
57 					/*!< list of doc items */
58 };
59 
60 /** This defines the list type that scan thread would feed the parallel
61 tokenization threads and sort threads. */
62 typedef UT_LIST_BASE_NODE_T(fts_doc_item_t)     fts_doc_list_t;
63 
64 #define FTS_PLL_MERGE		1
65 
66 /** Sort information passed to each individual parallel sort thread */
67 struct fts_psort_t;
68 
69 /** Common info passed to each parallel sort thread */
70 struct fts_psort_common_t {
71 	row_merge_dup_t*	dup;		/*!< descriptor of FTS index */
72 	dict_table_t*		new_table;	/*!< source table */
73 	trx_t*			trx;		/*!< transaction */
74 	fts_psort_t*		all_info;	/*!< all parallel sort info */
75 	os_event_t		sort_event;	/*!< sort event */
76 	os_event_t		merge_event;	/*!< merge event */
77 	ibool			opt_doc_id_size;/*!< whether to use 4 bytes
78 						instead of 8 bytes integer to
79 						store Doc ID during sort, if
80 						Doc ID will not be big enough
81 						to use 8 bytes value */
82 };
83 
84 struct fts_psort_t {
85 	ulint			psort_id;	/*!< Parallel sort ID */
86 	row_merge_buf_t*	merge_buf[FTS_NUM_AUX_INDEX];
87 						/*!< sort buffer */
88 	merge_file_t*		merge_file[FTS_NUM_AUX_INDEX];
89 						/*!< sort file */
90 	row_merge_block_t*	merge_block[FTS_NUM_AUX_INDEX];
91 						/*!< buffer to write to file */
92 	row_merge_block_t*	block_alloc[FTS_NUM_AUX_INDEX];
93 						/*!< buffer to allocated */
94 	ulint			child_status;	/*!< child thread status */
95 	ulint			state;		/*!< parent thread state */
96 	fts_doc_list_t		fts_doc_list;	/*!< doc list to process */
97 	fts_psort_common_t*	psort_common;	/*!< ptr to all psort info */
98 	os_thread_id_t		thread_hdl;	/*!< thread handle */
99 	dberr_t			error;		/*!< db error during psort */
100 	ulint			memory_used;	/*!< memory used by fts_doc_list */
101 	ib_mutex_t		mutex;		/*!< mutex for fts_doc_list */
102 };
103 
104 /** Row fts token for plugin parser */
105 struct row_fts_token_t {
106 	fts_string_t*	text;		/*!< token */
107 	ulint		position;	/*!< token position in the document */
108 	UT_LIST_NODE_T(row_fts_token_t)
109 			token_list;	/*!< next token link */
110 };
111 
112 typedef UT_LIST_BASE_NODE_T(row_fts_token_t)     fts_token_list_t;
113 
114 /** Structure stores information from string tokenization operation */
115 struct fts_tokenize_ctx {
fts_tokenize_ctxfts_tokenize_ctx116 	fts_tokenize_ctx() { memset(this, 0, sizeof(*this)); }
117 
118 	ulint			processed_len;  /*!< processed string length */
119 	ulint			init_pos;       /*!< doc start position */
120 	ulint			buf_used;       /*!< the sort buffer (ID) when
121 						tokenization stops, which
122 						could due to sort buffer full */
123 	ulint			rows_added[FTS_NUM_AUX_INDEX];
124 						/*!< number of rows added for
125 						each FTS index partition */
126 	ib_rbt_t*		cached_stopword;/*!< in: stopword list */
127 	dfield_t		sort_field[FTS_NUM_FIELDS_SORT];
128 						/*!< in: sort field */
129 	fts_token_list_t	fts_token_list;
130 };
131 
132 typedef struct fts_tokenize_ctx fts_tokenize_ctx_t;
133 
134 /** Structure stores information needed for the insertion phase of FTS
135 parallel sort. */
136 struct fts_psort_insert {
137 	CHARSET_INFO*	charset;	/*!< charset info */
138 	mem_heap_t*	heap;		/*!< heap */
139 	ibool		opt_doc_id_size;/*!< Whether to use smaller (4 bytes)
140 					integer for Doc ID */
141 	BtrBulk*	btr_bulk;	/*!< Bulk load instance */
142 	dtuple_t*	tuple;		/*!< Tuple to insert */
143 
144 #ifdef UNIV_DEBUG
145 	ulint		aux_index_id;	/*!< Auxiliary index id */
146 #endif
147 };
148 
149 typedef struct fts_psort_insert	fts_psort_insert_t;
150 
151 
152 /** status bit used for communication between parent and child thread */
153 #define FTS_PARENT_COMPLETE	1
154 #define FTS_PARENT_EXITING	2
155 #define FTS_CHILD_COMPLETE	1
156 #define FTS_CHILD_EXITING	2
157 
158 /** Print some debug information */
159 #define	FTSORT_PRINT
160 
161 #ifdef	FTSORT_PRINT
162 #define	DEBUG_FTS_SORT_PRINT(str)		\
163 	do {					\
164 		ut_print_timestamp(stderr);	\
165 		fprintf(stderr, str);		\
166 	} while (0)
167 #else
168 #define DEBUG_FTS_SORT_PRINT(str)
169 #endif	/* FTSORT_PRINT */
170 
171 /*************************************************************//**
172 Create a temporary "fts sort index" used to merge sort the
173 tokenized doc string. The index has three "fields":
174 
175 1) Tokenized word,
176 2) Doc ID
177 3) Word's position in original 'doc'.
178 
179 @return dict_index_t structure for the fts sort index */
180 dict_index_t*
181 row_merge_create_fts_sort_index(
182 /*============================*/
183 	dict_index_t*		index,	/*!< in: Original FTS index
184 					based on which this sort index
185 					is created */
186 	const dict_table_t*	table,	/*!< in: table that FTS index
187 					is being created on */
188 	ibool*			opt_doc_id_size);
189 					/*!< out: whether to use 4 bytes
190 					instead of 8 bytes integer to
191 					store Doc ID during sort */
192 
193 /********************************************************************//**
194 Initialize FTS parallel sort structures.
195 @return TRUE if all successful */
196 ibool
197 row_fts_psort_info_init(
198 /*====================*/
199 	trx_t*			trx,	/*!< in: transaction */
200 	row_merge_dup_t*	dup,	/*!< in,own: descriptor of
201 					FTS index being created */
202 	const dict_table_t*	new_table,/*!< in: table where indexes are
203 					created */
204 	ibool			opt_doc_id_size,
205 					/*!< in: whether to use 4 bytes
206 					instead of 8 bytes integer to
207 					store Doc ID during sort */
208 	fts_psort_t**		psort,	/*!< out: parallel sort info to be
209 					instantiated */
210 	fts_psort_t**		merge)	/*!< out: parallel merge info
211 					to be instantiated */
212 	MY_ATTRIBUTE((nonnull));
213 /********************************************************************//**
214 Clean up and deallocate FTS parallel sort structures, and close
215 temparary merge sort files */
216 void
217 row_fts_psort_info_destroy(
218 /*=======================*/
219 	fts_psort_t*	psort_info,	/*!< parallel sort info */
220 	fts_psort_t*	merge_info);	/*!< parallel merge info */
221 /********************************************************************//**
222 Free up merge buffers when merge sort is done */
223 void
224 row_fts_free_pll_merge_buf(
225 /*=======================*/
226 	fts_psort_t*	psort_info);	/*!< in: parallel sort info */
227 
228 /*********************************************************************//**
229 Function performs parallel tokenization of the incoming doc strings.
230 @return OS_THREAD_DUMMY_RETURN */
231 os_thread_ret_t
232 fts_parallel_tokenization(
233 /*======================*/
234 	void*		arg);		/*!< in: psort_info for the thread */
235 /*********************************************************************//**
236 Start the parallel tokenization and parallel merge sort */
237 void
238 row_fts_start_psort(
239 /*================*/
240 	fts_psort_t*	psort_info);	/*!< in: parallel sort info */
241 /*********************************************************************//**
242 Function performs the merge and insertion of the sorted records.
243 @return OS_THREAD_DUMMY_RETURN */
244 os_thread_ret_t
245 fts_parallel_merge(
246 /*===============*/
247 	void*		arg);		/*!< in: parallel merge info */
248 /*********************************************************************//**
249 Kick off the parallel merge and insert thread */
250 void
251 row_fts_start_parallel_merge(
252 /*=========================*/
253 	fts_psort_t*	merge_info);	/*!< in: parallel sort info */
254 /********************************************************************//**
255 Read sorted FTS data files and insert data tuples to auxillary tables.
256 @return DB_SUCCESS or error number */
257 void
258 row_fts_insert_tuple(
259 /*=================*/
260 	fts_psort_insert_t*
261 			ins_ctx,        /*!< in: insert context */
262 	fts_tokenizer_word_t* word,	/*!< in: last processed
263 					tokenized word */
264 	ib_vector_t*	positions,	/*!< in: word position */
265 	doc_id_t*	in_doc_id,	/*!< in: last item doc id */
266 	dtuple_t*	dtuple);	/*!< in: entry to insert */
267 /********************************************************************//**
268 Propagate a newly added record up one level in the selection tree
269 @return parent where this value propagated to */
270 int
271 row_merge_fts_sel_propagate(
272 /*========================*/
273 	int		propogated,	/*<! in: tree node propagated */
274 	int*		sel_tree,	/*<! in: selection tree */
275 	ulint		level,		/*<! in: selection tree level */
276 	const mrec_t**	 mrec,		/*<! in: sort record */
277 	ulint**		offsets,	/*<! in: record offsets */
278 	dict_index_t*	index);		/*<! in: FTS index */
279 /********************************************************************//**
280 Read sorted file containing index data tuples and insert these data
281 tuples to the index
282 @return DB_SUCCESS or error number */
283 dberr_t
284 row_fts_merge_insert(
285 /*=================*/
286 	dict_index_t*	index,		/*!< in: index */
287 	dict_table_t*	table,		/*!< in: new table */
288 	fts_psort_t*	psort_info,	/*!< parallel sort info */
289 	ulint		id);		/* !< in: which auxiliary table's data
290 					to insert to */
291 #endif /* row0ftsort_h */
292