1 /* Copyright (c) 2001, 2021, Oracle and/or its affiliates.
2
3 This program is free software; you can redistribute it and/or modify
4 it under the terms of the GNU General Public License, version 2.0,
5 as published by the Free Software Foundation.
6
7 This program is also distributed with certain software (including
8 but not limited to OpenSSL) that is licensed under separate terms,
9 as designated in a particular file or component or in included license
10 documentation. The authors of MySQL hereby grant you an additional
11 permission to link the program and your derivative works with the
12 separately licensed software that they have included with MySQL.
13
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License, version 2.0, for more details.
18
19 You should have received a copy of the GNU General Public License
20 along with this program; if not, write to the Free Software
21 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
22
23 /* Written by Sergei A. Golubchik, who has a shared copyright to this code */
24
25 #define FT_CORE
26 #include "ftdefs.h"
27 #include "my_base.h" /* HA_KEYTYPE_FLOAT */
28
29 /* search with natural language queries */
30
31 typedef struct ft_doc_rec
32 {
33 my_off_t dpos;
34 double weight;
35 } FT_DOC;
36
37 struct st_ft_info
38 {
39 struct _ft_vft *please;
40 MI_INFO *info;
41 int ndocs;
42 int curdoc;
43 FT_DOC doc[1];
44 };
45
46 typedef struct st_all_in_one
47 {
48 MI_INFO *info;
49 uint keynr;
50 const CHARSET_INFO *charset;
51 uchar *keybuff;
52 TREE dtree;
53 } ALL_IN_ONE;
54
55 typedef struct st_ft_superdoc
56 {
57 FT_DOC doc;
58 FT_WORD *word_ptr;
59 double tmp_weight;
60 } FT_SUPERDOC;
61
FT_SUPERDOC_cmp(void * cmp_arg MY_ATTRIBUTE ((unused)),FT_SUPERDOC * p1,FT_SUPERDOC * p2)62 static int FT_SUPERDOC_cmp(void* cmp_arg MY_ATTRIBUTE((unused)),
63 FT_SUPERDOC *p1, FT_SUPERDOC *p2)
64 {
65 if (p1->doc.dpos < p2->doc.dpos)
66 return -1;
67 if (p1->doc.dpos == p2->doc.dpos)
68 return 0;
69 return 1;
70 }
71
walk_and_match(FT_WORD * word,uint32 count,ALL_IN_ONE * aio)72 static int walk_and_match(FT_WORD *word, uint32 count, ALL_IN_ONE *aio)
73 {
74 int subkeys= 0, r;
75 uint keylen, doc_cnt;
76 FT_SUPERDOC sdoc, *sptr;
77 TREE_ELEMENT *selem;
78 double gweight=1;
79 MI_INFO *info=aio->info;
80 MYISAM_SHARE *share= info->s;
81 uchar *keybuff=aio->keybuff;
82 MI_KEYDEF *keyinfo=info->s->keyinfo+aio->keynr;
83 my_off_t key_root;
84 uint extra= HA_FT_WLEN + info->s->rec_reflength;
85 #if HA_FT_WTYPE == HA_KEYTYPE_FLOAT
86 float tmp_weight;
87 #else
88 #error
89 #endif
90
91 DBUG_ENTER("walk_and_match");
92
93 word->weight=LWS_FOR_QUERY;
94
95 keylen=_ft_make_key(info,aio->keynr,keybuff,word,0);
96 keylen-=HA_FT_WLEN;
97 doc_cnt=0;
98
99 if (share->concurrent_insert)
100 mysql_rwlock_rdlock(&share->key_root_lock[aio->keynr]);
101
102 key_root= share->state.key_root[aio->keynr];
103
104 /* Skip rows inserted by current inserted */
105 for (r=_mi_search(info, keyinfo, keybuff, keylen, SEARCH_FIND, key_root) ;
106 !r &&
107 (subkeys=ft_sintXkorr(info->lastkey+info->lastkey_length-extra)) > 0 &&
108 info->lastpos >= info->state->data_file_length ;
109 r= _mi_search_next(info, keyinfo, info->lastkey,
110 info->lastkey_length, SEARCH_BIGGER, key_root))
111 ;
112
113 if (share->concurrent_insert)
114 mysql_rwlock_unlock(&share->key_root_lock[aio->keynr]);
115
116 info->update|= HA_STATE_AKTIV; /* for _mi_test_if_changed() */
117
118 /* The following should be safe, even if we compare doubles */
119 while (!r && gweight)
120 {
121
122 if (keylen &&
123 ha_compare_text(aio->charset,info->lastkey+1,
124 info->lastkey_length-extra-1, keybuff+1,keylen-1,0,0))
125 break;
126
127 if (subkeys<0)
128 {
129 if (doc_cnt)
130 DBUG_RETURN(1); /* index is corrupted */
131 /*
132 TODO here: unsafe optimization, should this word
133 be skipped (based on subkeys) ?
134 */
135 keybuff+=keylen;
136 keyinfo=& info->s->ft2_keyinfo;
137 key_root=info->lastpos;
138 keylen=0;
139 if (share->concurrent_insert)
140 mysql_rwlock_rdlock(&share->key_root_lock[aio->keynr]);
141 r=_mi_search_first(info, keyinfo, key_root);
142 goto do_skip;
143 }
144 #if HA_FT_WTYPE == HA_KEYTYPE_FLOAT
145 ft_floatXget(tmp_weight, info->lastkey+info->lastkey_length-extra);
146 #else
147 #error
148 #endif
149 /* The following should be safe, even if we compare doubles */
150 if (tmp_weight==0)
151 DBUG_RETURN(doc_cnt); /* stopword, doc_cnt should be 0 */
152
153 sdoc.doc.dpos=info->lastpos;
154
155 /* saving document matched into dtree */
156 if (!(selem=tree_insert(&aio->dtree, &sdoc, 0, aio->dtree.custom_arg)))
157 DBUG_RETURN(1);
158
159 sptr=(FT_SUPERDOC *)ELEMENT_KEY((&aio->dtree), selem);
160
161 if (selem->count==1) /* document's first match */
162 sptr->doc.weight=0;
163 else
164 sptr->doc.weight+=sptr->tmp_weight*sptr->word_ptr->weight;
165
166 sptr->word_ptr=word;
167 sptr->tmp_weight=tmp_weight;
168
169 doc_cnt++;
170
171 gweight=word->weight*GWS_IN_USE;
172 if (gweight < 0 || doc_cnt > 2000000)
173 gweight=0;
174
175 if (share->concurrent_insert)
176 mysql_rwlock_rdlock(&share->key_root_lock[aio->keynr]);
177
178 if (_mi_test_if_changed(info) == 0)
179 r=_mi_search_next(info, keyinfo, info->lastkey, info->lastkey_length,
180 SEARCH_BIGGER, key_root);
181 else
182 r=_mi_search(info, keyinfo, info->lastkey, info->lastkey_length,
183 SEARCH_BIGGER, key_root);
184 do_skip:
185 while ((subkeys=ft_sintXkorr(info->lastkey+info->lastkey_length-extra)) > 0 &&
186 !r && info->lastpos >= info->state->data_file_length)
187 r= _mi_search_next(info, keyinfo, info->lastkey, info->lastkey_length,
188 SEARCH_BIGGER, key_root);
189
190 if (share->concurrent_insert)
191 mysql_rwlock_unlock(&share->key_root_lock[aio->keynr]);
192 }
193 word->weight=gweight;
194
195 DBUG_RETURN(0);
196 }
197
198
walk_and_copy(FT_SUPERDOC * from,uint32 count MY_ATTRIBUTE ((unused)),FT_DOC ** to)199 static int walk_and_copy(FT_SUPERDOC *from,
200 uint32 count MY_ATTRIBUTE((unused)), FT_DOC **to)
201 {
202 DBUG_ENTER("walk_and_copy");
203 from->doc.weight+=from->tmp_weight*from->word_ptr->weight;
204 (*to)->dpos=from->doc.dpos;
205 (*to)->weight=from->doc.weight;
206 (*to)++;
207 DBUG_RETURN(0);
208 }
209
walk_and_push(FT_SUPERDOC * from,uint32 count MY_ATTRIBUTE ((unused)),QUEUE * best)210 static int walk_and_push(FT_SUPERDOC *from,
211 uint32 count MY_ATTRIBUTE((unused)), QUEUE *best)
212 {
213 DBUG_ENTER("walk_and_copy");
214 from->doc.weight+=from->tmp_weight*from->word_ptr->weight;
215 set_if_smaller(best->elements, ft_query_expansion_limit-1);
216 queue_insert(best, (uchar *)& from->doc);
217 DBUG_RETURN(0);
218 }
219
220
FT_DOC_cmp(void * unused MY_ATTRIBUTE ((unused)),FT_DOC * a,FT_DOC * b)221 static int FT_DOC_cmp(void *unused MY_ATTRIBUTE((unused)),
222 FT_DOC *a, FT_DOC *b)
223 {
224 double c= b->weight - a->weight;
225 return ((c < 0) ? -1 : (c > 0) ? 1 : 0);
226 }
227
228
ft_init_nlq_search(MI_INFO * info,uint keynr,uchar * query,uint query_len,uint flags,uchar * record)229 FT_INFO *ft_init_nlq_search(MI_INFO *info, uint keynr, uchar *query,
230 uint query_len, uint flags, uchar *record)
231 {
232 TREE wtree;
233 ALL_IN_ONE aio;
234 FT_DOC *dptr;
235 FT_INFO *dlist=NULL;
236 my_off_t saved_lastpos=info->lastpos;
237 struct st_mysql_ftparser *parser;
238 MYSQL_FTPARSER_PARAM *ftparser_param;
239 DBUG_ENTER("ft_init_nlq_search");
240
241 /* black magic ON */
242 if ((int) (keynr = _mi_check_index(info,keynr)) < 0)
243 DBUG_RETURN(NULL);
244 if (_mi_readinfo(info,F_RDLCK,1))
245 DBUG_RETURN(NULL);
246 /* black magic OFF */
247
248 aio.info=info;
249 aio.keynr=keynr;
250 aio.charset=info->s->keyinfo[keynr].seg->charset;
251 aio.keybuff=info->lastkey+info->s->base.max_key_length;
252 parser= info->s->keyinfo[keynr].parser;
253 if (! (ftparser_param= ftparser_call_initializer(info, keynr, 0)))
254 goto err;
255
256 memset(&wtree, 0, sizeof(wtree));
257
258 init_tree(&aio.dtree,0,0,sizeof(FT_SUPERDOC),(qsort_cmp2)&FT_SUPERDOC_cmp,0,
259 NULL, NULL);
260
261 ft_parse_init(&wtree, aio.charset);
262 ftparser_param->flags= 0;
263 if (ft_parse(&wtree, query, query_len, parser, ftparser_param,
264 &wtree.mem_root))
265 goto err;
266
267 if (tree_walk(&wtree, (tree_walk_action)&walk_and_match, &aio,
268 left_root_right))
269 goto err;
270
271 if (flags & FT_EXPAND && ft_query_expansion_limit)
272 {
273 QUEUE best;
274 init_queue(&best,ft_query_expansion_limit,0,0, (queue_compare) &FT_DOC_cmp,
275 0);
276 tree_walk(&aio.dtree, (tree_walk_action) &walk_and_push,
277 &best, left_root_right);
278 while (best.elements)
279 {
280 my_off_t docid=((FT_DOC *)queue_remove(& best, 0))->dpos;
281 if (!(*info->read_record)(info,docid,record))
282 {
283 info->update|= HA_STATE_AKTIV;
284 ftparser_param->flags= MYSQL_FTFLAGS_NEED_COPY;
285 if (unlikely(_mi_ft_parse(&wtree, info, keynr, record, ftparser_param,
286 &wtree.mem_root)))
287 {
288 delete_queue(&best);
289 goto err;
290 }
291 }
292 }
293 delete_queue(&best);
294 reset_tree(&aio.dtree);
295 if (tree_walk(&wtree, (tree_walk_action)&walk_and_match, &aio,
296 left_root_right))
297 goto err;
298
299 }
300
301 /*
302 If ndocs == 0, this will not allocate RAM for FT_INFO.doc[],
303 so if ndocs == 0, FT_INFO.doc[] must not be accessed.
304 */
305 dlist=(FT_INFO *)my_malloc(mi_key_memory_FT_INFO,
306 sizeof(FT_INFO)+
307 sizeof(FT_DOC)*
308 (int)(aio.dtree.elements_in_tree-1),
309 MYF(0));
310 if (!dlist)
311 goto err;
312
313 dlist->please= (struct _ft_vft *) & _ft_vft_nlq;
314 dlist->ndocs=aio.dtree.elements_in_tree;
315 dlist->curdoc=-1;
316 dlist->info=aio.info;
317 dptr=dlist->doc;
318
319 tree_walk(&aio.dtree, (tree_walk_action) &walk_and_copy,
320 &dptr, left_root_right);
321
322 if (flags & FT_SORTED)
323 my_qsort2(dlist->doc, dlist->ndocs, sizeof(FT_DOC), (qsort2_cmp)&FT_DOC_cmp,
324 0);
325
326 err:
327 delete_tree(&aio.dtree);
328 delete_tree(&wtree);
329 info->lastpos=saved_lastpos;
330 DBUG_RETURN(dlist);
331 }
332
333
ft_nlq_read_next(FT_INFO * handler,char * record)334 int ft_nlq_read_next(FT_INFO *handler, char *record)
335 {
336 MI_INFO *info= (MI_INFO *) handler->info;
337
338 if (++handler->curdoc >= handler->ndocs)
339 {
340 --handler->curdoc;
341 return HA_ERR_END_OF_FILE;
342 }
343
344 info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED);
345
346 info->lastpos=handler->doc[handler->curdoc].dpos;
347 if (!(*info->read_record)(info,info->lastpos,(uchar*) record))
348 {
349 info->update|= HA_STATE_AKTIV; /* Record is read */
350 return 0;
351 }
352 return my_errno();
353 }
354
355
ft_nlq_find_relevance(FT_INFO * handler,uchar * record MY_ATTRIBUTE ((unused)),uint length MY_ATTRIBUTE ((unused)))356 float ft_nlq_find_relevance(FT_INFO *handler,
357 uchar *record MY_ATTRIBUTE((unused)),
358 uint length MY_ATTRIBUTE((unused)))
359 {
360 int a,b,c;
361 FT_DOC *docs=handler->doc;
362 my_off_t docid=handler->info->lastpos;
363
364 if (docid == HA_POS_ERROR)
365 return -5.0;
366
367 /* Assuming docs[] is sorted by dpos... */
368
369 for (a=0, b=handler->ndocs, c=(a+b)/2; b-a>1; c=(a+b)/2)
370 {
371 if (docs[c].dpos > docid)
372 b=c;
373 else
374 a=c;
375 }
376 /* bounds check to avoid accessing unallocated handler->doc */
377 if (a < handler->ndocs && docs[a].dpos == docid)
378 return (float) docs[a].weight;
379 else
380 return 0.0;
381 }
382
383
ft_nlq_close_search(FT_INFO * handler)384 void ft_nlq_close_search(FT_INFO *handler)
385 {
386 my_free(handler);
387 }
388
389
ft_nlq_get_relevance(FT_INFO * handler)390 float ft_nlq_get_relevance(FT_INFO *handler)
391 {
392 return (float) handler->doc[handler->curdoc].weight;
393 }
394
395
ft_nlq_reinit_search(FT_INFO * handler)396 void ft_nlq_reinit_search(FT_INFO *handler)
397 {
398 handler->curdoc=-1;
399 }
400
401