1 /* Copyright (c) 2001, 2021, Oracle and/or its affiliates.
2 
3    This program is free software; you can redistribute it and/or modify
4    it under the terms of the GNU General Public License, version 2.0,
5    as published by the Free Software Foundation.
6 
7    This program is also distributed with certain software (including
8    but not limited to OpenSSL) that is licensed under separate terms,
9    as designated in a particular file or component or in included license
10    documentation.  The authors of MySQL hereby grant you an additional
11    permission to link the program and your derivative works with the
12    separately licensed software that they have included with MySQL.
13 
14    This program is distributed in the hope that it will be useful,
15    but WITHOUT ANY WARRANTY; without even the implied warranty of
16    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17    GNU General Public License, version 2.0, for more details.
18 
19    You should have received a copy of the GNU General Public License
20    along with this program; if not, write to the Free Software
21    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
22 
23 /* Written by Sergei A. Golubchik, who has a shared copyright to this code */
24 
25 #define FT_CORE
26 #include "ftdefs.h"
27 #include "my_base.h" /* HA_KEYTYPE_FLOAT */
28 
29 /* search with natural language queries */
30 
31 typedef struct ft_doc_rec
32 {
33   my_off_t  dpos;
34   double    weight;
35 } FT_DOC;
36 
37 struct st_ft_info
38 {
39   struct _ft_vft *please;
40   MI_INFO  *info;
41   int       ndocs;
42   int       curdoc;
43   FT_DOC    doc[1];
44 };
45 
46 typedef struct st_all_in_one
47 {
48   MI_INFO    *info;
49   uint	      keynr;
50   const CHARSET_INFO *charset;
51   uchar      *keybuff;
52   TREE	      dtree;
53 } ALL_IN_ONE;
54 
55 typedef struct st_ft_superdoc
56 {
57     FT_DOC   doc;
58     FT_WORD *word_ptr;
59     double   tmp_weight;
60 } FT_SUPERDOC;
61 
FT_SUPERDOC_cmp(void * cmp_arg MY_ATTRIBUTE ((unused)),FT_SUPERDOC * p1,FT_SUPERDOC * p2)62 static int FT_SUPERDOC_cmp(void* cmp_arg MY_ATTRIBUTE((unused)),
63 			   FT_SUPERDOC *p1, FT_SUPERDOC *p2)
64 {
65   if (p1->doc.dpos < p2->doc.dpos)
66     return -1;
67   if (p1->doc.dpos == p2->doc.dpos)
68     return 0;
69   return 1;
70 }
71 
walk_and_match(FT_WORD * word,uint32 count,ALL_IN_ONE * aio)72 static int walk_and_match(FT_WORD *word, uint32 count, ALL_IN_ONE *aio)
73 {
74   int	       subkeys= 0, r;
75   uint	       keylen, doc_cnt;
76   FT_SUPERDOC  sdoc, *sptr;
77   TREE_ELEMENT *selem;
78   double       gweight=1;
79   MI_INFO      *info=aio->info;
80   MYISAM_SHARE *share= info->s;
81   uchar        *keybuff=aio->keybuff;
82   MI_KEYDEF    *keyinfo=info->s->keyinfo+aio->keynr;
83   my_off_t     key_root;
84   uint         extra= HA_FT_WLEN + info->s->rec_reflength;
85 #if HA_FT_WTYPE == HA_KEYTYPE_FLOAT
86   float tmp_weight;
87 #else
88 #error
89 #endif
90 
91   DBUG_ENTER("walk_and_match");
92 
93   word->weight=LWS_FOR_QUERY;
94 
95   keylen=_ft_make_key(info,aio->keynr,keybuff,word,0);
96   keylen-=HA_FT_WLEN;
97   doc_cnt=0;
98 
99   if (share->concurrent_insert)
100     mysql_rwlock_rdlock(&share->key_root_lock[aio->keynr]);
101 
102   key_root= share->state.key_root[aio->keynr];
103 
104   /* Skip rows inserted by current inserted */
105   for (r=_mi_search(info, keyinfo, keybuff, keylen, SEARCH_FIND, key_root) ;
106        !r &&
107          (subkeys=ft_sintXkorr(info->lastkey+info->lastkey_length-extra)) > 0 &&
108          info->lastpos >= info->state->data_file_length ;
109        r= _mi_search_next(info, keyinfo, info->lastkey,
110                           info->lastkey_length, SEARCH_BIGGER, key_root))
111     ;
112 
113   if (share->concurrent_insert)
114     mysql_rwlock_unlock(&share->key_root_lock[aio->keynr]);
115 
116   info->update|= HA_STATE_AKTIV;              /* for _mi_test_if_changed() */
117 
118   /* The following should be safe, even if we compare doubles */
119   while (!r && gweight)
120   {
121 
122     if (keylen &&
123         ha_compare_text(aio->charset,info->lastkey+1,
124                         info->lastkey_length-extra-1, keybuff+1,keylen-1,0,0))
125      break;
126 
127     if (subkeys<0)
128     {
129       if (doc_cnt)
130         DBUG_RETURN(1); /* index is corrupted */
131       /*
132         TODO here: unsafe optimization, should this word
133         be skipped (based on subkeys) ?
134       */
135       keybuff+=keylen;
136       keyinfo=& info->s->ft2_keyinfo;
137       key_root=info->lastpos;
138       keylen=0;
139       if (share->concurrent_insert)
140         mysql_rwlock_rdlock(&share->key_root_lock[aio->keynr]);
141       r=_mi_search_first(info, keyinfo, key_root);
142       goto do_skip;
143     }
144 #if HA_FT_WTYPE == HA_KEYTYPE_FLOAT
145     ft_floatXget(tmp_weight, info->lastkey+info->lastkey_length-extra);
146 #else
147 #error
148 #endif
149   /* The following should be safe, even if we compare doubles */
150     if (tmp_weight==0)
151       DBUG_RETURN(doc_cnt); /* stopword, doc_cnt should be 0 */
152 
153     sdoc.doc.dpos=info->lastpos;
154 
155     /* saving document matched into dtree */
156     if (!(selem=tree_insert(&aio->dtree, &sdoc, 0, aio->dtree.custom_arg)))
157       DBUG_RETURN(1);
158 
159     sptr=(FT_SUPERDOC *)ELEMENT_KEY((&aio->dtree), selem);
160 
161     if (selem->count==1) /* document's first match */
162       sptr->doc.weight=0;
163     else
164       sptr->doc.weight+=sptr->tmp_weight*sptr->word_ptr->weight;
165 
166     sptr->word_ptr=word;
167     sptr->tmp_weight=tmp_weight;
168 
169     doc_cnt++;
170 
171     gweight=word->weight*GWS_IN_USE;
172     if (gweight < 0 || doc_cnt > 2000000)
173       gweight=0;
174 
175     if (share->concurrent_insert)
176       mysql_rwlock_rdlock(&share->key_root_lock[aio->keynr]);
177 
178     if (_mi_test_if_changed(info) == 0)
179 	r=_mi_search_next(info, keyinfo, info->lastkey, info->lastkey_length,
180                           SEARCH_BIGGER, key_root);
181     else
182 	r=_mi_search(info, keyinfo, info->lastkey, info->lastkey_length,
183                      SEARCH_BIGGER, key_root);
184 do_skip:
185     while ((subkeys=ft_sintXkorr(info->lastkey+info->lastkey_length-extra)) > 0 &&
186            !r && info->lastpos >= info->state->data_file_length)
187       r= _mi_search_next(info, keyinfo, info->lastkey, info->lastkey_length,
188                          SEARCH_BIGGER, key_root);
189 
190     if (share->concurrent_insert)
191       mysql_rwlock_unlock(&share->key_root_lock[aio->keynr]);
192   }
193   word->weight=gweight;
194 
195   DBUG_RETURN(0);
196 }
197 
198 
walk_and_copy(FT_SUPERDOC * from,uint32 count MY_ATTRIBUTE ((unused)),FT_DOC ** to)199 static int walk_and_copy(FT_SUPERDOC *from,
200 			 uint32 count MY_ATTRIBUTE((unused)), FT_DOC **to)
201 {
202   DBUG_ENTER("walk_and_copy");
203   from->doc.weight+=from->tmp_weight*from->word_ptr->weight;
204   (*to)->dpos=from->doc.dpos;
205   (*to)->weight=from->doc.weight;
206   (*to)++;
207   DBUG_RETURN(0);
208 }
209 
walk_and_push(FT_SUPERDOC * from,uint32 count MY_ATTRIBUTE ((unused)),QUEUE * best)210 static int walk_and_push(FT_SUPERDOC *from,
211 			 uint32 count MY_ATTRIBUTE((unused)), QUEUE *best)
212 {
213   DBUG_ENTER("walk_and_copy");
214   from->doc.weight+=from->tmp_weight*from->word_ptr->weight;
215   set_if_smaller(best->elements, ft_query_expansion_limit-1);
216   queue_insert(best, (uchar *)& from->doc);
217   DBUG_RETURN(0);
218 }
219 
220 
FT_DOC_cmp(void * unused MY_ATTRIBUTE ((unused)),FT_DOC * a,FT_DOC * b)221 static int FT_DOC_cmp(void *unused MY_ATTRIBUTE((unused)),
222                       FT_DOC *a, FT_DOC *b)
223 {
224   double c= b->weight - a->weight;
225   return ((c < 0) ? -1 : (c > 0) ? 1 : 0);
226 }
227 
228 
ft_init_nlq_search(MI_INFO * info,uint keynr,uchar * query,uint query_len,uint flags,uchar * record)229 FT_INFO *ft_init_nlq_search(MI_INFO *info, uint keynr, uchar *query,
230 			    uint query_len, uint flags, uchar *record)
231 {
232   TREE	      wtree;
233   ALL_IN_ONE  aio;
234   FT_DOC     *dptr;
235   FT_INFO    *dlist=NULL;
236   my_off_t    saved_lastpos=info->lastpos;
237   struct st_mysql_ftparser *parser;
238   MYSQL_FTPARSER_PARAM *ftparser_param;
239   DBUG_ENTER("ft_init_nlq_search");
240 
241 /* black magic ON */
242   if ((int) (keynr = _mi_check_index(info,keynr)) < 0)
243     DBUG_RETURN(NULL);
244   if (_mi_readinfo(info,F_RDLCK,1))
245     DBUG_RETURN(NULL);
246 /* black magic OFF */
247 
248   aio.info=info;
249   aio.keynr=keynr;
250   aio.charset=info->s->keyinfo[keynr].seg->charset;
251   aio.keybuff=info->lastkey+info->s->base.max_key_length;
252   parser= info->s->keyinfo[keynr].parser;
253   if (! (ftparser_param= ftparser_call_initializer(info, keynr, 0)))
254     goto err;
255 
256   memset(&wtree, 0, sizeof(wtree));
257 
258   init_tree(&aio.dtree,0,0,sizeof(FT_SUPERDOC),(qsort_cmp2)&FT_SUPERDOC_cmp,0,
259             NULL, NULL);
260 
261   ft_parse_init(&wtree, aio.charset);
262   ftparser_param->flags= 0;
263   if (ft_parse(&wtree, query, query_len, parser, ftparser_param,
264                &wtree.mem_root))
265     goto err;
266 
267   if (tree_walk(&wtree, (tree_walk_action)&walk_and_match, &aio,
268 		left_root_right))
269     goto err;
270 
271   if (flags & FT_EXPAND && ft_query_expansion_limit)
272   {
273     QUEUE best;
274     init_queue(&best,ft_query_expansion_limit,0,0, (queue_compare) &FT_DOC_cmp,
275 	       0);
276     tree_walk(&aio.dtree, (tree_walk_action) &walk_and_push,
277               &best, left_root_right);
278     while (best.elements)
279     {
280       my_off_t docid=((FT_DOC *)queue_remove(& best, 0))->dpos;
281       if (!(*info->read_record)(info,docid,record))
282       {
283         info->update|= HA_STATE_AKTIV;
284         ftparser_param->flags= MYSQL_FTFLAGS_NEED_COPY;
285         if (unlikely(_mi_ft_parse(&wtree, info, keynr, record, ftparser_param,
286                                   &wtree.mem_root)))
287         {
288           delete_queue(&best);
289           goto err;
290         }
291       }
292     }
293     delete_queue(&best);
294     reset_tree(&aio.dtree);
295     if (tree_walk(&wtree, (tree_walk_action)&walk_and_match, &aio,
296                   left_root_right))
297       goto err;
298 
299   }
300 
301   /*
302     If ndocs == 0, this will not allocate RAM for FT_INFO.doc[],
303     so if ndocs == 0, FT_INFO.doc[] must not be accessed.
304    */
305   dlist=(FT_INFO *)my_malloc(mi_key_memory_FT_INFO,
306                              sizeof(FT_INFO)+
307 			     sizeof(FT_DOC)*
308 			     (int)(aio.dtree.elements_in_tree-1),
309 			     MYF(0));
310   if (!dlist)
311     goto err;
312 
313   dlist->please= (struct _ft_vft *) & _ft_vft_nlq;
314   dlist->ndocs=aio.dtree.elements_in_tree;
315   dlist->curdoc=-1;
316   dlist->info=aio.info;
317   dptr=dlist->doc;
318 
319   tree_walk(&aio.dtree, (tree_walk_action) &walk_and_copy,
320 	    &dptr, left_root_right);
321 
322   if (flags & FT_SORTED)
323     my_qsort2(dlist->doc, dlist->ndocs, sizeof(FT_DOC), (qsort2_cmp)&FT_DOC_cmp,
324               0);
325 
326 err:
327   delete_tree(&aio.dtree);
328   delete_tree(&wtree);
329   info->lastpos=saved_lastpos;
330   DBUG_RETURN(dlist);
331 }
332 
333 
ft_nlq_read_next(FT_INFO * handler,char * record)334 int ft_nlq_read_next(FT_INFO *handler, char *record)
335 {
336   MI_INFO *info= (MI_INFO *) handler->info;
337 
338   if (++handler->curdoc >= handler->ndocs)
339   {
340     --handler->curdoc;
341     return HA_ERR_END_OF_FILE;
342   }
343 
344   info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED);
345 
346   info->lastpos=handler->doc[handler->curdoc].dpos;
347   if (!(*info->read_record)(info,info->lastpos,(uchar*) record))
348   {
349     info->update|= HA_STATE_AKTIV;		/* Record is read */
350     return 0;
351   }
352   return my_errno();
353 }
354 
355 
ft_nlq_find_relevance(FT_INFO * handler,uchar * record MY_ATTRIBUTE ((unused)),uint length MY_ATTRIBUTE ((unused)))356 float ft_nlq_find_relevance(FT_INFO *handler,
357 			    uchar *record MY_ATTRIBUTE((unused)),
358 			    uint length MY_ATTRIBUTE((unused)))
359 {
360   int a,b,c;
361   FT_DOC  *docs=handler->doc;
362   my_off_t docid=handler->info->lastpos;
363 
364   if (docid == HA_POS_ERROR)
365     return -5.0;
366 
367   /* Assuming docs[] is sorted by dpos... */
368 
369   for (a=0, b=handler->ndocs, c=(a+b)/2; b-a>1; c=(a+b)/2)
370   {
371     if (docs[c].dpos > docid)
372       b=c;
373     else
374       a=c;
375   }
376   /* bounds check to avoid accessing unallocated handler->doc  */
377   if (a < handler->ndocs && docs[a].dpos == docid)
378     return (float) docs[a].weight;
379   else
380     return 0.0;
381 }
382 
383 
ft_nlq_close_search(FT_INFO * handler)384 void ft_nlq_close_search(FT_INFO *handler)
385 {
386   my_free(handler);
387 }
388 
389 
ft_nlq_get_relevance(FT_INFO * handler)390 float ft_nlq_get_relevance(FT_INFO *handler)
391 {
392   return (float) handler->doc[handler->curdoc].weight;
393 }
394 
395 
ft_nlq_reinit_search(FT_INFO * handler)396 void ft_nlq_reinit_search(FT_INFO *handler)
397 {
398   handler->curdoc=-1;
399 }
400 
401