1 /* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB 2 3 This program is free software; you can redistribute it and/or modify 4 it under the terms of the GNU General Public License as published by 5 the Free Software Foundation; version 2 of the License. 6 7 This program is distributed in the hope that it will be useful, 8 but WITHOUT ANY WARRANTY; without even the implied warranty of 9 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 GNU General Public License for more details. 11 12 You should have received a copy of the GNU General Public License 13 along with this program; if not, write to the Free Software 14 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ 15 16 /* Written by Sergei A. Golubchik, who has a shared copyright to this code */ 17 18 #define FT_CORE 19 #include "ma_ftdefs.h" 20 21 /* search with natural language queries */ 22 23 typedef struct ft_doc_rec 24 { 25 my_off_t dpos; 26 double weight; 27 } FT_DOC; 28 29 struct st_ft_info 30 { 31 struct _ft_vft *please; 32 MARIA_HA *info; 33 int ndocs; 34 int curdoc; 35 FT_DOC doc[1]; 36 }; 37 38 typedef struct st_all_in_one 39 { 40 MARIA_HA *info; 41 uint keynr; 42 CHARSET_INFO *charset; 43 uchar *keybuff; 44 TREE dtree; 45 } ALL_IN_ONE; 46 47 typedef struct st_ft_superdoc 48 { 49 FT_DOC doc; 50 FT_WORD *word_ptr; 51 double tmp_weight; 52 } FT_SUPERDOC; 53 54 55 static int FT_SUPERDOC_cmp(void* cmp_arg __attribute__((unused)), 56 FT_SUPERDOC *p1, FT_SUPERDOC *p2) 57 { 58 if (p1->doc.dpos < p2->doc.dpos) 59 return -1; 60 if (p1->doc.dpos == p2->doc.dpos) 61 return 0; 62 return 1; 63 } 64 65 static int walk_and_match(FT_WORD *word, uint32 count, ALL_IN_ONE *aio) 66 { 67 FT_WEIGTH subkeys; 68 int r; 69 uint doc_cnt; 70 FT_SUPERDOC sdoc, *sptr; 71 TREE_ELEMENT *selem; 72 double gweight=1; 73 MARIA_HA *info= aio->info; 74 MARIA_SHARE *share= info->s; 75 uchar *keybuff= aio->keybuff; 76 MARIA_KEYDEF *keyinfo= share->keyinfo+aio->keynr; 77 my_off_t key_root; 78 uint extra=HA_FT_WLEN+share->rec_reflength; 79 MARIA_KEY key; 80 float tmp_weight; 81 DBUG_ENTER("walk_and_match"); 82 83 word->weight=LWS_FOR_QUERY; 84 85 _ma_ft_make_key(info, &key, aio->keynr, keybuff, word, 0); 86 key.data_length-= HA_FT_WLEN; 87 doc_cnt=0; 88 subkeys.i= 0; 89 90 if (share->lock_key_trees) 91 mysql_rwlock_rdlock(&share->keyinfo[aio->keynr].root_lock); 92 93 key_root= share->state.key_root[aio->keynr]; 94 95 /* Skip rows inserted by current inserted */ 96 for (r= _ma_search(info, &key, SEARCH_FIND, key_root) ; 97 !r && 98 (subkeys.i= ft_sintXkorr(info->last_key.data + 99 info->last_key.data_length + 100 info->last_key.ref_length - extra)) > 0 && 101 info->cur_row.lastpos >= info->state->data_file_length ; 102 r= _ma_search_next(info, &info->last_key, SEARCH_BIGGER, key_root)) 103 ; 104 105 if (share->lock_key_trees) 106 mysql_rwlock_unlock(&share->keyinfo[aio->keynr].root_lock); 107 108 info->update|= HA_STATE_AKTIV; /* for _ma_test_if_changed() */ 109 110 /* The following should be safe, even if we compare doubles */ 111 while (!r && gweight) 112 { 113 if (key.data_length && 114 ha_compare_text(aio->charset, 115 info->last_key.data+1, 116 info->last_key.data_length + 117 info->last_key.ref_length - extra - 1, 118 key.data+1, key.data_length-1, 0)) 119 break; 120 121 if (subkeys.i < 0) 122 { 123 if (doc_cnt) 124 DBUG_RETURN(1); /* index is corrupted */ 125 /* 126 TODO here: unsafe optimization, should this word 127 be skipped (based on subkeys) ? 128 */ 129 keybuff+= key.data_length; 130 keyinfo= &share->ft2_keyinfo; 131 key_root= info->cur_row.lastpos; 132 key.data_length= 0; 133 if (share->lock_key_trees) 134 mysql_rwlock_rdlock(&share->keyinfo[aio->keynr].root_lock); 135 r= _ma_search_first(info, keyinfo, key_root); 136 goto do_skip; 137 } 138 /* The weight we read was actually a float */ 139 tmp_weight= subkeys.f; 140 /* The following should be safe, even if we compare doubles */ 141 if (tmp_weight==0) 142 DBUG_RETURN(doc_cnt); /* stopword, doc_cnt should be 0 */ 143 144 sdoc.doc.dpos= info->cur_row.lastpos; 145 146 /* saving document matched into dtree */ 147 if (!(selem=tree_insert(&aio->dtree, &sdoc, 0, aio->dtree.custom_arg))) 148 DBUG_RETURN(1); 149 150 sptr=(FT_SUPERDOC *)ELEMENT_KEY((&aio->dtree), selem); 151 152 if (selem->count==1) /* document's first match */ 153 sptr->doc.weight=0; 154 else 155 sptr->doc.weight+=sptr->tmp_weight*sptr->word_ptr->weight; 156 157 sptr->word_ptr=word; 158 sptr->tmp_weight=tmp_weight; 159 160 doc_cnt++; 161 162 gweight=word->weight*GWS_IN_USE; 163 if (gweight < 0 || doc_cnt > 2000000) 164 gweight=0; 165 166 if (share->lock_key_trees) 167 mysql_rwlock_rdlock(&share->keyinfo[aio->keynr].root_lock); 168 169 if (_ma_test_if_changed(info) == 0) 170 r= _ma_search_next(info, &info->last_key, SEARCH_BIGGER, key_root); 171 else 172 r= _ma_search(info, &info->last_key, SEARCH_BIGGER, key_root); 173 do_skip: 174 while ((subkeys.i= ft_sintXkorr(info->last_key.data + 175 info->last_key.data_length + 176 info->last_key.ref_length - extra)) > 0 && 177 !r && info->cur_row.lastpos >= info->state->data_file_length) 178 r= _ma_search_next(info, &info->last_key, SEARCH_BIGGER, key_root); 179 180 if (share->lock_key_trees) 181 mysql_rwlock_unlock(&share->keyinfo[aio->keynr].root_lock); 182 } 183 word->weight=gweight; 184 185 DBUG_RETURN(0); 186 } 187 188 189 static int walk_and_copy(FT_SUPERDOC *from, 190 uint32 count __attribute__((unused)), FT_DOC **to) 191 { 192 DBUG_ENTER("walk_and_copy"); 193 from->doc.weight+=from->tmp_weight*from->word_ptr->weight; 194 (*to)->dpos=from->doc.dpos; 195 (*to)->weight=from->doc.weight; 196 (*to)++; 197 DBUG_RETURN(0); 198 } 199 200 static int walk_and_push(FT_SUPERDOC *from, 201 uint32 count __attribute__((unused)), QUEUE *best) 202 { 203 DBUG_ENTER("walk_and_copy"); 204 from->doc.weight+=from->tmp_weight*from->word_ptr->weight; 205 set_if_smaller(best->elements, ft_query_expansion_limit-1); 206 queue_insert(best, (uchar *)& from->doc); 207 DBUG_RETURN(0); 208 } 209 210 211 static int FT_DOC_cmp(void *unused __attribute__((unused)), 212 FT_DOC *a, FT_DOC *b) 213 { 214 return CMP_NUM(b->weight, a->weight); 215 } 216 217 218 FT_INFO *maria_ft_init_nlq_search(MARIA_HA *info, uint keynr, uchar *query, 219 uint query_len, uint flags, uchar *record) 220 { 221 TREE wtree; 222 ALL_IN_ONE aio; 223 FT_DOC *dptr; 224 FT_INFO *dlist=NULL; 225 MARIA_RECORD_POS saved_lastpos= info->cur_row.lastpos; 226 struct st_mysql_ftparser *parser; 227 MYSQL_FTPARSER_PARAM *ftparser_param; 228 DBUG_ENTER("maria_ft_init_nlq_search"); 229 230 /* black magic ON */ 231 if ((int) (keynr = _ma_check_index(info,keynr)) < 0) 232 DBUG_RETURN(NULL); 233 if (_ma_readinfo(info,F_RDLCK,1)) 234 DBUG_RETURN(NULL); 235 /* black magic OFF */ 236 237 aio.info=info; 238 aio.keynr=keynr; 239 aio.charset=info->s->keyinfo[keynr].seg->charset; 240 aio.keybuff= info->lastkey_buff2; 241 parser= info->s->keyinfo[keynr].parser; 242 if (! (ftparser_param= maria_ftparser_call_initializer(info, keynr, 0))) 243 goto err; 244 245 bzero(&wtree,sizeof(wtree)); 246 247 init_tree(&aio.dtree,0,0,sizeof(FT_SUPERDOC),(qsort_cmp2)&FT_SUPERDOC_cmp, 248 NULL, NULL, MYF(0)); 249 250 maria_ft_parse_init(&wtree, aio.charset); 251 ftparser_param->flags= 0; 252 if (maria_ft_parse(&wtree, query, query_len, parser, ftparser_param, 253 &wtree.mem_root)) 254 goto err; 255 256 if (tree_walk(&wtree, (tree_walk_action)&walk_and_match, &aio, 257 left_root_right)) 258 goto err; 259 260 if (flags & FT_EXPAND && ft_query_expansion_limit) 261 { 262 QUEUE best; 263 init_queue(&best,ft_query_expansion_limit,0,0, (queue_compare) &FT_DOC_cmp, 264 0, 0, 0); 265 tree_walk(&aio.dtree, (tree_walk_action) &walk_and_push, 266 &best, left_root_right); 267 while (best.elements) 268 { 269 my_off_t docid= ((FT_DOC *)queue_remove_top(&best))->dpos; 270 if (!(*info->read_record)(info, record, docid)) 271 { 272 info->update|= HA_STATE_AKTIV; 273 ftparser_param->flags= MYSQL_FTFLAGS_NEED_COPY; 274 if (unlikely(_ma_ft_parse(&wtree, info, keynr, record, ftparser_param, 275 &wtree.mem_root))) 276 { 277 delete_queue(&best); 278 goto err; 279 } 280 } 281 } 282 delete_queue(&best); 283 reset_tree(&aio.dtree); 284 if (tree_walk(&wtree, (tree_walk_action)&walk_and_match, &aio, 285 left_root_right)) 286 goto err; 287 288 } 289 290 /* 291 If ndocs == 0, this will not allocate RAM for FT_INFO.doc[], 292 so if ndocs == 0, FT_INFO.doc[] must not be accessed. 293 */ 294 dlist=(FT_INFO *)my_malloc(sizeof(FT_INFO)+ 295 sizeof(FT_DOC)* 296 (int)(aio.dtree.elements_in_tree-1), 297 MYF(0)); 298 if (!dlist) 299 goto err; 300 301 dlist->please= (struct _ft_vft *) & _ma_ft_vft_nlq; 302 dlist->ndocs=aio.dtree.elements_in_tree; 303 dlist->curdoc=-1; 304 dlist->info=aio.info; 305 dptr=dlist->doc; 306 307 tree_walk(&aio.dtree, (tree_walk_action) &walk_and_copy, 308 &dptr, left_root_right); 309 310 if (flags & FT_SORTED) 311 my_qsort2(dlist->doc, dlist->ndocs, sizeof(FT_DOC), 312 (qsort2_cmp)&FT_DOC_cmp, 0); 313 314 err: 315 delete_tree(&aio.dtree, 0); 316 delete_tree(&wtree, 0); 317 info->cur_row.lastpos= saved_lastpos; 318 DBUG_RETURN(dlist); 319 } 320 321 322 int maria_ft_nlq_read_next(FT_INFO *handler, char *record) 323 { 324 MARIA_HA *info= (MARIA_HA *) handler->info; 325 326 if (++handler->curdoc >= handler->ndocs) 327 { 328 --handler->curdoc; 329 return HA_ERR_END_OF_FILE; 330 } 331 332 info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED); 333 334 info->cur_row.lastpos= handler->doc[handler->curdoc].dpos; 335 if (!(*info->read_record)(info, (uchar *) record, info->cur_row.lastpos)) 336 { 337 info->update|= HA_STATE_AKTIV; /* Record is read */ 338 return 0; 339 } 340 return my_errno; 341 } 342 343 344 float maria_ft_nlq_find_relevance(FT_INFO *handler, 345 uchar *record __attribute__((unused)), 346 uint length __attribute__((unused))) 347 { 348 int a,b,c; 349 FT_DOC *docs=handler->doc; 350 MARIA_RECORD_POS docid= handler->info->cur_row.lastpos; 351 352 if (docid == HA_POS_ERROR) 353 return -5.0; 354 355 /* Assuming docs[] is sorted by dpos... */ 356 357 for (a=0, b=handler->ndocs, c=(a+b)/2; b-a>1; c=(a+b)/2) 358 { 359 if (docs[c].dpos > docid) 360 b=c; 361 else 362 a=c; 363 } 364 /* bounds check to avoid accessing unallocated handler->doc */ 365 if (a < handler->ndocs && docs[a].dpos == docid) 366 return (float) docs[a].weight; 367 else 368 return 0.0; 369 } 370 371 372 void maria_ft_nlq_close_search(FT_INFO *handler) 373 { 374 my_free(handler); 375 } 376 377 378 float maria_ft_nlq_get_relevance(FT_INFO *handler) 379 { 380 return (float) handler->doc[handler->curdoc].weight; 381 } 382 383 384 void maria_ft_nlq_reinit_search(FT_INFO *handler) 385 { 386 handler->curdoc=-1; 387 } 388 389