1 /* Copyright (C) 2000-2015 Lavtech.com corp. All rights reserved.
2
3 This program is free software; you can redistribute it and/or modify
4 it under the terms of the GNU General Public License as published by
5 the Free Software Foundation; either version 2 of the License, or
6 (at your option) any later version.
7
8 This program is distributed in the hope that it will be useful,
9 but WITHOUT ANY WARRANTY; without even the implied warranty of
10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 GNU General Public License for more details.
12
13 You should have received a copy of the GNU General Public License
14 along with this program; if not, write to the Free Software
15 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
16 */
17
18 #include "udm_config.h"
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include <string.h>
22
23 #include "udm_common.h"
24 #include "udm_utils.h"
25 #include "udm_db.h"
26 #include "udm_log.h"
27 #include "udm_indexcache.h"
28 #include "udm_vars.h"
29 #include "udm_stopwords.h"
30 #include "udm_coords.h"
31
32 /** Words ************************************/
33
34
35 static void
UdmInvertedIndexWordListFree(UDM_INVERTED_INDEX_WORD_LIST * List)36 UdmInvertedIndexWordListFree(UDM_INVERTED_INDEX_WORD_LIST *List)
37 {
38 UdmFree(List->Item);
39 }
40
41
42 static inline udm_rc_t
UdmInvertedIndexWordListRealloc(UDM_INVERTED_INDEX_WORD_LIST * List)43 UdmInvertedIndexWordListRealloc(UDM_INVERTED_INDEX_WORD_LIST *List)
44 {
45 if (List->nitems >= List->mitems)
46 {
47 List->mitems+= 16*1024;
48 List->Item= (UDM_INVERTED_INDEX_WORD*)
49 UdmRealloc(List->Item, List->mitems * sizeof(UDM_INVERTED_INDEX_WORD));
50 if (!List->Item)
51 {
52 List->nitems= 0;
53 List->mitems= 0;
54 return UDM_ERROR;
55 }
56 }
57 return UDM_OK;
58
59 }
60
61
62 static udm_rc_t
UdmInvertedIndexWordListAdd(UDM_INVERTED_INDEX_WORD_LIST * List,const UDM_INVERTED_INDEX_WORD * Item)63 UdmInvertedIndexWordListAdd(UDM_INVERTED_INDEX_WORD_LIST *List,
64 const UDM_INVERTED_INDEX_WORD *Item)
65 {
66 if (UDM_OK != UdmInvertedIndexWordListRealloc(List))
67 return UDM_ERROR;
68 List->Item[List->nitems]= *Item;
69 List->nitems++;
70 return UDM_OK;
71 }
72
73
74 typedef struct
75 {
76 UDM_CONST_STR str;
77 udmhash32_t crc;
78 } UDM_INVERTED_INDEX_HASH_WORD_REC;
79
80
81 typedef struct
82 {
83 udmhash32_t crc; /* 4 */
84 uint4 word_id; /* 4 */
85 } UDM_INVERTED_INDEX_HASH_WORD_KEY;
86
87
88 #define WORD_RESERVE_SIZE 128
89
90 static udm_rc_t
UdmWordStore(UDM_HASH * hash,void * ofs,void * item)91 UdmWordStore(UDM_HASH *hash, void *ofs, void *item)
92 {
93 UDM_CONSTWORD_HASH_DATA *hash_data= (UDM_CONSTWORD_HASH_DATA *) hash->user_data;
94 UDM_ENV *Env= hash_data->cache->param.Env;
95 UDM_INVERTED_INDEX_HASH_WORD_KEY *dst= (UDM_INVERTED_INDEX_HASH_WORD_KEY*) ofs;
96 UDM_INVERTED_INDEX_HASH_WORD_REC *src= (UDM_INVERTED_INDEX_HASH_WORD_REC*) item;
97 UDM_INVERTED_INDEX_WORD W;
98
99 if (!(W.str= UdmMemrootAlloc(&hash_data->cache->Words.root,
100 WORD_RESERVE_SIZE + 1)))
101 return UDM_ERROR;
102 W.length= UdmConvLCase(hash_data->cache->param.unidata,
103 &hash_data->cnv,
104 hash_data->cache->param.cnvflags | UDM_RECODE_HTML_IN,
105 W.str, WORD_RESERVE_SIZE,
106 src->str.str, src->str.length);
107 W.str[W.length]= '\0';
108 hash_data->cache->Words.root.last_page_used_size-=
109 (WORD_RESERVE_SIZE - W.length);
110 /*W.count= 1;*/
111 W.last_url_id= hash_data->url_id;
112 W.last_url_id_count= 1;
113 W.crc= src->crc;
114 W.is_stopword= Env->StopWord.nitems &&
115 UdmStopListListFind(&Env->StopWord, W.str, "") ?
116 UDM_TRUE : UDM_FALSE;
117 dst->crc= src->crc;
118 dst->word_id= (uint4) hash_data->cache->Words.list.nitems;
119 if (UDM_OK != UdmInvertedIndexWordListAdd(&hash_data->cache->Words.list, &W))
120 return UDM_ERROR;
121
122 /*
123 fprintf(stderr, "%s-%s '%.*s'-'%.*s'\n",
124 hash_data->cnv.from->name,
125 hash_data->cnv.to->name,
126 (int) src->str.length, src->str.str,
127 (int) lcword_length, lcword);
128 */
129 /*memcpy(ofs, item, sizeof(UDM_INVERTED_INDEX_HASH_WORD));*/
130 return UDM_OK;
131 }
132
133
134 static udm_rc_t
UdmWordJoin(UDM_HASH * hash,void * ofs,void * item)135 UdmWordJoin(UDM_HASH *hash, void *ofs, void *item)
136 {
137 UDM_CONSTWORD_HASH_DATA *hash_data= (UDM_CONSTWORD_HASH_DATA *) hash->user_data;
138 UDM_INVERTED_INDEX_WORD_LIST *WL= &hash_data->cache->Words.list;
139 UDM_INVERTED_INDEX_HASH_WORD_KEY *W1= ((UDM_INVERTED_INDEX_HASH_WORD_KEY *) ofs);
140 UDM_INVERTED_INDEX_WORD *W= &WL->Item[W1->word_id];
141 /*W->count++;*/
142 if (W->last_url_id == hash_data->url_id)
143 {
144 W->last_url_id_count++;
145 }
146 else
147 {
148 W->last_url_id_count= 1;
149 W->last_url_id= hash_data->url_id;
150 }
151 return UDM_OK;
152 }
153
154
155 static int
UdmWordCmp(UDM_HASH * hash,void * W1,void * W2)156 UdmWordCmp(UDM_HASH *hash, void *W1, void *W2)
157 {
158 /* TODO34: add more thorough comparision */
159 return (((const UDM_INVERTED_INDEX_HASH_WORD_KEY *) W1)->crc !=
160 ((const UDM_INVERTED_INDEX_HASH_WORD_REC *) W2)->crc);
161 }
162
163
164 static udmcrc32_t
UdmWordKeyKey(UDM_HASH * hash,const void * key)165 UdmWordKeyKey(UDM_HASH *hash, const void *key)
166 {
167 return ((const UDM_INVERTED_INDEX_HASH_WORD_KEY *) key)->crc;
168 }
169
170
171 static udmcrc32_t
UdmWordRecKey(UDM_HASH * hash,const void * rec)172 UdmWordRecKey(UDM_HASH *hash, const void *rec)
173 {
174 return ((const UDM_INVERTED_INDEX_HASH_WORD_REC *) rec)->crc;
175 }
176
177
178 UDM_HASH_HANDLER word_hash_handler=
179 {
180 UdmWordStore, /* store */
181 UdmWordJoin, /* join */
182 UdmWordCmp, /* cmp */
183 UdmWordKeyKey, /* keykey */
184 UdmWordRecKey /* reckey */
185 };
186
187
188 static int
bc2cmp(UDM_INVERTED_INDEX_CACHE_ITEM * w1,UDM_INVERTED_INDEX_CACHE_ITEM * w2)189 bc2cmp(UDM_INVERTED_INDEX_CACHE_ITEM *w1, UDM_INVERTED_INDEX_CACHE_ITEM *w2)
190 {
191 int rc;
192 if ((rc= strcmp(w1->ptr, w2->ptr)))
193 return rc;
194 if ((rc= ((int) w1->secno - (int) w2->secno)))
195 return rc;
196 return w1->url_id < w2->url_id ? -1 : 1;
197 }
198
199
200 static void
UdmInvertedIndexWordDictionaryInit(UDM_INVERTED_INDEX_WORD_DICTIONARY * D)201 UdmInvertedIndexWordDictionaryInit(UDM_INVERTED_INDEX_WORD_DICTIONARY *D)
202 {
203 bzero((void*) &D->list, sizeof(UDM_INVERTED_INDEX_WORD_LIST));
204 UdmHashInit(&D->hash, &word_hash_handler, NULL, 1024, sizeof(UDM_INVERTED_INDEX_HASH_WORD_KEY));
205 UdmMemrootInit(&D->root, 1024*1024);
206 }
207
208
209 static void
UdmInvertedIndexWordDictionaryFree(UDM_INVERTED_INDEX_WORD_DICTIONARY * D)210 UdmInvertedIndexWordDictionaryFree(UDM_INVERTED_INDEX_WORD_DICTIONARY *D)
211 {
212 UdmHashFree(&D->hash);
213 UdmInvertedIndexWordListFree(&D->list);
214 UdmMemrootFree(&D->root);
215 }
216
217
218 static void
UdmInvertedIndexWordDictionaryReset(UDM_INVERTED_INDEX_WORD_DICTIONARY * D)219 UdmInvertedIndexWordDictionaryReset(UDM_INVERTED_INDEX_WORD_DICTIONARY *D)
220 {
221 UdmInvertedIndexWordDictionaryFree(D);
222 UdmInvertedIndexWordDictionaryInit(D);
223 }
224
225
226 /*****************************************************************/
227
228 void
UdmInvertedIndexCachePartSort(UDM_INVERTED_INDEX_CACHE_PART * part)229 UdmInvertedIndexCachePartSort(UDM_INVERTED_INDEX_CACHE_PART *part)
230 {
231 if (part->nitems)
232 UdmSort(part->Item, part->nitems,
233 sizeof(UDM_INVERTED_INDEX_CACHE_ITEM), (udm_qsort_cmp) bc2cmp);
234 }
235
236
237 void
UdmInvertedIndexCachePartInit(UDM_INVERTED_INDEX_CACHE_PART * part)238 UdmInvertedIndexCachePartInit(UDM_INVERTED_INDEX_CACHE_PART *part)
239 {
240 bzero(part, sizeof(*part));
241 }
242
243
244 void
UdmInvertedIndexCachePartFree(UDM_INVERTED_INDEX_CACHE_PART * part)245 UdmInvertedIndexCachePartFree(UDM_INVERTED_INDEX_CACHE_PART *part)
246 {
247 UdmFree(part->Item);
248 bzero(part, sizeof(*part));
249 }
250
251
252 static inline udm_rc_t
UdmInvertedIndexCachePartRealloc(UDM_AGENT * A,UDM_INVERTED_INDEX_CACHE_PART * part)253 UdmInvertedIndexCachePartRealloc(UDM_AGENT *A,
254 UDM_INVERTED_INDEX_CACHE_PART *part)
255 {
256 if (part->nitems >= part->mitems)
257 {
258 size_t mitems2= part->mitems + INVERTED_INDEX_CACHE_PART_SIZE;
259 size_t nbytes= mitems2 * sizeof(UDM_INVERTED_INDEX_CACHE_ITEM);
260 if (!(part->Item= (UDM_INVERTED_INDEX_CACHE_ITEM*) UdmRealloc(part->Item, nbytes)))
261 {
262 part->nitems= part->mitems= 0;
263 UdmLog(A, UDM_LOG_ERROR,
264 "UdmBlobCache2Realloc failed: %d bytes needed", (int) nbytes);
265 return UDM_ERROR;
266 }
267 part->mitems= mitems2;
268 }
269 return UDM_OK;
270 }
271
272
273 static udm_rc_t
UdmInvertedIndexCacheItemEncode(UDM_INVERTED_INDEX_CACHE_ITEM * Item,urlid_t url_id,const UDM_WORD * W,size_t ncoords)274 UdmInvertedIndexCacheItemEncode(UDM_INVERTED_INDEX_CACHE_ITEM *Item,
275 urlid_t url_id,
276 const UDM_WORD *W, size_t ncoords)
277 {
278 size_t wordlen, datalen, i, nbytes;
279 char *d, *de;
280 Item->url_id= url_id;
281 Item->secno= W->coord.secno;
282 datalen= (wordlen= strlen(W->word) + 1) + ncoords * 4 + 4;
283 if (!(Item->ptr= (char*) UdmMalloc(datalen)))
284 return UDM_ERROR;
285
286 /*TODO34-STAT: cache->nbytes+= datalen; ??? */
287
288 de= Item->ptr + datalen;
289
290 /* Store word together with '\0' */
291 memcpy(Item->ptr, W->word, wordlen);
292
293 /* Store ncoords */
294 d= Item->ptr + wordlen;
295 if (!(nbytes= udm_coord_put(ncoords,
296 (unsigned char*) d, (unsigned char*) de)))
297 return UDM_ERROR;
298 d+= nbytes;
299
300 /* Store coords */
301 for (i= 0; i < ncoords; i++)
302 {
303 nbytes= udm_coord_put(W[i].coord.pos,
304 (unsigned char*) d, (unsigned char*) de);
305 if (!nbytes)
306 continue;
307 d+= nbytes;
308 }
309 Item->length= d - Item->ptr;
310 return UDM_OK;
311 }
312
313
314 static udm_rc_t
UdmInvertedIndexCacheItemEncodeCoords(UDM_INVERTED_INDEX_CACHE_ITEM * Item,UDM_CONSTWORD_HASH_DATA * data,const UDM_INVERTED_INDEX_WORD * W2,const UDM_INVERTED_INDEX_COORD * C,size_t ncoords,udm_pos_t section_size)315 UdmInvertedIndexCacheItemEncodeCoords(UDM_INVERTED_INDEX_CACHE_ITEM *Item,
316 UDM_CONSTWORD_HASH_DATA *data,
317 const UDM_INVERTED_INDEX_WORD *W2,
318 const UDM_INVERTED_INDEX_COORD *C,
319 size_t ncoords,
320 udm_pos_t section_size)
321 {
322 size_t datalen, i, nbytes;
323 char *d, *de;
324 udm_pos_t prev;
325
326 Item->url_id= data->url_id;
327 Item->secno= C->coord.secno;
328 datalen= W2->length + 1 + (ncoords + (section_size ? 1 : 0)) * 4 + 4;
329 if (!(Item->ptr= (char*) UdmMemrootAlloc(&data->cache->coord_root, datalen)))
330 return UDM_ERROR;
331
332 de= Item->ptr + datalen;
333
334 /* store word */
335 memcpy(Item->ptr, W2->str, W2->length);
336 Item->ptr[W2->length]= '\0';
337
338 /* Store ncoords */
339 d= Item->ptr + W2->length + 1;
340 if (!(nbytes= udm_coord_put(ncoords + (section_size ? 1 : 0),
341 (unsigned char*) d, (unsigned char*) de)))
342 return UDM_ERROR;
343 d+= nbytes;
344
345 for (prev=0, i= 0; i < ncoords; i++)
346 {
347 nbytes= udm_coord_put(C[i].coord.pos - prev,
348 (unsigned char*) d, (unsigned char*) de);
349 prev= C[i].coord.pos;
350 if (!nbytes)
351 continue;
352 d+= nbytes;
353 }
354 if (section_size)
355 {
356 nbytes= udm_coord_put(section_size - prev,
357 (unsigned char *) d, (unsigned char *) de);
358 if (nbytes)
359 d+= nbytes;
360 }
361 Item->length= d - Item->ptr;
362 data->cache->coord_root.last_page_used_size-= (datalen - Item->length);
363 /*
364 printf("'%s' wordlen=%d datalen=%d datalen2=%d ncoords=%d\n",
365 Item->data, wordlen, datalen, Item->datalen, ncoords);
366 */
367 return UDM_OK;
368 }
369
370
371
372 static udm_rc_t
UdmInvertedIndexCachePartAdd(UDM_AGENT * A,UDM_INVERTED_INDEX_CACHE_PART * part,urlid_t url_id,const UDM_WORD * W,size_t ncoords)373 UdmInvertedIndexCachePartAdd(UDM_AGENT *A,
374 UDM_INVERTED_INDEX_CACHE_PART *part,
375 urlid_t url_id, const UDM_WORD *W, size_t ncoords)
376 {
377 if (UDM_OK != UdmInvertedIndexCachePartRealloc(A, part))
378 return UDM_ERROR;
379
380 if (UDM_OK != UdmInvertedIndexCacheItemEncode(&part->Item[part->nitems],
381 url_id, W, ncoords))
382 return UDM_ERROR;
383 part->nitems++;
384 return UDM_OK;
385 }
386
387
388 udm_rc_t
UdmInvertedIndexCacheAdd(UDM_AGENT * A,UDM_CONSTWORD_HASH_DATA * data,UDM_INVERTED_INDEX_CACHE * cache,const UDM_WORD * W,size_t ncoords)389 UdmInvertedIndexCacheAdd(UDM_AGENT *A,
390 UDM_CONSTWORD_HASH_DATA *data,
391 UDM_INVERTED_INDEX_CACHE *cache,
392 const UDM_WORD *W, size_t ncoords)
393 {
394 /* TODO34: Turkish */
395 size_t wlen= strlen(W->word);
396 size_t crc= data->cnv.from->cset->crc32lcase(A->Conf->unidata,
397 data->cnv.from,
398 W->word, wlen,
399 UDM_RECODE_HTML);
400 size_t n= crc % cache->nitems;
401 return UdmInvertedIndexCachePartAdd(A, &cache->Item[n], data->url_id, W, ncoords);
402 }
403
404
405 static udm_rc_t
UdmInvertedIndexCachePartAddCoords(UDM_AGENT * A,UDM_CONSTWORD_HASH_DATA * data,UDM_INVERTED_INDEX_CACHE_PART * part,UDM_INVERTED_INDEX_WORD * W2,const UDM_INVERTED_INDEX_COORD * C,size_t ncoords,udm_pos_t section_size)406 UdmInvertedIndexCachePartAddCoords(UDM_AGENT *A,
407 UDM_CONSTWORD_HASH_DATA *data,
408 UDM_INVERTED_INDEX_CACHE_PART *part,
409 UDM_INVERTED_INDEX_WORD *W2,
410 const UDM_INVERTED_INDEX_COORD *C,
411 size_t ncoords,
412 udm_pos_t section_size)
413 {
414 UDM_INVERTED_INDEX_CACHE_ITEM *Item;
415 udm_rc_t rc;
416 size_t ncoords_to_store;
417
418 if (W2->length > A->Conf->WordParam.max_word_len ||
419 W2->length < A->Conf->WordParam.min_word_len)
420 return UDM_OK;
421
422 /* TODO34-STAT */
423 if (UDM_OK != UdmInvertedIndexCachePartRealloc(A, part))
424 return UDM_ERROR;
425
426 Item= &part->Item[part->nitems];
427
428 ncoords_to_store= (data->cache->param.pair_limit &&
429 ncoords > data->cache->param.pair_limit) ? 1 : ncoords;
430 rc= UdmInvertedIndexCacheItemEncodeCoords(Item, data, W2,
431 C, ncoords_to_store,
432 section_size);
433 if (rc== UDM_OK)
434 part->nitems++;
435 return rc;
436 }
437
438
439 void
UdmInvertedIndexCacheInit(UDM_INVERTED_INDEX_CACHE * L,UDM_ENV * Env)440 UdmInvertedIndexCacheInit(UDM_INVERTED_INDEX_CACHE *L, UDM_ENV *Env)
441 {
442 size_t i;
443 bzero(L, sizeof(UDM_INVERTED_INDEX_CACHE));
444 L->param.unidata= Env->unidata;
445 L->param.Env= Env;
446 L->param.pair_limit= UdmVarListFindInt(&Env->Vars, "PairLimit", 0);
447 L->param.cnvflags= UdmVarListFindBool(&Env->Vars, "StripAccents", UDM_FALSE) ?
448 UDM_RECODE_HTML_OUT|UDM_RECODE_STRIP_ACCENTS :
449 UDM_RECODE_HTML_OUT;
450 L->param.save_section_size= UdmVarListFindInt(&Env->Vars, "SaveSectionSize", 1);
451 for (L->param.aggregate_section_flags= 0, i= 0; i < Env->Sections.nvars; i++)
452 {
453 const UDM_VAR *Item= UdmVarListFindConstByIndex(&Env->Sections, i);
454 L->param.aggregate_section_flags|= UdmVarFlags(Item);
455 }
456 UdmInvertedIndexWordDictionaryInit(&L->Words);
457 UdmMemrootInit(&L->coord_root, 16*1024*1024);
458 }
459
460
461 void
UdmInvertedIndexCacheReset(UDM_INVERTED_INDEX_CACHE * L)462 UdmInvertedIndexCacheReset(UDM_INVERTED_INDEX_CACHE *L)
463 {
464 size_t i;
465 UdmInvertedIndexWordDictionaryReset(&L->Words);
466 UdmMemrootReset(&L->coord_root);
467 for (i= 0; i < L->nitems; i++)
468 UdmInvertedIndexCachePartFree(&L->Item[i]);
469 }
470
471
472 void
UdmInvertedIndexCacheFree(UDM_INVERTED_INDEX_CACHE * L)473 UdmInvertedIndexCacheFree(UDM_INVERTED_INDEX_CACHE *L)
474 {
475 UdmInvertedIndexCacheReset(L);
476 UdmFree(L->Item);
477 UdmInvertedIndexWordDictionaryFree(&L->Words);
478 UdmMemrootFree(&L->coord_root);
479 }
480
481
482 static udm_rc_t
UdmInvertedIndexCacheRealloc(UDM_AGENT * A,UDM_INVERTED_INDEX_CACHE * L)483 UdmInvertedIndexCacheRealloc(UDM_AGENT *A, UDM_INVERTED_INDEX_CACHE *L)
484 {
485 if (L->nitems >= L->mitems)
486 {
487 size_t mitems2= L->mitems + INVERTED_INDEX_CACHE_PARTS;
488 size_t nbytes= mitems2 * sizeof(UDM_INVERTED_INDEX_CACHE_PART);
489 if (!(L->Item= (UDM_INVERTED_INDEX_CACHE_PART*) UdmRealloc(L->Item, nbytes)))
490 {
491 L->nitems= L->mitems= 0;
492 UdmLog(A, UDM_LOG_ERROR,
493 "UdmBlobCache2ListRealloc failed: %d bytes needed", (int) nbytes);
494 return UDM_ERROR;
495 }
496 L->mitems= mitems2;
497 }
498 return UDM_OK;
499 }
500
501
502 udm_rc_t
UdmInvertedIndexCacheAddPart(UDM_AGENT * A,UDM_INVERTED_INDEX_CACHE * cache)503 UdmInvertedIndexCacheAddPart(UDM_AGENT *A, UDM_INVERTED_INDEX_CACHE *cache)
504 {
505 if (UDM_OK != UdmInvertedIndexCacheRealloc(A, cache))
506 return UDM_ERROR;
507 UdmInvertedIndexCachePartInit(&cache->Item[cache->nitems]);
508 cache->nitems++;
509 return UDM_OK;
510 }
511
512
513 udm_rc_t
UdmInvertedIndexCacheAllocParts(UDM_AGENT * A,UDM_INVERTED_INDEX_CACHE * cache,size_t n)514 UdmInvertedIndexCacheAllocParts(UDM_AGENT *A,
515 UDM_INVERTED_INDEX_CACHE *cache, size_t n)
516 {
517 size_t i;
518 for (i= 0; i < n; i++)
519 {
520 if (UDM_OK != UdmInvertedIndexCacheAddPart(A, cache))
521 return UDM_ERROR;
522 }
523 return UDM_OK;
524 }
525
526
527 static udm_rc_t
UdmInvertedIndexCacheAddCoords(UDM_AGENT * A,UDM_CONSTWORD_HASH_DATA * data,const UDM_INVERTED_INDEX_COORD * C,size_t ncoords,udm_pos_t section_size)528 UdmInvertedIndexCacheAddCoords(UDM_AGENT *A,
529 UDM_CONSTWORD_HASH_DATA *data,
530 const UDM_INVERTED_INDEX_COORD *C, size_t ncoords,
531 udm_pos_t section_size)
532 {
533 UDM_INVERTED_INDEX_CACHE *cache= data->cache;
534 UDM_INVERTED_INDEX_WORD *W= &cache->Words.list.Item[C->word_id];
535 size_t n= W->crc % cache->nitems;
536 if (W->is_stopword)
537 return UDM_OK;
538 return UdmInvertedIndexCachePartAddCoords(A, data, &cache->Item[n],
539 W, C, ncoords, section_size);
540 }
541
542
543 udm_rc_t
UdmInvertedIndexCoordListInit(UDM_INVERTED_INDEX_COORD_LIST * L,size_t mitems)544 UdmInvertedIndexCoordListInit(UDM_INVERTED_INDEX_COORD_LIST *L, size_t mitems)
545 {
546 L->nitems= 0;
547 L->mitems= mitems;
548 L->Item= (UDM_INVERTED_INDEX_COORD*) UdmMalloc(sizeof(UDM_INVERTED_INDEX_COORD) * mitems);
549 return L->Item ? UDM_OK : UDM_ERROR;
550 }
551
552
553 static int
cmp_ii_coord(const UDM_INVERTED_INDEX_COORD * a,const UDM_INVERTED_INDEX_COORD * b)554 cmp_ii_coord(const UDM_INVERTED_INDEX_COORD *a,
555 const UDM_INVERTED_INDEX_COORD *b)
556 {
557 if (a->word_id != b->word_id)
558 return a->word_id < b->word_id ? -1 : 1;
559 if (a->coord.secno != b->coord.secno)
560 return a->coord.secno < b->coord.secno ? -1 : 1;
561 if (a->coord.pos != b->coord.pos)
562 return a->coord.pos < b->coord.pos ? -1 : 1;
563 return 0;
564 }
565
566
567 void
UdmInvertedIndexCoordListSort(UDM_INVERTED_INDEX_COORD_LIST * L)568 UdmInvertedIndexCoordListSort(UDM_INVERTED_INDEX_COORD_LIST *L)
569 {
570 if (L->nitems)
571 UdmSort(L->Item, L->nitems, sizeof(UDM_INVERTED_INDEX_COORD),
572 (udm_qsort_cmp) cmp_ii_coord);
573 }
574
575
576 void
UdmInvertedIndexCoordListFree(UDM_INVERTED_INDEX_COORD_LIST * L)577 UdmInvertedIndexCoordListFree(UDM_INVERTED_INDEX_COORD_LIST *L)
578 {
579 UdmFree(L->Item);
580 }
581
582
583 udm_rc_t
UdmConstWordListToInvertedIndexCoordList(UDM_INVERTED_INDEX_CACHE * cache,UDM_INVERTED_INDEX_COORD_LIST * CL,UDM_CONSTWORDLIST * CWL)584 UdmConstWordListToInvertedIndexCoordList(UDM_INVERTED_INDEX_CACHE *cache,
585 UDM_INVERTED_INDEX_COORD_LIST *CL,
586 UDM_CONSTWORDLIST *CWL)
587 {
588 udm_rc_t rc;
589 size_t i;
590
591 if (UDM_OK != (rc= UdmInvertedIndexCoordListInit(CL, CWL->nitems)))
592 return rc;
593
594 for (i= 0; i < CWL->nitems; i++)
595 {
596 UDM_CONSTWORD *Item= &CWL->Item[i];
597 UDM_INVERTED_INDEX_HASH_WORD_REC rec;
598 UDM_INVERTED_INDEX_HASH_WORD_KEY *key;
599 UDM_INVERTED_INDEX_COORD *C= &CL->Item[CL->nitems++];
600 rec.str.str= Item->str;
601 rec.str.length= Item->length;
602 rec.crc= Item->crc;
603 if (!(key= (UDM_INVERTED_INDEX_HASH_WORD_KEY*) UdmHashPut(&cache->Words.hash, &rec)))
604 {
605 UdmInvertedIndexCoordListFree(CL);
606 return UDM_ERROR;
607 }
608 C->word_id= key->word_id;
609 C->coord= Item->coord;
610 }
611 return UDM_OK;
612 }
613
614
615 static int
clcmp_search(UDM_INVERTED_INDEX_COORD * c1,UDM_INVERTED_INDEX_COORD * c2)616 clcmp_search(UDM_INVERTED_INDEX_COORD *c1, UDM_INVERTED_INDEX_COORD *c2)
617 {
618 if (c1->word_id != c2->word_id)
619 return c1->word_id < c2->word_id ? -1 : 1;
620 if (c1->coord.secno != c2->coord.secno)
621 return c1->coord.secno < c2->coord.secno ? -1 : 1;
622 return 0;
623 }
624
625
626 udm_rc_t
UdmInvertedIndexCoordList2InvertedIndexCache(UDM_AGENT * A,UDM_INVERTED_INDEX_COORD_LIST * CL,UDM_CONSTWORDLIST * CWLWithPos,UDM_CONSTWORD_HASH_DATA * data,UDM_INVERTED_INDEX_CACHE * L)627 UdmInvertedIndexCoordList2InvertedIndexCache(UDM_AGENT *A,
628 UDM_INVERTED_INDEX_COORD_LIST *CL,
629 UDM_CONSTWORDLIST *CWLWithPos,
630 UDM_CONSTWORD_HASH_DATA *data,
631 UDM_INVERTED_INDEX_CACHE *L)
632 {
633 size_t i;
634 UDM_INVERTED_INDEX_COORD *prev;
635 if (!CL->nitems)
636 return UDM_OK;
637 for (prev= &CL->Item[0], i= 0; i <= CL->nitems; i++)
638 {
639 UDM_INVERTED_INDEX_COORD *C= &CL->Item[i];
640 if (i == CL->nitems || clcmp_search(prev, C))
641 {
642 if (UDM_OK != UdmInvertedIndexCacheAddCoords(A, data, prev, C - prev,
643 data->cache->param.save_section_size ?
644 CWLWithPos->wordpos[prev->coord.secno] + 1 :
645 0))
646 return UDM_ERROR;
647 prev= C;
648 }
649 }
650 return UDM_OK;
651 }
652
653
654 static size_t
UdmInvertedIndexWordListEstimateUsedMemory(const UDM_INVERTED_INDEX_WORD_LIST * List)655 UdmInvertedIndexWordListEstimateUsedMemory(const UDM_INVERTED_INDEX_WORD_LIST *List)
656 {
657 return List->mitems * sizeof(List->Item[0]);
658 }
659
660
661 static size_t
UdmInvertedIndexCachePartsUsedMemory(const UDM_INVERTED_INDEX_CACHE * cache)662 UdmInvertedIndexCachePartsUsedMemory(const UDM_INVERTED_INDEX_CACHE *cache)
663 {
664 size_t i, mitems;
665 for (i= mitems= 0; i < cache->nitems; i++)
666 {
667 mitems+= cache->Item[i].mitems;
668 }
669 return mitems * sizeof(UDM_INVERTED_INDEX_CACHE_ITEM);
670 }
671
672
673 #define M1 1048576e0
674 size_t
UdmInvertedIndexCacheEstimateUsedMemory(const UDM_INVERTED_INDEX_CACHE * cache)675 UdmInvertedIndexCacheEstimateUsedMemory(const UDM_INVERTED_INDEX_CACHE *cache)
676 {
677 size_t total=
678 UdmHashUsedMemorySize(&cache->Words.hash) +
679 UdmInvertedIndexWordListEstimateUsedMemory(&cache->Words.list) +
680 UdmMemrootAllocedMemory(&cache->Words.root) +
681 UdmMemrootAllocedMemory(&cache->coord_root) +
682 UdmInvertedIndexCachePartsUsedMemory(cache);
683 /*
684 fprintf(stderr, "rlist=%.1f alist=%.1f hash=%.1f parts=%.1f coords=%.1f total=%.1f rs=%.1f\n",
685 (double) UdmMemrootAllocedMemory(&cache->Words.root) / M1,
686 (double) UdmInvertedIndexWordListEstimateUsedMemory(&cache->Words.list) / M1,
687 (double) UdmHashUsedMemorySize(&cache->Words.hash) / M1,
688 (double) UdmInvertedIndexCachePartsUsedMemory(cache) / M1,
689 (double) UdmMemrootAllocedMemory(&cache->coord_root) / M1,
690 (double) total / M1,
691 (double) UdmProcessCurrentResidentSize() / M1);
692 */
693 return total;
694 }
695