1 /* Copyright (C) 2000-2015 Lavtech.com corp. All rights reserved.
2
3 This program is free software; you can redistribute it and/or modify
4 it under the terms of the GNU General Public License as published by
5 the Free Software Foundation; either version 2 of the License, or
6 (at your option) any later version.
7
8 This program is distributed in the hope that it will be useful,
9 but WITHOUT ANY WARRANTY; without even the implied warranty of
10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 GNU General Public License for more details.
12
13 You should have received a copy of the GNU General Public License
14 along with this program; if not, write to the Free Software
15 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
16 */
17
18 #include "udm_config.h"
19
20 #include <stdio.h>
21 #include <stdlib.h>
22 #include <string.h>
23 #include <sys/types.h>
24
25 #include "udm_common.h"
26 #include "udm_utils.h"
27 #include "udm_unicode.h"
28 #include "udm_word.h"
29 #include "udm_searchtool.h"
30
31
32 #define WSIZE 1024
33 #define BSIZE 10
34
35 /*
36 offs=0 means normal word
37 offs=1 means seclen marker
38 */
39 udm_rc_t
UdmWordListAddEx(UDM_WORDLIST * Words,const char * word,size_t secno,size_t wordpos,size_t offs)40 UdmWordListAddEx(UDM_WORDLIST *Words,
41 const char *word,
42 size_t secno,
43 size_t wordpos,
44 size_t offs)
45 {
46 UDM_WORD *W;
47 if (wordpos > 0x1FFFFF)
48 return(UDM_OK);
49 /* Realloc memory when required */
50 if(Words->nwords >= Words->mwords)
51 {
52 Words->mwords+= WSIZE;
53 Words->Word= (UDM_WORD *) UdmRealloc(Words->Word, Words->mwords * sizeof(UDM_WORD));
54 }
55 /* Add new word */
56 W= &Words->Word[Words->nwords];
57 W->word= (char*) UdmStrdup(word);
58 W->coord.pos= wordpos /*+ offs*/;
59 W->coord.secno= secno;
60 W->hash= 0;
61 W->seclen_marker= offs;
62 Words->nwords++;
63 return UDM_OK;
64 }
65
66 /* This function adds a normalized word form(s) into list using Ispell */
67 udm_rc_t
UdmWordListAdd(UDM_WORDLIST * List,char * word,int secno)68 UdmWordListAdd(UDM_WORDLIST *List, char *word, int secno)
69 {
70 return UdmWordListAddEx(List, word, secno, ++List->wordpos[secno],0);
71 }
72
73
74 void
UdmWordListReset(UDM_WORDLIST * List)75 UdmWordListReset(UDM_WORDLIST *List)
76 {
77 size_t i;
78 for (i= 0; i < List->nwords; i++)
79 UDM_FREE(List->Word[i].word);
80 List->nwords= 0;
81 }
82
83
84 void
UdmWordListFree(UDM_WORDLIST * List)85 UdmWordListFree(UDM_WORDLIST * List)
86 {
87 size_t i;
88 for(i=0;i<List->nwords;i++)
89 UDM_FREE(List->Word[i].word);
90 List->nwords=0;
91 UDM_FREE(List->Word);
92 }
93
94
95 void
UdmWordListListInit(UDM_WORDLISTLIST * WL)96 UdmWordListListInit(UDM_WORDLISTLIST *WL)
97 {
98 bzero((void*) WL, sizeof(*WL));
99 }
100
101
102 void
UdmWordListListFree(UDM_WORDLISTLIST * WL)103 UdmWordListListFree(UDM_WORDLISTLIST *WL)
104 {
105 size_t i;
106 for (i= 0; i < 255; i ++)
107 {
108 UdmWordListFree(&WL->Item[i]);
109 }
110 }
111
112
113 void
UdmWordListListReset(UDM_WORDLISTLIST * WL)114 UdmWordListListReset(UDM_WORDLISTLIST *WL)
115 {
116 size_t i;
117 for (i= 0; i < 255; i ++)
118 {
119 UdmWordListReset(&WL->Item[i]);
120 }
121 }
122
123 /***************** ConstWordList *************************/
124 void
UdmConstWordListInit(UDM_CONSTWORDLIST * L)125 UdmConstWordListInit(UDM_CONSTWORDLIST *L)
126 {
127 bzero((void*)L, sizeof(*L));
128 }
129
130
131 void
UdmConstWordListFree(UDM_CONSTWORDLIST * L)132 UdmConstWordListFree(UDM_CONSTWORDLIST *L)
133 {
134 UdmFree(L->Item);
135 }
136
137
138 udm_rc_t
UdmConstWordListAdd(UDM_CONSTWORDLIST * L,UDM_CONSTWORD * W)139 UdmConstWordListAdd(UDM_CONSTWORDLIST *L, UDM_CONSTWORD *W)
140 {
141 if (L->nitems >= L->mitems)
142 {
143 L->mitems+= 8*1024;
144 L->Item= (UDM_CONSTWORD *) UdmRealloc(L->Item, L->mitems * sizeof(UDM_CONSTWORD));
145 if (!L->Item)
146 {
147 L->mitems= L->nitems= 0;
148 return UDM_ERROR;
149 }
150 }
151 L->Item[L->nitems]= W[0];
152 L->nitems++;
153 return UDM_OK;
154 }
155
156
/*
  Sort comparator for UDM_CONSTWORD: order by crc, then by section
  number, then by position.
  TODO34: the word length and the word bytes are not compared
  (UdmStrCaseCmp2), so only the crc distinguishes words.
*/
static int
cwcmp_sort(UDM_CONSTWORD *w1, UDM_CONSTWORD *w2)
{
  if (w1->crc < w2->crc)
    return -1;
  if (w1->crc > w2->crc)
    return 1;

  if (w1->coord.secno < w2->coord.secno)
    return -1;
  if (w1->coord.secno > w2->coord.secno)
    return 1;

  if (w1->coord.pos < w2->coord.pos)
    return -1;
  if (w1->coord.pos > w2->coord.pos)
    return 1;

  return 0;
}
173
174
175 void
UdmConstWordListSort(UDM_CONSTWORDLIST * WL)176 UdmConstWordListSort(UDM_CONSTWORDLIST *WL)
177 {
178 if (WL->nitems)
179 UdmSort(WL->Item, WL->nitems, sizeof(UDM_CONSTWORD), (udm_qsort_cmp) cwcmp_sort);
180 }
181
182
183 udm_rc_t
UdmConstWordListAddString(UDM_WORD_SCANNER * scanner,int cnvflags,UDM_CONSTWORDLIST * CWL,udm_secno_t secno,const char * src,size_t srclen)184 UdmConstWordListAddString(UDM_WORD_SCANNER *scanner, int cnvflags,
185 UDM_CONSTWORDLIST *CWL,
186 udm_secno_t secno, const char *src, size_t srclen)
187 {
188 udm_rc_t rc= UDM_OK;
189 int len;
190 int (*getword)(UDM_WORD_SCANNER *, UDM_WORD_SCANNER_TOKEN *);
191 UDM_WORD_SCANNER_TOKEN word;
192
193 /*fprintf(stderr, "===========cs=%s\n[%d:fl=%d]%.*s\n", cs->name, secno, cnvflags, (int) srclen, src);*/
194 getword= scanner->cs->cset->getword;
195 UDM_ASSERT(getword != NULL);
196
197 UdmWordScannerSetSource(scanner, cnvflags, src, srclen);
198
199 for (len= getword(scanner, &word); len; len= getword(scanner, &word))
200 {
201 UDM_CONSTWORD W;
202 if (len > 255) /* TODO34: limit in UDM_CONSTWORD */
203 {
204 CWL->wordpos[secno]++;
205 continue;
206 }
207 W.str= word.str;
208 W.length= len;
209 W.crc= word.crc;
210 W.coord.pos= ++(CWL->wordpos[secno]); /* TODO34: check overflow */
211 /*fprintf(stderr, "[%d:%d] %08X '%.*s'\n", secno, W.pos, W.crc, (int) W.length, W.str);*/
212 W.coord.secno= secno;
213 if (UDM_OK != (rc= UdmConstWordListAdd(CWL, &W)))
214 break;
215 }
216 return rc;
217 }
218
219
220 /*********************************************************/
221
/*
  Initialize word parameters: everything is zeroed,
  then user_weight gets its default value.
*/
void
UdmWideWordParamInit(UDM_WIDEWORD_PARAM *P)
{
  memset((void *) P, 0, sizeof(*P));
  P->user_weight= UDM_DEFAULT_USER_WORD_WEIGHT;
}
227
228
/*
  Initialize a wide word: everything is zeroed,
  then Param.user_weight gets its default value.
*/
void
UdmWideWordInit(UDM_WIDEWORD *W)
{
  memset((void *) W, 0, sizeof(*W));
  W->Param.user_weight= UDM_DEFAULT_USER_WORD_WEIGHT;
}
234
235
/*
  Release the string held by a wide word.
  The parameters of the word are not touched.
*/
void
UdmWideWordFree(UDM_WIDEWORD *W)
{
  UDM_FREE(W->Word.str);
}
240
241
242 void
UdmWideWordParamCopySpell(UDM_WIDEWORD_PARAM * Dst,const UDM_WIDEWORD_PARAM * Src)243 UdmWideWordParamCopySpell(UDM_WIDEWORD_PARAM *Dst, const UDM_WIDEWORD_PARAM *Src)
244 {
245 Dst->order= Src->order;
246 Dst->phrpos= Src->phrpos;
247 Dst->phrlen= Src->phrlen;
248 Dst->origin= UDM_WORD_ORIGIN_SPELL;
249 }
250
251
252 void
UdmWideWordParamCopySynonym(UDM_WIDEWORD_PARAM * Dst,const UDM_WIDEWORD_PARAM * Src)253 UdmWideWordParamCopySynonym(UDM_WIDEWORD_PARAM *Dst, const UDM_WIDEWORD_PARAM *Src)
254 {
255 Dst->order= Src->order;
256 Dst->phrpos= Src->phrpos;
257 Dst->phrlen= Src->phrlen;
258 Dst->match_mode= Src->match_mode;
259 Dst->origin= UDM_WORD_ORIGIN_SYNONYM;
260 }
261
262
263 static void
UdmWideWordCopyParam(UDM_WIDEWORD_PARAM * Dst,const UDM_WIDEWORD_PARAM * Src)264 UdmWideWordCopyParam(UDM_WIDEWORD_PARAM *Dst, const UDM_WIDEWORD_PARAM *Src)
265 {
266 UDM_ASSERT(Src->origin >= UDM_WORD_ORIGIN_QUERY &&
267 Src->origin <= UDM_WORD_ORIGIN_COLLATION);
268 /* TODO34: Dst->Param= Src->Param */
269 Dst->order= Src->order;
270 Dst->order_extra_width= Src->order_extra_width;
271 Dst->count= Src->count;
272 /* doccount? */
273 Dst->origin= Src->origin;
274 /* weight? */
275 Dst->user_weight= Src->user_weight;
276 Dst->match_mode= Src->match_mode;
277 Dst->secno= Src->secno;
278 Dst->phrpos= Src->phrpos;
279 Dst->phrlen= Src->phrlen;
280 Dst->phrwidth= Src->phrwidth;
281 }
282
283
284 static void
UdmWideWordCopyWord(UDM_WIDEWORD * Dst,const char * str,size_t length)285 UdmWideWordCopyWord(UDM_WIDEWORD *Dst, const char *str, size_t length)
286 {
287 Dst->Word.length= length;
288 Dst->Word.str= str ? UdmStrdup(str) : NULL;
289 }
290
291
292 static void
UdmWideWordCopy(UDM_WIDEWORD * Dst,UDM_WIDEWORD * Src)293 UdmWideWordCopy(UDM_WIDEWORD *Dst, UDM_WIDEWORD *Src)
294 {
295 UdmWideWordCopyParam(&Dst->Param, &Src->Param);
296 UdmWideWordCopyWord(Dst, Src->Word.str, Src->Word.length);
297 }
298
299
300 /*
301 Replace the last three words in the list to range designator
302 */
303 udm_rc_t
UdmWideWordListMakeRange(UDM_WIDEWORDLIST * WWL,int beg,int end)304 UdmWideWordListMakeRange(UDM_WIDEWORDLIST *WWL, int beg, int end)
305 {
306 UDM_WIDEWORD *W= &WWL->Word[WWL->nwords - 3];
307 char *word;
308 size_t len;
309 UDM_ASSERT(WWL->nwords >= 3);
310 UDM_ASSERT(WWL->nuniq >= 3);
311 len= 1 + W[0].Word.length + 4 + W[2].Word.length + 1;
312 if (!(word= (char*) UdmMalloc(len + 1)))
313 return UDM_ERROR;
314 udm_snprintf(word, len + 1, "%c%s TO %s%c",
315 beg, W[0].Word.str, W[2].Word.str, end);
316 UdmWideWordFree(&W[0]);
317 UdmWideWordFree(&W[1]);
318 UdmWideWordFree(&W[2]);
319 WWL->nwords-= 2;
320 WWL->nuniq-= 2;
321 W->Word.str= word;
322 W->Word.length= len;
323 W->Param.match_mode= UDM_MATCH_RANGE;
324 return UDM_OK;
325 }
326
327
328
/*
  Put a word list into its initial state: the whole structure is
  filled with zero bytes. Returns its argument.
*/
UDM_WORDLIST *
UdmWordListInit(UDM_WORDLIST *List)
{
  memset((void *) List, 0, sizeof(*List));
  return List;
}
334
335
/*
  Put a wide word list into its initial state: the whole structure
  is filled with zero bytes. Returns its argument.
*/
UDM_WIDEWORDLIST *
UdmWideWordListInit(UDM_WIDEWORDLIST *List)
{
  memset((void *) List, 0, sizeof(*List));
  return List;
}
341
342 static size_t
UdmWideWordListAddInternal(UDM_WIDEWORDLIST * List,UDM_WIDEWORD_PARAM * Param,UDM_CONST_STR w,int for_stat)343 UdmWideWordListAddInternal(UDM_WIDEWORDLIST * List,
344 UDM_WIDEWORD_PARAM *Param,
345 UDM_CONST_STR w,
346 int for_stat)
347 {
348 size_t i;
349
350 for (i = 0; i < List->nwords; i++)
351 {
352 UDM_WIDEWORD *ListWord= &List->Word[i];
353 if (ListWord->Word.length == w.length &&
354 strcmp(ListWord->Word.str, w.str) == 0)
355 {
356 if (for_stat)
357 {
358 ListWord->Param.count+= Param->count;
359 return List->nwords;
360 }
361 else if (ListWord->Param.order == Param->order)
362 {
363 ListWord->Param.count+= Param->count;
364 if (ListWord->Param.phrpos != Param->phrpos ||
365 ListWord->Param.phrlen != Param->phrlen)
366 {
367 ListWord->Param.phrpos= 0; /* No certain in-phrase position */
368 ListWord->Param.phrlen= 0;
369 }
370 switch (Param->origin)
371 {
372 case UDM_WORD_ORIGIN_QUERY:
373 if (ListWord->Param.origin == UDM_WORD_ORIGIN_STOP)
374 break;
375 case UDM_WORD_ORIGIN_STOP:
376 ListWord->Param.origin= Param->origin;
377 break;
378 case UDM_WORD_ORIGIN_SPELL:
379 case UDM_WORD_ORIGIN_SYNONYM:
380 case UDM_WORD_ORIGIN_SYNONYM_FINAL:
381 case UDM_WORD_ORIGIN_SUGGEST:
382 case UDM_WORD_ORIGIN_COLLATION:
383 break;
384 }
385 ListWord->Param.order= Param->order;
386 return List->nwords;
387 }
388 }
389 }
390
391 /* Realloc memory */
392 List->Word= (UDM_WIDEWORD*)UdmRealloc(List->Word,sizeof(*(List->Word))*(List->nwords+1));
393 UdmWideWordInit(&List->Word[List->nwords]);
394
395 /* Copy data */
396 UdmWideWordCopyParam(&List->Word[List->nwords].Param, Param);
397 UdmWideWordCopyWord(&List->Word[List->nwords], w.str, w.length);
398
399 List->nwords++;
400 return(List->nwords);
401 }
402
403
/*
  Add a wide word to the list in the normal (non-stat) mode.
  Returns the result of UdmWideWordListAddInternal().
*/
size_t
UdmWideWordListAdd(UDM_WIDEWORDLIST *List, UDM_WIDEWORD *Word)
{
  UDM_CONST_STR text;

  text.length= Word->Word.length;
  text.str= Word->Word.str;
  return UdmWideWordListAddInternal(List, &Word->Param, text, 0);
}
411
412
413 size_t
UdmWideWordListAddLike(UDM_WIDEWORDLIST * WWList,UDM_WIDEWORD_PARAM * param,const char * word)414 UdmWideWordListAddLike(UDM_WIDEWORDLIST *WWList,
415 UDM_WIDEWORD_PARAM *param, const char *word)
416 {
417 UDM_CONST_STR cstr;
418 cstr.str= word;
419 cstr.length= strlen(word);
420 return UdmWideWordListAddInternal(WWList, param, cstr, 0);
421 }
422
423
424 size_t
UdmWideWordListAddLikeConstStr(UDM_WIDEWORDLIST * WWList,UDM_WIDEWORD_PARAM * param,UDM_CONST_STR str)425 UdmWideWordListAddLikeConstStr(UDM_WIDEWORDLIST *WWList,
426 UDM_WIDEWORD_PARAM *param,
427 UDM_CONST_STR str)
428 {
429 return UdmWideWordListAddInternal(WWList, param, str, 0);
430 }
431
432
/*
  Add a wide word to the list in the stat mode (for_stat=1).
  Returns the result of UdmWideWordListAddInternal().
*/
size_t
UdmWideWordListAddForStat(UDM_WIDEWORDLIST *List, UDM_WIDEWORD *Word)
{
  UDM_CONST_STR text;

  text.length= Word->Word.length;
  text.str= Word->Word.str;
  return UdmWideWordListAddInternal(List, &Word->Param, text, 1);
}
440
441
/*
  Release all words of the list and the word array,
  then put the list back into its initial (zeroed) state.
*/
void
UdmWideWordListFree(UDM_WIDEWORDLIST *List)
{
  size_t idx= 0;

  while (idx < List->nwords)
  {
    UdmWideWordFree(&List->Word[idx]);
    idx++;
  }
  UDM_FREE(List->Word);
  UdmWideWordListInit(List);
}
450
451
452 udm_rc_t
UdmWideWordListCopy(UDM_WIDEWORDLIST * Dst,UDM_WIDEWORDLIST * Src)453 UdmWideWordListCopy(UDM_WIDEWORDLIST *Dst, UDM_WIDEWORDLIST *Src)
454 {
455 size_t i;
456 *Dst= *Src;
457 Dst->Word= (UDM_WIDEWORD*) UdmMalloc(sizeof(*(Src->Word))*(Src->nwords));
458 for (i= 0; i < Src->nwords; i++)
459 UdmWideWordCopy(&Dst->Word[i], &Src->Word[i]);
460 return UDM_OK;
461 }
462
463
wwcmp(const UDM_WIDEWORD * w1,const UDM_WIDEWORD * w2)464 static int wwcmp(const UDM_WIDEWORD *w1, const UDM_WIDEWORD *w2)
465 {
466 int rc;
467 if ((rc= strcmp(w1->Word.str, w2->Word.str)))
468 return rc;
469 return (int) w1->Param.secno - (int) w2->Param.secno;
470 }
471
472
/*
  Sort the words of the list with wwcmp().
  An empty list is left untouched.
*/
void
UdmWideWordListSort(UDM_WIDEWORDLIST *L)
{
  if (!L->nwords)
    return;
  UdmSort(L->Word, L->nwords, sizeof(UDM_WIDEWORD), (udm_qsort_cmp) wwcmp);
}
478
479
/*
  Sort comparator for UDM_WORD: order by the word text (strcmp),
  then by coord.secno.
*/
static int
wlcmp(UDM_WORD *w1, UDM_WORD *w2)
{
  int diff= strcmp(w1->word, w2->word);

  if (diff == 0)
    diff= (int) w1->coord.secno - (int) w2->coord.secno;
  return diff;
}
487
488
489 udm_rc_t
UdmWordListSaveSectionSize(UDM_WORDLIST * Words)490 UdmWordListSaveSectionSize(UDM_WORDLIST *Words)
491 {
492 size_t i= Words->nwords;
493 int prev_sec= 0;
494 const char *prev_word= "#non-existing";
495 if (i)
496 UdmSort(Words->Word, i, sizeof(UDM_WORD), (udm_qsort_cmp)wlcmp);
497 while (i--)
498 {
499 /*
500 This assignement must be inside the loop, since Word could be
501 realloced by AddOneWord
502 */
503 UDM_WORD *W= &Words->Word[i];
504 if (W->coord.secno != prev_sec || strcmp(W->word, prev_word))
505 {
506 udm_rc_t rc;
507 prev_word= W->word;
508 prev_sec= W->coord.secno;
509 if (UDM_OK != (rc= UdmWordListAddEx(Words, prev_word, prev_sec,
510 Words->wordpos[prev_sec] + 1, 1)))
511 return rc;
512 }
513 }
514 return UDM_OK;
515 }
516