1 /* Copyright (C) 2000-2015 Lavtech.com corp. All rights reserved.
2 
3    This program is free software; you can redistribute it and/or modify
4    it under the terms of the GNU General Public License as published by
5    the Free Software Foundation; either version 2 of the License, or
6    (at your option) any later version.
7 
8    This program is distributed in the hope that it will be useful,
9    but WITHOUT ANY WARRANTY; without even the implied warranty of
10    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11    GNU General Public License for more details.
12 
13    You should have received a copy of the GNU General Public License
14    along with this program; if not, write to the Free Software
15    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
16 */
17 
18 #include "udm_config.h"
19 
20 #include <stdio.h>
21 #include <stdlib.h>
22 #include <string.h>
23 #include <sys/types.h>
24 
25 #include "udm_common.h"
26 #include "udm_utils.h"
27 #include "udm_unicode.h"
28 #include "udm_word.h"
29 #include "udm_searchtool.h"
30 
31 
32 #define WSIZE    1024
33 #define BSIZE    10
34 
35 /*
36   offs=0 means normal word
37   offs=1 means seclen marker
38 */
39 udm_rc_t
UdmWordListAddEx(UDM_WORDLIST * Words,const char * word,size_t secno,size_t wordpos,size_t offs)40 UdmWordListAddEx(UDM_WORDLIST *Words,
41                  const char *word,
42                  size_t secno,
43                  size_t wordpos,
44                  size_t offs)
45 {
46   UDM_WORD *W;
47   if (wordpos > 0x1FFFFF)
48     return(UDM_OK);
49   /* Realloc memory when required  */
50   if(Words->nwords >= Words->mwords)
51   {
52     Words->mwords+= WSIZE;
53     Words->Word= (UDM_WORD *) UdmRealloc(Words->Word, Words->mwords * sizeof(UDM_WORD));
54   }
55   /* Add new word */
56   W= &Words->Word[Words->nwords];
57   W->word= (char*) UdmStrdup(word);
58   W->coord.pos= wordpos /*+ offs*/;
59   W->coord.secno= secno;
60   W->hash= 0;
61   W->seclen_marker= offs;
62   Words->nwords++;
63   return UDM_OK;
64 }
65 
66 /* This function adds a normalized word form(s) into list using Ispell */
67 udm_rc_t
UdmWordListAdd(UDM_WORDLIST * List,char * word,int secno)68 UdmWordListAdd(UDM_WORDLIST *List, char *word, int secno)
69 {
70   return UdmWordListAddEx(List, word, secno, ++List->wordpos[secno],0);
71 }
72 
73 
74 void
UdmWordListReset(UDM_WORDLIST * List)75 UdmWordListReset(UDM_WORDLIST *List)
76 {
77   size_t i;
78   for (i= 0; i < List->nwords; i++)
79     UDM_FREE(List->Word[i].word);
80   List->nwords= 0;
81 }
82 
83 
84 void
UdmWordListFree(UDM_WORDLIST * List)85 UdmWordListFree(UDM_WORDLIST * List)
86 {
87   size_t i;
88   for(i=0;i<List->nwords;i++)
89     UDM_FREE(List->Word[i].word);
90   List->nwords=0;
91   UDM_FREE(List->Word);
92 }
93 
94 
95 void
UdmWordListListInit(UDM_WORDLISTLIST * WL)96 UdmWordListListInit(UDM_WORDLISTLIST *WL)
97 {
98   bzero((void*) WL, sizeof(*WL));
99 }
100 
101 
102 void
UdmWordListListFree(UDM_WORDLISTLIST * WL)103 UdmWordListListFree(UDM_WORDLISTLIST *WL)
104 {
105   size_t i;
106   for (i= 0; i < 255; i ++)
107   {
108     UdmWordListFree(&WL->Item[i]);
109   }
110 }
111 
112 
113 void
UdmWordListListReset(UDM_WORDLISTLIST * WL)114 UdmWordListListReset(UDM_WORDLISTLIST *WL)
115 {
116   size_t i;
117   for (i= 0; i < 255; i ++)
118   {
119     UdmWordListReset(&WL->Item[i]);
120   }
121 }
122 
123 /***************** ConstWordList *************************/
124 void
UdmConstWordListInit(UDM_CONSTWORDLIST * L)125 UdmConstWordListInit(UDM_CONSTWORDLIST *L)
126 {
127   bzero((void*)L, sizeof(*L));
128 }
129 
130 
131 void
UdmConstWordListFree(UDM_CONSTWORDLIST * L)132 UdmConstWordListFree(UDM_CONSTWORDLIST *L)
133 {
134   UdmFree(L->Item);
135 }
136 
137 
138 udm_rc_t
UdmConstWordListAdd(UDM_CONSTWORDLIST * L,UDM_CONSTWORD * W)139 UdmConstWordListAdd(UDM_CONSTWORDLIST *L, UDM_CONSTWORD *W)
140 {
141   if (L->nitems >= L->mitems)
142   {
143     L->mitems+= 8*1024;
144     L->Item= (UDM_CONSTWORD *) UdmRealloc(L->Item, L->mitems * sizeof(UDM_CONSTWORD));
145     if (!L->Item)
146     {
147       L->mitems= L->nitems= 0;
148       return UDM_ERROR;
149     }
150   }
151   L->Item[L->nitems]= W[0];
152   L->nitems++;
153   return UDM_OK;
154 }
155 
156 
/*
  Ordering function for UDM_CONSTWORD items: by crc, then by section
  number, then by position. Returns -1, 0 or 1.

  Word length and word text are not compared, so two different words
  with equal crc are ordered by their coordinates only
  (TODO34: UdmStrCaseCmp2).
*/
static int cwcmp_sort(UDM_CONSTWORD *w1, UDM_CONSTWORD *w2)
{
  int order= (w1->crc > w2->crc) - (w1->crc < w2->crc);
  if (!order)
    order= (w1->coord.secno > w2->coord.secno) -
           (w1->coord.secno < w2->coord.secno);
  if (!order)
    order= (w1->coord.pos > w2->coord.pos) -
           (w1->coord.pos < w2->coord.pos);
  return order;
}
173 
174 
175 void
UdmConstWordListSort(UDM_CONSTWORDLIST * WL)176 UdmConstWordListSort(UDM_CONSTWORDLIST *WL)
177 {
178   if (WL->nitems)
179     UdmSort(WL->Item, WL->nitems, sizeof(UDM_CONSTWORD), (udm_qsort_cmp) cwcmp_sort);
180 }
181 
182 
183 udm_rc_t
UdmConstWordListAddString(UDM_WORD_SCANNER * scanner,int cnvflags,UDM_CONSTWORDLIST * CWL,udm_secno_t secno,const char * src,size_t srclen)184 UdmConstWordListAddString(UDM_WORD_SCANNER *scanner, int cnvflags,
185                           UDM_CONSTWORDLIST *CWL,
186                           udm_secno_t secno, const char *src, size_t srclen)
187 {
188   udm_rc_t rc= UDM_OK;
189   int len;
190   int (*getword)(UDM_WORD_SCANNER *, UDM_WORD_SCANNER_TOKEN *);
191   UDM_WORD_SCANNER_TOKEN word;
192 
193   /*fprintf(stderr, "===========cs=%s\n[%d:fl=%d]%.*s\n", cs->name, secno, cnvflags, (int) srclen, src);*/
194   getword= scanner->cs->cset->getword;
195   UDM_ASSERT(getword != NULL);
196 
197   UdmWordScannerSetSource(scanner, cnvflags, src, srclen);
198 
199   for (len= getword(scanner, &word); len; len= getword(scanner, &word))
200   {
201     UDM_CONSTWORD W;
202     if (len > 255) /* TODO34: limit in UDM_CONSTWORD */
203     {
204       CWL->wordpos[secno]++;
205       continue;
206     }
207     W.str= word.str;
208     W.length= len;
209     W.crc= word.crc;
210     W.coord.pos= ++(CWL->wordpos[secno]); /* TODO34: check overflow */
211     /*fprintf(stderr, "[%d:%d] %08X '%.*s'\n", secno, W.pos, W.crc, (int) W.length, W.str);*/
212     W.coord.secno= secno;
213     if (UDM_OK != (rc= UdmConstWordListAdd(CWL, &W)))
214       break;
215   }
216   return rc;
217 }
218 
219 
220 /*********************************************************/
221 
/*
  Zero-fill the parameter block, then set user_weight to
  UDM_DEFAULT_USER_WORD_WEIGHT.
*/
void UdmWideWordParamInit(UDM_WIDEWORD_PARAM *P)
{
  memset(P, 0, sizeof(*P));
  P->user_weight= UDM_DEFAULT_USER_WORD_WEIGHT;
}
227 
228 
/*
  Zero-fill the whole wide word, then set Param.user_weight to
  UDM_DEFAULT_USER_WORD_WEIGHT.
*/
void UdmWideWordInit(UDM_WIDEWORD *W)
{
  memset(W, 0, sizeof(*W));
  W->Param.user_weight= UDM_DEFAULT_USER_WORD_WEIGHT;
}
234 
235 
/*
  Release the string owned by the wide word (W->Word.str).
  Word.length and the Param block are not touched.
*/
void
UdmWideWordFree(UDM_WIDEWORD *W)
{
  UDM_FREE(W->Word.str);
}
240 
241 
242 void
UdmWideWordParamCopySpell(UDM_WIDEWORD_PARAM * Dst,const UDM_WIDEWORD_PARAM * Src)243 UdmWideWordParamCopySpell(UDM_WIDEWORD_PARAM *Dst, const UDM_WIDEWORD_PARAM *Src)
244 {
245   Dst->order=  Src->order;
246   Dst->phrpos= Src->phrpos;
247   Dst->phrlen= Src->phrlen;
248   Dst->origin= UDM_WORD_ORIGIN_SPELL;
249 }
250 
251 
252 void
UdmWideWordParamCopySynonym(UDM_WIDEWORD_PARAM * Dst,const UDM_WIDEWORD_PARAM * Src)253 UdmWideWordParamCopySynonym(UDM_WIDEWORD_PARAM *Dst, const UDM_WIDEWORD_PARAM *Src)
254 {
255   Dst->order=  Src->order;
256   Dst->phrpos= Src->phrpos;
257   Dst->phrlen= Src->phrlen;
258   Dst->match_mode=  Src->match_mode;
259   Dst->origin= UDM_WORD_ORIGIN_SYNONYM;
260 }
261 
262 
263 static void
UdmWideWordCopyParam(UDM_WIDEWORD_PARAM * Dst,const UDM_WIDEWORD_PARAM * Src)264 UdmWideWordCopyParam(UDM_WIDEWORD_PARAM *Dst, const UDM_WIDEWORD_PARAM *Src)
265 {
266   UDM_ASSERT(Src->origin >= UDM_WORD_ORIGIN_QUERY &&
267              Src->origin <= UDM_WORD_ORIGIN_COLLATION);
268   /* TODO34: Dst->Param= Src->Param */
269   Dst->order=  Src->order;
270   Dst->order_extra_width= Src->order_extra_width;
271   Dst->count=  Src->count;
272   /* doccount? */
273   Dst->origin= Src->origin;
274   /* weight? */
275   Dst->user_weight= Src->user_weight;
276   Dst->match_mode=  Src->match_mode;
277   Dst->secno=  Src->secno;
278   Dst->phrpos= Src->phrpos;
279   Dst->phrlen= Src->phrlen;
280   Dst->phrwidth= Src->phrwidth;
281 }
282 
283 
284 static void
UdmWideWordCopyWord(UDM_WIDEWORD * Dst,const char * str,size_t length)285 UdmWideWordCopyWord(UDM_WIDEWORD *Dst, const char *str, size_t length)
286 {
287   Dst->Word.length= length;
288   Dst->Word.str= str ? UdmStrdup(str) : NULL;
289 }
290 
291 
292 static void
UdmWideWordCopy(UDM_WIDEWORD * Dst,UDM_WIDEWORD * Src)293 UdmWideWordCopy(UDM_WIDEWORD *Dst, UDM_WIDEWORD *Src)
294 {
295   UdmWideWordCopyParam(&Dst->Param, &Src->Param);
296   UdmWideWordCopyWord(Dst, Src->Word.str, Src->Word.length);
297 }
298 
299 
300 /*
301   Replace the last three words in the list to range designator
302 */
303 udm_rc_t
UdmWideWordListMakeRange(UDM_WIDEWORDLIST * WWL,int beg,int end)304 UdmWideWordListMakeRange(UDM_WIDEWORDLIST *WWL, int beg, int end)
305 {
306   UDM_WIDEWORD *W= &WWL->Word[WWL->nwords - 3];
307   char *word;
308   size_t len;
309   UDM_ASSERT(WWL->nwords >= 3);
310   UDM_ASSERT(WWL->nuniq >= 3);
311   len= 1 + W[0].Word.length + 4 + W[2].Word.length + 1;
312   if (!(word= (char*) UdmMalloc(len + 1)))
313     return UDM_ERROR;
314   udm_snprintf(word, len + 1, "%c%s TO %s%c",
315                beg, W[0].Word.str, W[2].Word.str, end);
316   UdmWideWordFree(&W[0]);
317   UdmWideWordFree(&W[1]);
318   UdmWideWordFree(&W[2]);
319   WWL->nwords-= 2;
320   WWL->nuniq-= 2;
321   W->Word.str= word;
322   W->Word.length= len;
323   W->Param.match_mode= UDM_MATCH_RANGE;
324   return UDM_OK;
325 }
326 
327 
328 
/* Zero-fill the whole word list structure and return the same pointer. */
UDM_WORDLIST *
UdmWordListInit(UDM_WORDLIST * List)
{
  memset(List, 0, sizeof(*List));
  return List;
}
334 
335 
/* Zero-fill the whole wide word list structure and return the same pointer. */
UDM_WIDEWORDLIST *
UdmWideWordListInit(UDM_WIDEWORDLIST * List)
{
  memset(List, 0, sizeof(*List));
  return List;
}
341 
342 static size_t
UdmWideWordListAddInternal(UDM_WIDEWORDLIST * List,UDM_WIDEWORD_PARAM * Param,UDM_CONST_STR w,int for_stat)343 UdmWideWordListAddInternal(UDM_WIDEWORDLIST * List,
344                            UDM_WIDEWORD_PARAM *Param,
345                            UDM_CONST_STR w,
346                            int for_stat)
347 {
348   size_t i;
349 
350   for (i = 0; i < List->nwords; i++)
351   {
352     UDM_WIDEWORD *ListWord= &List->Word[i];
353     if (ListWord->Word.length == w.length &&
354         strcmp(ListWord->Word.str, w.str) == 0)
355     {
356       if (for_stat)
357       {
358         ListWord->Param.count+= Param->count;
359         return List->nwords;
360       }
361       else if (ListWord->Param.order == Param->order)
362       {
363         ListWord->Param.count+= Param->count;
364         if (ListWord->Param.phrpos != Param->phrpos ||
365             ListWord->Param.phrlen != Param->phrlen)
366         {
367           ListWord->Param.phrpos= 0; /* No certain in-phrase position */
368           ListWord->Param.phrlen= 0;
369         }
370         switch (Param->origin)
371         {
372           case UDM_WORD_ORIGIN_QUERY:
373             if (ListWord->Param.origin == UDM_WORD_ORIGIN_STOP)
374               break;
375           case UDM_WORD_ORIGIN_STOP:
376             ListWord->Param.origin= Param->origin;
377             break;
378           case UDM_WORD_ORIGIN_SPELL:
379           case UDM_WORD_ORIGIN_SYNONYM:
380           case UDM_WORD_ORIGIN_SYNONYM_FINAL:
381           case UDM_WORD_ORIGIN_SUGGEST:
382           case UDM_WORD_ORIGIN_COLLATION:
383             break;
384         }
385         ListWord->Param.order= Param->order;
386         return List->nwords;
387       }
388     }
389   }
390 
391   /* Realloc memory */
392   List->Word= (UDM_WIDEWORD*)UdmRealloc(List->Word,sizeof(*(List->Word))*(List->nwords+1));
393   UdmWideWordInit(&List->Word[List->nwords]);
394 
395   /* Copy data */
396   UdmWideWordCopyParam(&List->Word[List->nwords].Param, Param);
397   UdmWideWordCopyWord(&List->Word[List->nwords], w.str, w.length);
398 
399   List->nwords++;
400   return(List->nwords);
401 }
402 
403 
/*
  Add a wide word to the list, merging only with an entry that has
  the same text and the same order (for_stat=0).
  Returns the value of UdmWideWordListAddInternal.
*/
size_t
UdmWideWordListAdd(UDM_WIDEWORDLIST * List, UDM_WIDEWORD * Word)
{
  UDM_CONST_STR text;

  text.length= Word->Word.length;
  text.str= Word->Word.str;
  return UdmWideWordListAddInternal(List, &Word->Param, text, 0);
}
411 
412 
413 size_t
UdmWideWordListAddLike(UDM_WIDEWORDLIST * WWList,UDM_WIDEWORD_PARAM * param,const char * word)414 UdmWideWordListAddLike(UDM_WIDEWORDLIST *WWList,
415                        UDM_WIDEWORD_PARAM *param, const char *word)
416 {
417   UDM_CONST_STR cstr;
418   cstr.str= word;
419   cstr.length= strlen(word);
420   return UdmWideWordListAddInternal(WWList, param, cstr, 0);
421 }
422 
423 
424 size_t
UdmWideWordListAddLikeConstStr(UDM_WIDEWORDLIST * WWList,UDM_WIDEWORD_PARAM * param,UDM_CONST_STR str)425 UdmWideWordListAddLikeConstStr(UDM_WIDEWORDLIST *WWList,
426                                UDM_WIDEWORD_PARAM *param,
427                                UDM_CONST_STR str)
428 {
429   return UdmWideWordListAddInternal(WWList, param, str, 0);
430 }
431 
432 
/*
  Add a wide word to the list in statistics mode (for_stat=1): it is
  merged with any entry having the same text, regardless of order.
  Returns the value of UdmWideWordListAddInternal.
*/
size_t
UdmWideWordListAddForStat(UDM_WIDEWORDLIST * List, UDM_WIDEWORD * Word)
{
  UDM_CONST_STR text;

  text.length= Word->Word.length;
  text.str= Word->Word.str;
  return UdmWideWordListAddInternal(List, &Word->Param, text, 1);
}
440 
441 
/*
  Free every word of the list and the Word array, then re-initialise
  the list with UdmWideWordListInit.
*/
void
UdmWideWordListFree(UDM_WIDEWORDLIST * List)
{
  size_t idx= 0;

  while (idx < List->nwords)
  {
    UdmWideWordFree(&List->Word[idx]);
    idx++;
  }
  UDM_FREE(List->Word);
  UdmWideWordListInit(List);
}
450 
451 
452 udm_rc_t
UdmWideWordListCopy(UDM_WIDEWORDLIST * Dst,UDM_WIDEWORDLIST * Src)453 UdmWideWordListCopy(UDM_WIDEWORDLIST *Dst, UDM_WIDEWORDLIST *Src)
454 {
455   size_t i;
456   *Dst= *Src;
457   Dst->Word= (UDM_WIDEWORD*) UdmMalloc(sizeof(*(Src->Word))*(Src->nwords));
458   for (i= 0; i < Src->nwords; i++)
459     UdmWideWordCopy(&Dst->Word[i], &Src->Word[i]);
460   return UDM_OK;
461 }
462 
463 
wwcmp(const UDM_WIDEWORD * w1,const UDM_WIDEWORD * w2)464 static int wwcmp(const UDM_WIDEWORD *w1, const UDM_WIDEWORD *w2)
465 {
466   int rc;
467   if ((rc= strcmp(w1->Word.str, w2->Word.str)))
468     return rc;
469   return (int) w1->Param.secno - (int) w2->Param.secno;
470 }
471 
472 
/*
  Sort the words of the list with wwcmp (text, then section number).
  An empty list is left untouched.
*/
void
UdmWideWordListSort(UDM_WIDEWORDLIST *L)
{
  if (!L->nwords)
    return;
  UdmSort(L->Word, L->nwords, sizeof(UDM_WIDEWORD), (udm_qsort_cmp) wwcmp);
}
478 
479 
/*
  Ordering function for UDM_WORD items: by word text (strcmp), then
  by the difference of the section numbers.
*/
static int
wlcmp(UDM_WORD *w1, UDM_WORD *w2)
{
  int by_text= strcmp(w1->word, w2->word);

  if (by_text != 0)
    return by_text;
  return (int) w1->coord.secno - (int) w2->coord.secno;
}
487 
488 
489 udm_rc_t
UdmWordListSaveSectionSize(UDM_WORDLIST * Words)490 UdmWordListSaveSectionSize(UDM_WORDLIST *Words)
491 {
492   size_t i= Words->nwords;
493   int prev_sec= 0;
494   const char *prev_word= "#non-existing";
495   if (i)
496     UdmSort(Words->Word, i, sizeof(UDM_WORD), (udm_qsort_cmp)wlcmp);
497   while (i--)
498   {
499     /*
500        This assignement must be inside the loop, since Word could be
501        realloced by AddOneWord
502     */
503     UDM_WORD *W= &Words->Word[i];
504     if (W->coord.secno != prev_sec || strcmp(W->word, prev_word))
505     {
506       udm_rc_t rc;
507       prev_word= W->word;
508       prev_sec= W->coord.secno;
509       if (UDM_OK != (rc= UdmWordListAddEx(Words, prev_word, prev_sec,
510                                           Words->wordpos[prev_sec] + 1, 1)))
511         return rc;
512     }
513   }
514   return UDM_OK;
515 }
516