1 /* Copyright (C) 2000-2015 Lavtech.com corp. All rights reserved.
2 
3    This program is free software; you can redistribute it and/or modify
4    it under the terms of the GNU General Public License as published by
5    the Free Software Foundation; either version 2 of the License, or
6    (at your option) any later version.
7 
8    This program is distributed in the hope that it will be useful,
9    but WITHOUT ANY WARRANTY; without even the implied warranty of
10    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11    GNU General Public License for more details.
12 
13    You should have received a copy of the GNU General Public License
14    along with this program; if not, write to the Free Software
15    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
16 */
17 
18 #include "udm_config.h"
19 
20 #include <stdio.h>
21 #include <stdlib.h>
22 #include <string.h>
23 #include <sys/types.h>
24 
25 #include "udm_common.h"
26 #include "udm_env.h"
27 #include "udm_utils.h"
28 #include "udm_unicode.h"
29 #include "udm_unidata.h"
30 #include "udm_word.h"
31 #include "udm_synonym.h"
32 #include "udm_conf.h"
33 
34 
/*
  Put a synonym list into its empty state by zero-filling
  the whole structure.
*/
void UdmSynonymListInit(UDM_SYNONYMLIST * List)
{
  memset(List, 0, sizeof(List[0]));
}
39 
40 
/*
  Returns the number of separators in a complex word.
  Needed for complex synonyms.

  A separator is a run of one or more consecutive space characters,
  so "aaa bbb" and "aaa   bbb" both yield 1. Only ' ' is treated
  as a separator character. Leading and trailing runs are counted too,
  which keeps the result unchanged for every string that has no
  consecutive spaces.
*/
size_t
UdmMultiWordPhraseLength(const char *s)
{
  size_t res= 0;
  int in_separator= 0;  /* non-zero while inside a run of spaces */

  for ( ; *s; s++)
  {
    if (*s != ' ')
    {
      in_separator= 0;
      continue;
    }
    /* Count only the first space of each run */
    if (!in_separator)
      res++;
    in_separator= 1;
  }
  return res;
}
58 
59 static void
AddSynonym(UDM_SYNONYMLIST * Syn,const char * first,const char * second,udm_wordorigin_t origin)60 AddSynonym(UDM_SYNONYMLIST *Syn, const char *first, const char *second,
61            udm_wordorigin_t origin)
62 {
63   UDM_SYNONYM *trg= &Syn->Synonym[Syn->nsynonyms];
64   size_t phrase_length= UdmMultiWordPhraseLength(first);
65   if (Syn->max_phrase_length < phrase_length)
66     Syn->max_phrase_length= phrase_length;
67   trg->p= strdup(first);
68   trg->s= strdup(second);
69   /*
70     We don't allow multi-word synonyms to get into loop
71     at this point - for performance purposes.
72   */
73   trg->origin= phrase_length ? UDM_WORD_ORIGIN_SYNONYM_FINAL : origin;
74   Syn->nsynonyms++;
75 }
76 
77 
/*
  Values of the "Mode:" command of a synonym file, as used by
  UdmSynonymListLoad() for a line with words w0 w1 ... wN.
*/
/* Only w0 -> wK pairs are added (selected by "oneway") */
#define UDM_SYN_MODE_ONEWAY    0
/* Both w0 -> wK and wK -> w0 are added ("reverse" or "roundtrip"; the default) */
#define UDM_SYN_MODE_ROUNDTRIP 1
/* For every pair I < J only wJ -> wI is added (selected by "return") */
#define UDM_SYN_MODE_RETURN    2
81 
82 
83 UDM_API(udm_rc_t)
UdmSynonymListLoad(UDM_ENV * Env,const char * filename)84 UdmSynonymListLoad(UDM_ENV * Env,const char * filename)
85 {
86   FILE         *f;
87   char         str[512];
88   char         lang[64]="";
89   UDM_CHARSET  *cs=NULL;
90   UDM_CONV     file_lcs;
91   int          mode= UDM_SYN_MODE_ROUNDTRIP;
92   udm_wordorigin_t origin= UDM_WORD_ORIGIN_SYNONYM;
93   UDM_UNIDATA  *unidata= Env->unidata;
94   UDM_SYNONYMLIST Synonyms;
95   int          lineno= 0;
96 
97   UdmSynonymListInit(&Synonyms);
98 
99   if(!(f=fopen(filename,"r")))
100   {
101     udm_snprintf(Env->errstr,sizeof(Env->errstr)-1,"Can't open synonyms file '%s'",filename);
102     return UDM_ERROR;
103   }
104 
105   for (lineno= 1; fgets(str,sizeof(str),f); lineno++)
106   {
107     if(str[0]=='#'||str[0]==' '||str[0]=='\t'||str[0]=='\r'||str[0]=='\n')continue;
108 
109     if(!strncmp(str,"Charset:",8))
110     {
111       char * lasttok;
112       char * charset;
113 
114       if((charset = udm_strtok_r(str + 8, " \t\n\r", &lasttok)))
115       {
116         if(!(cs=UdmGetCharSet(charset)))
117         {
118           udm_snprintf(Env->errstr, sizeof(Env->errstr),
119                        "Unknown charset '%s' in synonyms file '%s'", charset, filename);
120           fclose(f);
121           return UDM_ERROR;
122         }
123         UdmConvInit(&file_lcs,cs,Env->lcs);
124       }
125     }
126     else if(!strncmp(str,"Language:",9))
127     {
128       char *lasttok, *l;
129       if((l = udm_strtok_r(str + 9, " \t\n\r", &lasttok)))
130         strncpy(lang,l,sizeof(lang)-1);
131     }
132     else if (!strncasecmp(str, "Mode:", 5))
133     {
134       char *lasttok, *l;
135       for (l= udm_strtok_r(str + 5, " ,\t\n\r", &lasttok) ; l;
136            l= udm_strtok_r(NULL, " ,\t\n\r", &lasttok))
137       {
138         if (!strcasecmp(l, "oneway"))
139         {
140           mode= UDM_SYN_MODE_ONEWAY;
141           continue;
142         }
143         else if (!strcasecmp(l, "reverse") || !strcasecmp(l, "roundtrip"))
144         {
145           mode= UDM_SYN_MODE_ROUNDTRIP;
146           continue;
147         }
148         else if (!strcasecmp(l, "return"))
149         {
150           mode= UDM_SYN_MODE_RETURN;
151           continue;
152         }
153         else if (!strcasecmp(l, "recursive"))
154         {
155           origin= UDM_WORD_ORIGIN_SYNONYM;
156           continue;
157         }
158         else if (!strcasecmp(l, "final"))
159         {
160           origin= UDM_WORD_ORIGIN_SYNONYM_FINAL;
161           continue;
162         }
163         else
164         {
165           udm_snprintf(Env->errstr, sizeof(Env->errstr),
166                        "Bad Mode command in synonym file %s:%d",
167                        filename, lineno);
168           fclose(f);
169           return UDM_ERROR;
170         }
171       }
172     }
173     else
174     {
175       const char *av[255];
176       char tmp[512];
177       size_t   ac, i, j, len;
178 
179       if(!cs)
180       {
181         udm_snprintf(Env->errstr,sizeof(Env->errstr)-1,
182                      "No Charset command in synonyms file '%s'",filename);
183         fclose(f);
184         return UDM_ERROR;
185       }
186       if(!lang[0])
187       {
188         udm_snprintf(Env->errstr,sizeof(Env->errstr)-1,
189                      "No Language command in synonyms file '%s'",filename);
190         fclose(f);
191         return UDM_ERROR;
192       }
193 
194       len= UdmConvHTMLNonASCII(&file_lcs, tmp, sizeof(tmp), str, strlen(str));
195       tmp[len]= '\0';
196       UdmStrToLower(unidata, cs, tmp, len);
197 
198       if ((ac= UdmGetArgs(tmp, av, 255)) < 2)
199         continue;
200 
201       for (i = 0; i < (mode == UDM_SYN_MODE_RETURN ? ac - 1 : 1) ; i++)
202       {
203         for (j = i + 1; j < ac; j++)
204         {
205           if((Synonyms.nsynonyms + 1) >= Synonyms.msynonyms)
206           {
207             Synonyms.msynonyms+= 64;
208             {
209               size_t nbytes= sizeof(UDM_SYNONYM)*Synonyms.msynonyms;
210               Synonyms.Synonym= (UDM_SYNONYM*)UdmRealloc(Synonyms.Synonym, nbytes);
211             }
212           }
213 
214           /* Add direct order */
215           if (mode == UDM_SYN_MODE_ONEWAY || mode == UDM_SYN_MODE_ROUNDTRIP)
216             AddSynonym(&Synonyms, av[i], av[j], origin);
217 
218           /* Add reverse order */
219           if (mode == UDM_SYN_MODE_RETURN || mode == UDM_SYN_MODE_ROUNDTRIP)
220             AddSynonym(&Synonyms, av[j], av[i], origin);
221         }
222       }
223     }
224   }
225   fclose(f);
226   udm_snprintf(Synonyms.fname, sizeof(Synonyms.fname), "%s", filename);
227   udm_snprintf(Synonyms.cset, sizeof(Synonyms.cset), "%s", cs->name);
228   udm_snprintf(Synonyms.lang, sizeof(Synonyms.lang), "%s", lang);
229   UdmSynonymListListAdd(&Env->Synonym, &Synonyms);
230   return UDM_OK;
231 }
232 
/*
  Release all strings of a synonym list and the array itself.
  The structure pointed to by List is not freed. The counters are
  reset, so the list is empty afterwards and calling this function
  again is harmless.
*/
void UdmSynonymListFree(UDM_SYNONYMLIST * List)
{
  size_t i;

  for(i=0;i<List->nsynonyms;i++)
  {
    UdmFree(List->Synonym[i].p);
    UdmFree(List->Synonym[i].s);
  }
  UDM_FREE(List->Synonym);
  /* Do not leave a stale count next to a released array */
  List->Synonym= NULL;
  List->nsynonyms= 0;
  List->msynonyms= 0;
}
244 
cmpsyn(const void * v1,const void * v2)245 static int cmpsyn(const void * v1,const void * v2)
246 {
247   const char *s1= ((const UDM_SYNONYM*)v1)->p;
248   const char *s2= ((const UDM_SYNONYM*)v2)->p;
249   return strcmp(s1, s2);
250 }
251 
252 UDM_API(void)
UdmSynonymListSort(UDM_SYNONYMLIST * List)253 UdmSynonymListSort(UDM_SYNONYMLIST * List)
254 {
255   if(List->nsynonyms)
256     UdmSort(List->Synonym,List->nsynonyms,sizeof(UDM_SYNONYM),&cmpsyn);
257 }
258 
259 
260 static void
UdmWideWordListAddSynonym(UDM_WIDEWORDLIST * Res,const UDM_WIDEWORD_PARAM * param,const UDM_SYNONYM * syn)261 UdmWideWordListAddSynonym(UDM_WIDEWORDLIST *Res,
262                           const UDM_WIDEWORD_PARAM *param,
263                           const UDM_SYNONYM *syn)
264 {
265   UDM_WIDEWORD_PARAM tmp= *param;
266   tmp.origin= syn->origin;
267   UdmWideWordListAddLike(Res, &tmp, syn->s);
268 }
269 
270 
271 
272 /*
273   Traverse through a synonym list starting
274   from "medium" towards the beginning or the
275   end of the list and adding all synonyms
276   equal to "word".
277   "medium" must be a valid pointer to some word
278   in the list.
279 */
280 static void
UdmWideWordListAddSynonymIterate(UDM_WIDEWORDLIST * Res,const UDM_SYNONYMLIST * List,const UDM_SYNONYM * medium,const UDM_WIDEWORD_PARAM * param,const char * word,int direction)281 UdmWideWordListAddSynonymIterate(UDM_WIDEWORDLIST *Res,
282                                  const UDM_SYNONYMLIST *List,
283                                  const UDM_SYNONYM *medium,
284                                  const UDM_WIDEWORD_PARAM *param,
285                                  const char *word,
286                                  int direction)
287 {
288   UDM_SYNONYM *first= List->Synonym;
289   UDM_SYNONYM *last= List->Synonym + List->nsynonyms;
290   for ( ; medium >= first && medium < last; medium+= direction)
291   {
292     if (strcmp(word, medium->p))
293       break;
294     UdmWideWordListAddSynonym(Res, param, medium);
295   }
296 }
297 
298 
299 UDM_WIDEWORDLIST *
UdmSynonymListFind(UDM_WIDEWORDLIST * Res,const UDM_SYNONYMLIST * List,const UDM_WIDEWORD * wword)300 UdmSynonymListFind(UDM_WIDEWORDLIST *Res,
301                    const UDM_SYNONYMLIST *List,
302                    const UDM_WIDEWORD *wword)
303 {
304   UDM_SYNONYM syn, *res;
305 
306   /* Quickly skip empty lists, and skip final synonyms */
307   if (!List->nsynonyms || wword->Param.origin == UDM_WORD_ORIGIN_SYNONYM_FINAL)
308     return NULL;
309 
310   syn.p= wword->Word.str;
311 
312   if((res= (UDM_SYNONYM*) UdmBSearch(&syn, List->Synonym, List->nsynonyms,
313                                      sizeof(UDM_SYNONYM), &cmpsyn)))
314   {
315     size_t nnorm,i;
316 
317     /* Find first and last synonym */
318     UdmWideWordListAddSynonymIterate(Res, List, res, &wword->Param, wword->Word.str, -1);
319     UdmWideWordListAddSynonymIterate(Res, List, res + 1, &wword->Param, wword->Word.str, +1);
320 
321     /* Now find each of them in reverse order */
322     nnorm=Res->nwords;
323     for(i=0; i < nnorm; i++)
324     {
325       UDM_WIDEWORD *ww= &Res->Word[i];
326       /* Skip final synonyms */
327       if (ww->Param.origin == UDM_WORD_ORIGIN_SYNONYM_FINAL)
328         continue;
329       syn.p= ww->Word.str;
330       res= (UDM_SYNONYM*) UdmBSearch(&syn, List->Synonym, List->nsynonyms,
331                                      sizeof(UDM_SYNONYM),&cmpsyn);
332 
333       if(res)
334       {
335         /* Find first and last synonym */
336         UdmWideWordListAddSynonymIterate(Res, List, res, &wword->Param, syn.p, -1);
337         /* Note, "ww" is not valid here anymore, realloc could happen */
338         UdmWideWordListAddSynonymIterate(Res, List, res + 1, &wword->Param, syn.p, +1);
339       }
340     }
341   }
342   return Res;
343 }
344 
345 
346 void
UdmSynonymListListInit(UDM_SYNONYMLISTLIST * Lst)347 UdmSynonymListListInit(UDM_SYNONYMLISTLIST *Lst)
348 {
349   bzero((void*)Lst, sizeof(Lst[0]));
350 }
351 
352 
353 void
UdmSynonymListListFree(UDM_SYNONYMLISTLIST * Lst)354 UdmSynonymListListFree(UDM_SYNONYMLISTLIST *Lst)
355 {
356   size_t i;
357   for (i= 0; i < Lst->nitems; i++)
358     UdmSynonymListFree(&Lst->Item[i]);
359   UDM_FREE(Lst->Item);
360 }
361 
362 
363 udm_rc_t
UdmSynonymListListAdd(UDM_SYNONYMLISTLIST * Lst,UDM_SYNONYMLIST * Item)364 UdmSynonymListListAdd(UDM_SYNONYMLISTLIST *Lst, UDM_SYNONYMLIST *Item)
365 {
366   size_t nbytes= (Lst->nitems + 1) * sizeof(UDM_SYNONYMLIST);
367   if (!(Lst->Item= (UDM_SYNONYMLIST*) UdmRealloc(Lst->Item, nbytes)))
368     return UDM_ERROR;
369   Lst->Item[Lst->nitems++]= Item[0];
370   return UDM_OK;
371 }
372 
373 
374 UDM_API(void)
UdmSynonymListListSortItems(UDM_SYNONYMLISTLIST * List)375 UdmSynonymListListSortItems(UDM_SYNONYMLISTLIST *List)
376 {
377   size_t i;
378   for (i= 0; i < List->nitems; i++)
379     UdmSynonymListSort(&List->Item[i]);
380 }
381 
382 
383 UDM_WIDEWORDLIST*
UdmSynonymListListFind(const UDM_SYNONYMLISTLIST * SLL,UDM_WIDEWORD * word)384 UdmSynonymListListFind(const UDM_SYNONYMLISTLIST *SLL, UDM_WIDEWORD *word)
385 {
386   size_t i;
387   UDM_WIDEWORDLIST *Res= (UDM_WIDEWORDLIST *) UdmMalloc(sizeof(*Res));
388   UdmWideWordListInit(Res);
389 
390   UDM_ASSERT(word->Param.origin >= UDM_WORD_ORIGIN_QUERY &&
391              word->Param.origin <= UDM_WORD_ORIGIN_COLLATION);
392 
393   for (i= 0; i < SLL->nitems; i++)
394     UdmSynonymListFind(Res, &SLL->Item[i], word);
395 
396   if (!Res->nwords)
397   {
398     UdmWideWordListFree(Res);
399     UdmFree(Res);
400     return NULL;
401   }
402   return Res;
403 }
404