1 /* Copyright (C) 2000-2015 Lavtech.com corp. All rights reserved.
2
3 This program is free software; you can redistribute it and/or modify
4 it under the terms of the GNU General Public License as published by
5 the Free Software Foundation; either version 2 of the License, or
6 (at your option) any later version.
7
8 This program is distributed in the hope that it will be useful,
9 but WITHOUT ANY WARRANTY; without even the implied warranty of
10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 GNU General Public License for more details.
12
13 You should have received a copy of the GNU General Public License
14 along with this program; if not, write to the Free Software
15 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
16 */
17
18 #include "udm_config.h"
19
20 #include <stdio.h>
21 #include <stdlib.h>
22 #include <string.h>
23 #include <sys/types.h>
24
25 #include "udm_common.h"
26 #include "udm_env.h"
27 #include "udm_utils.h"
28 #include "udm_unicode.h"
29 #include "udm_unidata.h"
30 #include "udm_word.h"
31 #include "udm_synonym.h"
32 #include "udm_conf.h"
33
34
/*
  Reset a synonym list to the all-zero state:
  every byte of *List is cleared.
*/
void UdmSynonymListInit(UDM_SYNONYMLIST * List)
{
  memset(List, 0, sizeof(List[0]));
}
39
40
/*
  Count the word separators (space characters, ' ') in a string.
  Needed for multi-word ("complex") synonyms: a result of 0 means
  a single word, N means a phrase written with N separators.

  TODO: every space is counted individually, so a phrase written
  with consecutive spaces between words gets an inflated count.
*/
size_t
UdmMultiWordPhraseLength(const char *s)
{
  size_t nspaces= 0;
  const char *sp;

  /* Jump from one space to the next until no more are found */
  for (sp= strchr(s, ' '); sp != NULL; sp= strchr(sp + 1, ' '))
    nspaces++;

  return nspaces;
}
58
59 static void
AddSynonym(UDM_SYNONYMLIST * Syn,const char * first,const char * second,udm_wordorigin_t origin)60 AddSynonym(UDM_SYNONYMLIST *Syn, const char *first, const char *second,
61 udm_wordorigin_t origin)
62 {
63 UDM_SYNONYM *trg= &Syn->Synonym[Syn->nsynonyms];
64 size_t phrase_length= UdmMultiWordPhraseLength(first);
65 if (Syn->max_phrase_length < phrase_length)
66 Syn->max_phrase_length= phrase_length;
67 trg->p= strdup(first);
68 trg->s= strdup(second);
69 /*
70 We don't allow multi-word synonyms to get into loop
71 at this point - for performance purposes.
72 */
73 trg->origin= phrase_length ? UDM_WORD_ORIGIN_SYNONYM_FINAL : origin;
74 Syn->nsynonyms++;
75 }
76
77
78 #define UDM_SYN_MODE_ONEWAY 0
79 #define UDM_SYN_MODE_ROUNDTRIP 1
80 #define UDM_SYN_MODE_RETURN 2
81
82
83 UDM_API(udm_rc_t)
UdmSynonymListLoad(UDM_ENV * Env,const char * filename)84 UdmSynonymListLoad(UDM_ENV * Env,const char * filename)
85 {
86 FILE *f;
87 char str[512];
88 char lang[64]="";
89 UDM_CHARSET *cs=NULL;
90 UDM_CONV file_lcs;
91 int mode= UDM_SYN_MODE_ROUNDTRIP;
92 udm_wordorigin_t origin= UDM_WORD_ORIGIN_SYNONYM;
93 UDM_UNIDATA *unidata= Env->unidata;
94 UDM_SYNONYMLIST Synonyms;
95 int lineno= 0;
96
97 UdmSynonymListInit(&Synonyms);
98
99 if(!(f=fopen(filename,"r")))
100 {
101 udm_snprintf(Env->errstr,sizeof(Env->errstr)-1,"Can't open synonyms file '%s'",filename);
102 return UDM_ERROR;
103 }
104
105 for (lineno= 1; fgets(str,sizeof(str),f); lineno++)
106 {
107 if(str[0]=='#'||str[0]==' '||str[0]=='\t'||str[0]=='\r'||str[0]=='\n')continue;
108
109 if(!strncmp(str,"Charset:",8))
110 {
111 char * lasttok;
112 char * charset;
113
114 if((charset = udm_strtok_r(str + 8, " \t\n\r", &lasttok)))
115 {
116 if(!(cs=UdmGetCharSet(charset)))
117 {
118 udm_snprintf(Env->errstr, sizeof(Env->errstr),
119 "Unknown charset '%s' in synonyms file '%s'", charset, filename);
120 fclose(f);
121 return UDM_ERROR;
122 }
123 UdmConvInit(&file_lcs,cs,Env->lcs);
124 }
125 }
126 else if(!strncmp(str,"Language:",9))
127 {
128 char *lasttok, *l;
129 if((l = udm_strtok_r(str + 9, " \t\n\r", &lasttok)))
130 strncpy(lang,l,sizeof(lang)-1);
131 }
132 else if (!strncasecmp(str, "Mode:", 5))
133 {
134 char *lasttok, *l;
135 for (l= udm_strtok_r(str + 5, " ,\t\n\r", &lasttok) ; l;
136 l= udm_strtok_r(NULL, " ,\t\n\r", &lasttok))
137 {
138 if (!strcasecmp(l, "oneway"))
139 {
140 mode= UDM_SYN_MODE_ONEWAY;
141 continue;
142 }
143 else if (!strcasecmp(l, "reverse") || !strcasecmp(l, "roundtrip"))
144 {
145 mode= UDM_SYN_MODE_ROUNDTRIP;
146 continue;
147 }
148 else if (!strcasecmp(l, "return"))
149 {
150 mode= UDM_SYN_MODE_RETURN;
151 continue;
152 }
153 else if (!strcasecmp(l, "recursive"))
154 {
155 origin= UDM_WORD_ORIGIN_SYNONYM;
156 continue;
157 }
158 else if (!strcasecmp(l, "final"))
159 {
160 origin= UDM_WORD_ORIGIN_SYNONYM_FINAL;
161 continue;
162 }
163 else
164 {
165 udm_snprintf(Env->errstr, sizeof(Env->errstr),
166 "Bad Mode command in synonym file %s:%d",
167 filename, lineno);
168 fclose(f);
169 return UDM_ERROR;
170 }
171 }
172 }
173 else
174 {
175 const char *av[255];
176 char tmp[512];
177 size_t ac, i, j, len;
178
179 if(!cs)
180 {
181 udm_snprintf(Env->errstr,sizeof(Env->errstr)-1,
182 "No Charset command in synonyms file '%s'",filename);
183 fclose(f);
184 return UDM_ERROR;
185 }
186 if(!lang[0])
187 {
188 udm_snprintf(Env->errstr,sizeof(Env->errstr)-1,
189 "No Language command in synonyms file '%s'",filename);
190 fclose(f);
191 return UDM_ERROR;
192 }
193
194 len= UdmConvHTMLNonASCII(&file_lcs, tmp, sizeof(tmp), str, strlen(str));
195 tmp[len]= '\0';
196 UdmStrToLower(unidata, cs, tmp, len);
197
198 if ((ac= UdmGetArgs(tmp, av, 255)) < 2)
199 continue;
200
201 for (i = 0; i < (mode == UDM_SYN_MODE_RETURN ? ac - 1 : 1) ; i++)
202 {
203 for (j = i + 1; j < ac; j++)
204 {
205 if((Synonyms.nsynonyms + 1) >= Synonyms.msynonyms)
206 {
207 Synonyms.msynonyms+= 64;
208 {
209 size_t nbytes= sizeof(UDM_SYNONYM)*Synonyms.msynonyms;
210 Synonyms.Synonym= (UDM_SYNONYM*)UdmRealloc(Synonyms.Synonym, nbytes);
211 }
212 }
213
214 /* Add direct order */
215 if (mode == UDM_SYN_MODE_ONEWAY || mode == UDM_SYN_MODE_ROUNDTRIP)
216 AddSynonym(&Synonyms, av[i], av[j], origin);
217
218 /* Add reverse order */
219 if (mode == UDM_SYN_MODE_RETURN || mode == UDM_SYN_MODE_ROUNDTRIP)
220 AddSynonym(&Synonyms, av[j], av[i], origin);
221 }
222 }
223 }
224 }
225 fclose(f);
226 udm_snprintf(Synonyms.fname, sizeof(Synonyms.fname), "%s", filename);
227 udm_snprintf(Synonyms.cset, sizeof(Synonyms.cset), "%s", cs->name);
228 udm_snprintf(Synonyms.lang, sizeof(Synonyms.lang), "%s", lang);
229 UdmSynonymListListAdd(&Env->Synonym, &Synonyms);
230 return UDM_OK;
231 }
232
/*
  Release all strings owned by the list and the item array itself.
  The List structure is not freed.

  The counters are reset afterwards, so that a repeated call
  or a later traversal does not touch the released items.
*/
void UdmSynonymListFree(UDM_SYNONYMLIST * List)
{
  size_t i;

  for(i=0;i<List->nsynonyms;i++)
  {
    UdmFree(List->Synonym[i].p);
    UdmFree(List->Synonym[i].s);
  }
  UDM_FREE(List->Synonym);
  List->nsynonyms= 0;
  List->msynonyms= 0;
}
244
cmpsyn(const void * v1,const void * v2)245 static int cmpsyn(const void * v1,const void * v2)
246 {
247 const char *s1= ((const UDM_SYNONYM*)v1)->p;
248 const char *s2= ((const UDM_SYNONYM*)v2)->p;
249 return strcmp(s1, s2);
250 }
251
252 UDM_API(void)
UdmSynonymListSort(UDM_SYNONYMLIST * List)253 UdmSynonymListSort(UDM_SYNONYMLIST * List)
254 {
255 if(List->nsynonyms)
256 UdmSort(List->Synonym,List->nsynonyms,sizeof(UDM_SYNONYM),&cmpsyn);
257 }
258
259
260 static void
UdmWideWordListAddSynonym(UDM_WIDEWORDLIST * Res,const UDM_WIDEWORD_PARAM * param,const UDM_SYNONYM * syn)261 UdmWideWordListAddSynonym(UDM_WIDEWORDLIST *Res,
262 const UDM_WIDEWORD_PARAM *param,
263 const UDM_SYNONYM *syn)
264 {
265 UDM_WIDEWORD_PARAM tmp= *param;
266 tmp.origin= syn->origin;
267 UdmWideWordListAddLike(Res, &tmp, syn->s);
268 }
269
270
271
272 /*
273 Traverse through a synonym list starting
274 from "medium" towards the beginning or the
275 end of the list and adding all synonyms
276 equal to "word".
277 "medium" must be a valid pointer to some word
278 in the list.
279 */
280 static void
UdmWideWordListAddSynonymIterate(UDM_WIDEWORDLIST * Res,const UDM_SYNONYMLIST * List,const UDM_SYNONYM * medium,const UDM_WIDEWORD_PARAM * param,const char * word,int direction)281 UdmWideWordListAddSynonymIterate(UDM_WIDEWORDLIST *Res,
282 const UDM_SYNONYMLIST *List,
283 const UDM_SYNONYM *medium,
284 const UDM_WIDEWORD_PARAM *param,
285 const char *word,
286 int direction)
287 {
288 UDM_SYNONYM *first= List->Synonym;
289 UDM_SYNONYM *last= List->Synonym + List->nsynonyms;
290 for ( ; medium >= first && medium < last; medium+= direction)
291 {
292 if (strcmp(word, medium->p))
293 break;
294 UdmWideWordListAddSynonym(Res, param, medium);
295 }
296 }
297
298
299 UDM_WIDEWORDLIST *
UdmSynonymListFind(UDM_WIDEWORDLIST * Res,const UDM_SYNONYMLIST * List,const UDM_WIDEWORD * wword)300 UdmSynonymListFind(UDM_WIDEWORDLIST *Res,
301 const UDM_SYNONYMLIST *List,
302 const UDM_WIDEWORD *wword)
303 {
304 UDM_SYNONYM syn, *res;
305
306 /* Quickly skip empty lists, and skip final synonyms */
307 if (!List->nsynonyms || wword->Param.origin == UDM_WORD_ORIGIN_SYNONYM_FINAL)
308 return NULL;
309
310 syn.p= wword->Word.str;
311
312 if((res= (UDM_SYNONYM*) UdmBSearch(&syn, List->Synonym, List->nsynonyms,
313 sizeof(UDM_SYNONYM), &cmpsyn)))
314 {
315 size_t nnorm,i;
316
317 /* Find first and last synonym */
318 UdmWideWordListAddSynonymIterate(Res, List, res, &wword->Param, wword->Word.str, -1);
319 UdmWideWordListAddSynonymIterate(Res, List, res + 1, &wword->Param, wword->Word.str, +1);
320
321 /* Now find each of them in reverse order */
322 nnorm=Res->nwords;
323 for(i=0; i < nnorm; i++)
324 {
325 UDM_WIDEWORD *ww= &Res->Word[i];
326 /* Skip final synonyms */
327 if (ww->Param.origin == UDM_WORD_ORIGIN_SYNONYM_FINAL)
328 continue;
329 syn.p= ww->Word.str;
330 res= (UDM_SYNONYM*) UdmBSearch(&syn, List->Synonym, List->nsynonyms,
331 sizeof(UDM_SYNONYM),&cmpsyn);
332
333 if(res)
334 {
335 /* Find first and last synonym */
336 UdmWideWordListAddSynonymIterate(Res, List, res, &wword->Param, syn.p, -1);
337 /* Note, "ww" is not valid here anymore, realloc could happen */
338 UdmWideWordListAddSynonymIterate(Res, List, res + 1, &wword->Param, syn.p, +1);
339 }
340 }
341 }
342 return Res;
343 }
344
345
346 void
UdmSynonymListListInit(UDM_SYNONYMLISTLIST * Lst)347 UdmSynonymListListInit(UDM_SYNONYMLISTLIST *Lst)
348 {
349 bzero((void*)Lst, sizeof(Lst[0]));
350 }
351
352
353 void
UdmSynonymListListFree(UDM_SYNONYMLISTLIST * Lst)354 UdmSynonymListListFree(UDM_SYNONYMLISTLIST *Lst)
355 {
356 size_t i;
357 for (i= 0; i < Lst->nitems; i++)
358 UdmSynonymListFree(&Lst->Item[i]);
359 UDM_FREE(Lst->Item);
360 }
361
362
363 udm_rc_t
UdmSynonymListListAdd(UDM_SYNONYMLISTLIST * Lst,UDM_SYNONYMLIST * Item)364 UdmSynonymListListAdd(UDM_SYNONYMLISTLIST *Lst, UDM_SYNONYMLIST *Item)
365 {
366 size_t nbytes= (Lst->nitems + 1) * sizeof(UDM_SYNONYMLIST);
367 if (!(Lst->Item= (UDM_SYNONYMLIST*) UdmRealloc(Lst->Item, nbytes)))
368 return UDM_ERROR;
369 Lst->Item[Lst->nitems++]= Item[0];
370 return UDM_OK;
371 }
372
373
374 UDM_API(void)
UdmSynonymListListSortItems(UDM_SYNONYMLISTLIST * List)375 UdmSynonymListListSortItems(UDM_SYNONYMLISTLIST *List)
376 {
377 size_t i;
378 for (i= 0; i < List->nitems; i++)
379 UdmSynonymListSort(&List->Item[i]);
380 }
381
382
383 UDM_WIDEWORDLIST*
UdmSynonymListListFind(const UDM_SYNONYMLISTLIST * SLL,UDM_WIDEWORD * word)384 UdmSynonymListListFind(const UDM_SYNONYMLISTLIST *SLL, UDM_WIDEWORD *word)
385 {
386 size_t i;
387 UDM_WIDEWORDLIST *Res= (UDM_WIDEWORDLIST *) UdmMalloc(sizeof(*Res));
388 UdmWideWordListInit(Res);
389
390 UDM_ASSERT(word->Param.origin >= UDM_WORD_ORIGIN_QUERY &&
391 word->Param.origin <= UDM_WORD_ORIGIN_COLLATION);
392
393 for (i= 0; i < SLL->nitems; i++)
394 UdmSynonymListFind(Res, &SLL->Item[i], word);
395
396 if (!Res->nwords)
397 {
398 UdmWideWordListFree(Res);
399 UdmFree(Res);
400 return NULL;
401 }
402 return Res;
403 }
404