1 /* Copyright (C) 2000-2015 Lavtech.com corp. All rights reserved.
2
3 This program is free software; you can redistribute it and/or modify
4 it under the terms of the GNU General Public License as published by
5 the Free Software Foundation; either version 2 of the License, or
6 (at your option) any later version.
7
8 This program is distributed in the hope that it will be useful,
9 but WITHOUT ANY WARRANTY; without even the implied warranty of
10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 GNU General Public License for more details.
12
13 You should have received a copy of the GNU General Public License
14 along with this program; if not, write to the Free Software
15 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
16 */
17
18 #include "udm_config.h"
19
20 #include <stdlib.h>
21 #include <fcntl.h>
22 #include <string.h>
23 #include <sys/types.h>
24 #ifdef HAVE_UNISTD_H
25 #include <unistd.h>
26 #endif
27 #ifdef HAVE_IO_H
28 #include <io.h>
29 #endif
30 #include <sys/stat.h>
31 #include <stdio.h>
32
33
34 #include "udm_common.h"
35 #include "udm_utils.h"
36 #include "udm_unidata.h"
37 #include "udm_searchtool.h"
38 #include "udm_spell.h"
39 #include "udm_word.h"
40 #include "udm_vars.h"
41 #include "udm_db.h"
42 #include "udm_synonym.h"
43
44
/* Capacity of the word-form collection array used by UdmAllForms1() */
#define UDM_MAX_FORMS 256

/* Capacity of the normal-form array used by UdmAllForms1() */
#define UDM_MAX_NORMS 64
47
/*
  All the following combinations should
  work and get as many uword forms as possible:

  1. uword doesn't exist in ispell, its synonym doesn't exist in ispell.
     This combination should also work if no ispell dictionaries are loaded.
     Just copy all synonyms into result.
  2. DONE: both norm(uword) and its synonym exist in ispell
  3. norm(uword) exists in ispell, its synonym doesn't exist in ispell.
  4. uword doesn't exist in ispell, its synonym exists in ispell.
*/
59
/*
  Collect spelling forms of uword and add them to result.

  For every affix list / spell dictionary pair whose language and
  character set names match (case-insensitively):
    - the word is converted into the dictionary character set
      when that differs from the local character set;
    - its normal forms are collected with UdmSpellNormalize();
    - when the "sy" variable is non-zero and synonym lists are loaded,
      synonyms of every normal form are looked up and their normal
      forms are appended for further denormalization;
    - every normal form is stored together with the forms produced by
      UdmSpellDenormalize(), converted back to the local character set.

  At most UDM_MAX_NORMS normal forms are kept per pair and at most
  UDM_MAX_FORMS forms are collected overall.

  Returns NULL when the "sp" variable is zero, otherwise result.
*/
static UDM_WIDEWORDLIST *UdmAllForms1(UDM_AGENT *Indexer,
                                      UDM_WIDEWORDLIST *result,
                                      const UDM_WIDEWORD *uword)
{
  UDM_SPELLLISTLIST *spells= &Indexer->Conf->Spells;
  UDM_AFFIXLISTLIST *affixes= &Indexer->Conf->Affixes;
  UDM_SYNONYMLISTLIST *synonyms= &Indexer->Conf->Synonym;
  UDM_CHARSET *lcs= Indexer->Conf->lcs;
  char *forms[UDM_MAX_FORMS];
  size_t nforms= 0;
  size_t ai, si, f;
  UDM_WIDEWORD_PARAM Param;
  int use_synonyms= UdmVarListFindInt(&Indexer->Conf->Vars, "sy", 1);
  int use_spell= UdmVarListFindInt(&Indexer->Conf->Vars, "sp", 1);

  if (!use_spell)
    return NULL;

  for (ai= 0; ai < affixes->nitems; ai++)
  {
    UDM_AFFIXLIST *Al= &affixes->Item[ai];

    for (si= 0; si < spells->nitems; si++)
    {
      UDM_SPELLLIST *Sl= &spells->Item[si];
      UDM_SPELL norms[UDM_MAX_NORMS];
      size_t nnorms= 0;
      size_t n;
      char buf[256];
      char *word= uword->Word.str;
      UDM_CONV dict_to_local, local_to_dict;
      int need_conv;

      /* Only combine affixes and dictionaries of the same language/charset */
      if (strcasecmp(Al->lang, Sl->lang) || strcasecmp(Al->cset, Sl->cset))
        continue;

      need_conv= (lcs != Sl->cs);
      UdmConvInit(&dict_to_local, Sl->cs, lcs);
      UdmConvInit(&local_to_dict, lcs, Sl->cs);

      if (need_conv)
      {
        UdmConvHTML(&local_to_dict, buf, sizeof(buf), word, strlen(word) + 1);
        word= buf;
      }

      nnorms+= UdmSpellNormalize(Sl, Al, word,
                                 norms + nnorms, UDM_MAX_NORMS - nnorms);

      if (use_synonyms && synonyms->nitems)
      {
        UDM_WIDEWORD probe;
        UdmWideWordInit(&probe);
        probe.Param.order= uword->Param.order;
        probe.Word.str= buf;
        /*
          Find synonyms for each normal form
          and add the found synonyms into the normalized
          list for further denormalization.
          nnorms may grow while we iterate, so normal forms
          appended here are visited by this loop as well.
        */
        for (n= 0; n < nnorms; n++)
        {
          UDM_WIDEWORDLIST *found;
          size_t k;

          probe.Word.length= UdmConvHTML(&dict_to_local, buf, sizeof(buf),
                                         norms[n].word,
                                         strlen(norms[n].word) + 1) - 1;
          probe.Param.origin= uword->Param.origin;

          found= UdmSynonymListListFind(synonyms, &probe);
          if (!found)
            continue;

          for (k= 0; k < found->nwords; k++)
          {
            UDM_WIDEWORD *W= &found->Word[k];
            UdmConvHTML(&local_to_dict, buf, sizeof(buf),
                        W->Word.str, W->Word.length + 1);
            if (nnorms < UDM_MAX_NORMS)
            {
              nnorms+= UdmSpellNormalize(Sl, Al, buf,
                                         norms + nnorms,
                                         UDM_MAX_NORMS - nnorms);
            }
          }
          UdmWideWordListFree(found);
          UdmFree(found);
        }
      }

      /* Expand every normal form; stop as soon as the form array is full */
      for (n= 0; n < nnorms && nforms < UDM_MAX_FORMS; n++)
      {
        char **slot= forms + nforms;
        size_t added= 1;

        *slot= UdmStrdup(norms[n].word);
        added+= UdmSpellDenormalize(Sl, Al, &norms[n],
                                    slot + 1, UDM_MAX_FORMS - nforms - 1);
        if (need_conv)
        {
          size_t k;
          for (k= 0; k < added; k++)
          {
            UdmConvHTML(&dict_to_local, buf, sizeof(buf),
                        slot[k], strlen(slot[k]) + 1);
            UdmFree(slot[k]);
            slot[k]= UdmStrdup(buf);
          }
        }
        nforms+= added;
      }

      /* Free normalized forms */
      for (n= 0; n < nnorms; n++)
      {
        UdmFree(norms[n].word);
        UdmFree(norms[n].flags);
      }
    }
  }

  /* Hand the collected forms over to the caller's list */
  UdmWideWordParamInit(&Param);
  for (f= 0; f < nforms; f++)
  {
    UdmWideWordParamCopySpell(&Param, &uword->Param);
    UdmWideWordListAddLike(result, &Param, forms[f]);
    UdmFree(forms[f]);
  }
  return result;
}
182
183
/*
  Cyrillic -> Latin replacement strings, indexed by (code point - 0x430).
  Covers U+0430 (small a) .. U+0451 (small io), 34 entries in total.
  The table is read-only, so both the strings and the array are const.
*/
static const char * const translit_cyr_lat[]=
{
  /* U+0430 .. U+0437 */
  "a",  "b",   "v", "g", "d", "e",  "zh", "z",
  /* U+0438 .. U+043F */
  "i",  "j",   "k", "l", "m", "n",  "o",  "p",
  /* U+0440 .. U+0447 */
  "r",  "s",   "t", "u", "f", "h",  "c",  "ch",
  /* U+0448 .. U+044F */
  "sh", "sch", "`", "y", "'", "`e", "yu", "ya",
  /* U+0450 (no replacement), U+0451 */
  "",   "yo"
};
192
193
/*
  Latin -> Cyrillic replacement strings, indexed by (letter - 'a').
  Covers 'a' (0x61) .. 'z' (0x7A), 26 entries in total.
  The table is read-only, so both the strings and the array are const.

  NOTE(review): UdmAllFormsTranslit() passes these strings through a
  latin1 -> Unicode converter; confirm that the literals are stored in
  a form which that converter turns into Cyrillic letters.
  NOTE(review): 'q' maps to the same letter as 'g' - confirm intended.
*/
static const char * const translit_lat_cyr[]=
{
  /* a     b     c     d  */
  "а",  "б",  "ц",  "д",
  /* e     f     g     h  */
  "е",  "ф",  "г",  "х",
  /* i     j     k     l  */
  "и",  "й",  "к",  "л",
  /* m     n     o     p  */
  "м",  "н",  "о",  "п",
  /* q     r     s     t  */
  "г",  "р",  "с",  "т",
  /* u     v     w     x  */
  "у",  "в",  "в",  "кс",
  /* y     z  */
  "ы",  "з"
};
204
205
/*
  A multi-character substitution rule: the character sequence "from"
  is replaced by the string "to".
*/
typedef struct udm_translit_complex_subst_st
{
  const char *from;  /* sequence to look for in the source word */
  const char *to;    /* replacement string                      */
} UDM_TRANSLIT_COMPLEX_SUBST;


/*
  Latin -> Cyrillic multi-letter rules, terminated by a {NULL, NULL} entry.
  UdmAllFormsTranslit() tries the rules in table order and takes the
  first one that matches, so the order of the entries is significant.
  Entries under NOT_YET are disabled; both groups use the same #ifdef
  test so that they are always switched on or off together.
*/
static UDM_TRANSLIT_COMPLEX_SUBST translit_lat_cyr_complex[]=
{
#ifdef NOT_YET
  {"`"  , "ъ"},
  {"'"  , "ь"},
  {"`e" , "э"},
#endif
  {"ch" , "ч"},
  {"sch", "щ"},
  {"ya" , "я"},
  {"zh" , "ж"},
  {"yo" , "ё"},
  {"kh" , "х"},
  {"sh" , "ш"},
#ifdef NOT_YET
  {"yu" , "ю"}, /* ambiguous: YERU + U, or YU */
#endif
  {NULL, NULL}
};
232
233
234 typedef struct udm_translit_st
235 {
236 size_t from;
237 size_t to;
238 const char * const *translit;
239 UDM_TRANSLIT_COMPLEX_SUBST *complex;
240 } UDM_TRANSLIT_TABLE;
241
242
243 static UDM_TRANSLIT_TABLE tr_cyr_lat=
244 {
245 0x430, 0x451, translit_cyr_lat, NULL
246 };
247
248
249 static UDM_TRANSLIT_TABLE tr_lat_cyr=
250 {
251 0x61, 0x7A, translit_lat_cyr, translit_lat_cyr_complex
252 };
253
254
255 static udm_rc_t
UdmAllFormsTranslit(UDM_AGENT * A,UDM_WIDEWORDLIST * result,const UDM_WIDEWORD * uword,const UDM_TRANSLIT_TABLE * tr)256 UdmAllFormsTranslit(UDM_AGENT *A, UDM_WIDEWORDLIST *result,
257 const UDM_WIDEWORD *uword,
258 const UDM_TRANSLIT_TABLE *tr)
259 {
260 int *wrd, tword[128], tmp[128], *t, *te= tword + 128 - 2;
261 int subst= 0;
262 UDM_CHARSET *latin1= &udm_charset_latin1;
263 UDM_CONV l1_uni, lcs_uni;
264 UdmConvInit(&l1_uni, latin1, &udm_charset_sys_int);
265 UdmConvInit(&lcs_uni, A->Conf->lcs, &udm_charset_sys_int);
266 UdmConvHTML(&lcs_uni, (char*) tmp, sizeof(tmp),
267 uword->Word.str, strlen(uword->Word.str) + 1);
268
269 for (wrd= tmp, t= tword; wrd[0] && t < te; )
270 {
271 if (*wrd >= tr->from && *wrd <= tr->to)
272 {
273 const char *repl= NULL;
274 size_t len;
275 UDM_TRANSLIT_COMPLEX_SUBST *cmpl;
276 for (cmpl= tr->complex; cmpl && cmpl->from; cmpl++)
277 {
278 size_t pos;
279 const char *from= cmpl->from;
280 for (pos=0; from[pos] && from[pos] == wrd[pos]; pos++);
281 if (!from[pos])
282 {
283 repl= cmpl->to;
284 wrd+= pos;
285 break;
286 }
287 }
288 if (!repl)
289 {
290 repl= tr->translit[*wrd - tr->from];
291 wrd++;
292 }
293
294 len= strlen(repl);
295 len= UdmConvHTML(&l1_uni, (char*) t, (te - t) * sizeof(*t), repl, len);
296 t+= len / sizeof(*t);
297 *t= 0;
298 subst++;
299 }
300 else
301 {
302 *t++= *wrd++;
303 }
304 }
305 *t= 0;
306 if (subst)
307 {
308 UDM_WIDEWORD_PARAM Param;
309 char lcsword[128];
310 UDM_CONV uni_lcs;
311 size_t nbytes= (t - tword + 1) * sizeof(*t);
312 UdmConvInit(&uni_lcs, &udm_charset_sys_int, A->Conf->lcs);
313 UdmConvHTML(&uni_lcs, lcsword, sizeof(lcsword), (const char *) tword, nbytes);
314 UdmWideWordParamInit(&Param);
315 UdmWideWordParamCopySynonym(&Param, &uword->Param);
316 UdmWideWordListAddLike(result, &Param, lcsword);
317 }
318 return UDM_OK;
319 }
320
321
322 static udm_rc_t
UdmAllForms2(UDM_AGENT * Indexer,UDM_WIDEWORDLIST * result,UDM_WIDEWORD * uw)323 UdmAllForms2(UDM_AGENT *Indexer,
324 UDM_WIDEWORDLIST *result,
325 UDM_WIDEWORD *uw)
326 {
327 UdmAllForms1(Indexer, result, uw); /* TODO34: UdmAllForms1 to return rc */
328 return UDM_OK;
329 }
330
331
332 /*
333 Similar to UdmWideWordAddLike()
334 but changes count, origin, and weight.
335 */
336 static udm_rc_t
UdmWideWordListAddForDehyphenate(UDM_WIDEWORDLIST * result,const UDM_WIDEWORD * uword,char * tmpword)337 UdmWideWordListAddForDehyphenate(UDM_WIDEWORDLIST *result,
338 const UDM_WIDEWORD *uword,
339 char *tmpword)
340 {
341 UDM_WIDEWORD_PARAM Param;
342 /* TOD34: simplify here, and the caller, to use UDM_CONST_STR */
343 UdmWideWordParamInit(&Param);
344 UdmWideWordParamCopySynonym(&Param, &uword->Param);
345 return UdmWideWordListAddLike(result, &Param, tmpword) ? UDM_OK : UDM_ERROR;
346 }
347
348
349 static udm_rc_t
UdmAllFormsDehyphenate(UDM_AGENT * A,UDM_WIDEWORDLIST * result,UDM_WIDEWORD * uword)350 UdmAllFormsDehyphenate(UDM_AGENT *A, UDM_WIDEWORDLIST *result, UDM_WIDEWORD *uword)
351 {
352 char tmpword[128], *s, *d;
353 udm_snprintf(tmpword, sizeof(tmpword), "%s", uword->Word.str);
354 for (s= uword->Word.str, d= tmpword; ; s++)
355 {
356 *d++= *s;
357 if (UdmAutoPhraseChar((unsigned char) *s))
358 d--;
359
360 if (!*d)
361 break;
362 }
363 return UdmWideWordListAddForDehyphenate(result, uword, tmpword);
364 }
365
366
367 /*
368 Adding hyphenated alnumeric forms: utf8 -> utf-8
369 */
370 static udm_rc_t
UdmAllFormsHyphenateNumbers(UDM_AGENT * A,UDM_WIDEWORDLIST * result,UDM_WIDEWORD * uword)371 UdmAllFormsHyphenateNumbers(UDM_AGENT *A, UDM_WIDEWORDLIST *result, UDM_WIDEWORD *uword)
372 {
373 char tmpword[128], *s, *d;
374 int prev_ctype= UDM_UNI_SEPAR, have_hyphen= 0;
375 if (strlen(uword->Word.str) + 1 >= sizeof(tmpword))
376 return UDM_OK;
377
378 for (s= uword->Word.str, d= tmpword; *s ; *d++= *s++)
379 {
380 int next_ctype= (*s >= '0' && *s <= '9') ? UDM_UNI_DIGIT :
381 UdmAutoPhraseChar(*s) ? UDM_UNI_SEPAR :
382 UDM_UNI_LETTER;
383 if ((prev_ctype == UDM_UNI_LETTER && next_ctype == UDM_UNI_DIGIT) ||
384 (prev_ctype == UDM_UNI_DIGIT && next_ctype == UDM_UNI_LETTER))
385 {
386 have_hyphen= 1;
387 *d++= '-';
388 }
389 prev_ctype= next_ctype;
390 }
391
392 if (!have_hyphen)
393 return UDM_OK;
394 *d++= '\0';
395
396 return UdmWideWordListAddForDehyphenate(result, uword, tmpword);
397 }
398
399
400 udm_rc_t
UdmAllForms(UDM_AGENT * Indexer,UDM_WIDEWORDLIST * result,UDM_WIDEWORD * uword)401 UdmAllForms(UDM_AGENT *Indexer, UDM_WIDEWORDLIST *result, UDM_WIDEWORD *uword)
402 {
403 udm_rc_t rc;
404 UDM_WIDEWORDLIST *uwordsyn;
405
406 /*
407 Generate all possible word forms for uword.
408 */
409 if (UDM_OK != (rc= UdmAllForms2(Indexer, result, uword)))
410 return rc;
411
412 if (UdmVarListFindBool(&Indexer->Conf->Vars, "tl", UDM_FALSE))
413 {
414 UDM_TRANSLIT_TABLE *tbl[]= {&tr_cyr_lat, &tr_lat_cyr, NULL}, **cur;
415 for (cur= tbl; *cur; cur++)
416 {
417 UDM_WIDEWORDLIST translit;
418 UdmWideWordListInit(&translit);
419 UdmAllFormsTranslit(Indexer, &translit, uword, *cur);
420 if (translit.nwords)
421 {
422 UDM_WIDEWORD *ww= &translit.Word[0];
423 UdmWideWordListAdd(result, ww);
424 UdmAllForms2(Indexer, result, ww);
425 }
426 UdmWideWordListFree(&translit);
427 }
428 }
429
430 if (UdmVarListFindBool(&Indexer->Conf->Vars, "Dehyphenate", UDM_FALSE))
431 UdmAllFormsDehyphenate(Indexer, result, uword);
432
433
434 if (UdmVarListFindBool(&Indexer->Conf->Vars, "HyphenateNumbers", UDM_FALSE))
435 UdmAllFormsHyphenateNumbers(Indexer, result, uword);
436
437
438 if (!UdmVarListFindInt(&Indexer->Conf->Vars, "sy", 1))
439 return rc;
440 /*
441 Combination one: uword is possibly a normalized form.
442 Find all uword synonyms and then process then through
443 ispell to generate all word forms for the synonyms.
444 */
445 if ((uwordsyn= UdmSynonymListListFind(&Indexer->Conf->Synonym, uword)))
446 {
447 UDM_WIDEWORD *ww;
448 for (ww= uwordsyn->Word; ww < &uwordsyn->Word[uwordsyn->nwords]; ww++)
449 {
450 UdmWideWordListAdd(result, ww);
451 UdmAllForms2(Indexer, result, ww);
452 }
453 UdmWideWordListFree(uwordsyn);
454 UdmFree(uwordsyn);
455 }
456
457 return rc;
458 }
459
460
461 /*
462 Appends synonym parts into string, recursively
463
464 SL - List to find synonyms in
465 strbeg - The very beginning of the complex synonym
466 str - The beginnig of the next part, we'll write here
467 str_reminder_size - Space available
468 WWL - We'll add found synonyms here
469 nwords - number of words to check (to avoid endless loop)
470 order - Word "order" to start building phrase at
471 phrase_length_limit - maximum possible complex synonym length
472 phrase_length_current - current length of synonym which was
473 collected on the previous recursion step.
474 */
475 static udm_rc_t
UdmComplexSynonymAdd(UDM_AGENT * A,UDM_SYNONYMLIST * SL,char * strbeg,char * str,size_t str_reminder_size,UDM_WIDEWORDLIST * WWL,size_t nwords,size_t order,size_t phrase_length_limit,size_t phrase_length_current)476 UdmComplexSynonymAdd(UDM_AGENT *A, UDM_SYNONYMLIST *SL,
477 char *strbeg, char *str, size_t str_reminder_size,
478 UDM_WIDEWORDLIST *WWL, size_t nwords,
479 size_t order,
480 size_t phrase_length_limit,
481 size_t phrase_length_current)
482 {
483 size_t i;
484 for (i= 0; i < nwords; i++)
485 {
486 UDM_WIDEWORD W= WWL->Word[i];
487 if (W.Param.order == order)
488 {
489 int need_more= (phrase_length_limit > 0);
490 size_t len= udm_snprintf(str, str_reminder_size, "%s%s",
491 phrase_length_current > 0 ? " " : "", W.Word.str);
492 if (need_more)
493 {
494 UdmComplexSynonymAdd(A, SL,
495 strbeg, str + len, str_reminder_size - len,
496 WWL, nwords, order + 1,
497 phrase_length_limit - 1,
498 phrase_length_current + 1);
499 str[len]= '\0'; /* Remove trailing space and trailing parts */
500 }
501
502 if (phrase_length_current > 0) /* Skip single word (on first level) */
503 {
504 UDM_WIDEWORDLIST Tmp;
505 UDM_WIDEWORD WW;
506 UdmWideWordListInit(&Tmp);
507 WW= W;
508 WW.Word.str= strbeg;
509 WW.Word.length= strlen(strbeg);
510 UdmSynonymListFind(&Tmp, SL, &WW);
511 /*
512 UdmLog(A, UDM_LOG_DEBUG, "ComplexSynonym: phrlen=%d: '%s' nfound=%d",
513 phrase_length_current, strbeg, Tmp.nwords);
514 */
515 {
516 size_t wrd;
517 for (wrd= 0; wrd < Tmp.nwords; wrd++)
518 {
519 if ((WW.Param.order_extra_width= UdmMultiWordPhraseLength(WW.Word.str)))
520 {
521 UDM_ASSERT(W.Param.order >= phrase_length_current);
522 WW.Param.order= W.Param.order - phrase_length_current;
523 WW.Param.order_extra_width++;
524 }
525 else
526 WW.Param.order= order; /* Should not really happen */
527 /*
528 UdmLog(A, UDM_LOG_DEBUG, "FOUND: '%s' width=%d",
529 Tmp.Word[wrd].word, WW.order_width);
530 */
531 UdmWideWordListAddLike(WWL, &WW.Param, Tmp.Word[wrd].Word.str);
532 }
533 }
534 UdmWideWordListFree(&Tmp);
535 }
536 }
537 }
538 return UDM_OK;
539 }
540
541
542
543 /*
544 Add many-to-one and many-to-many synonyms
545 */
546 udm_rc_t
UdmComplexSynonyms(UDM_AGENT * A,UDM_WIDEWORDLIST * WWL)547 UdmComplexSynonyms(UDM_AGENT *A, UDM_WIDEWORDLIST *WWL)
548 {
549 size_t nwords= WWL->nwords; /* Remember nwords, to avoid endless loop */
550 size_t i;
551 UDM_SYNONYMLISTLIST *SSL= &A->Conf->Synonym;
552 for (i= 0; i < SSL->nitems; i++)
553 {
554 UDM_SYNONYMLIST *SL= &SSL->Item[i];
555 char str[256]= "";
556 if (SL->max_phrase_length > 0)
557 {
558 size_t order;
559 for (order= 0; order < WWL->nuniq; order++)
560 UdmComplexSynonymAdd(A, SL, str, str, sizeof(str),
561 WWL, nwords, order, SL->max_phrase_length, 0);
562 }
563 }
564 return UDM_OK;
565 }
566