1 /* Copyright (C) 2000-2015 Lavtech.com corp. All rights reserved.
2 
3    This program is free software; you can redistribute it and/or modify
4    it under the terms of the GNU General Public License as published by
5    the Free Software Foundation; either version 2 of the License, or
6    (at your option) any later version.
7 
8    This program is distributed in the hope that it will be useful,
9    but WITHOUT ANY WARRANTY; without even the implied warranty of
10    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11    GNU General Public License for more details.
12 
13    You should have received a copy of the GNU General Public License
14    along with this program; if not, write to the Free Software
15    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
16 */
17 
18 #include "udm_config.h"
19 
20 #include <stdlib.h>
21 #include <fcntl.h>
22 #include <string.h>
23 #include <sys/types.h>
24 #ifdef HAVE_UNISTD_H
25 #include <unistd.h>
26 #endif
27 #ifdef HAVE_IO_H
28 #include <io.h>
29 #endif
30 #include <sys/stat.h>
31 #include <stdio.h>
32 
33 
34 #include "udm_common.h"
35 #include "udm_utils.h"
36 #include "udm_unidata.h"
37 #include "udm_searchtool.h"
38 #include "udm_spell.h"
39 #include "udm_word.h"
40 #include "udm_vars.h"
41 #include "udm_db.h"
42 #include "udm_synonym.h"
43 
44 
45 #define UDM_MAX_FORMS 256
46 #define UDM_MAX_NORMS 64
47 
48 /*
49   All the following combinations should
50   work and get as many uword forms as possible:
51 
52   1. uword doesn't exist in ispell, its synonym doesn't exist in ispell.
53      This last combination should also work if no ispell dictionaries loaded.
54      Just copy all synonyms into result.
55   2. DONE: both norm(uword) and its synonym exist in ispell
56   3. norm(uword) exists in ispell, its synonym doesn't exist in ispell.
57   4. uword doesn't exist in ispell, its synonym exists in ispell.
58 */
59 
/*
  Collect spelling-derived forms of uword and append them to result.

  Every affix list is paired with every spell list that has the same
  language and character set name (case-insensitive comparison).
  For each such pair:
    - the word is converted from the local charset (Conf->lcs) to the
      dictionary charset when the two differ;
    - UdmSpellNormalize() fills a bounded array of normal forms;
    - when the "sy" variable is non-zero and synonym lists are loaded,
      synonyms of every normal form are normalized too and appended to
      the same array (entries appended this way are visited by the same
      loop later on);
    - every normal form plus whatever UdmSpellDenormalize() produces for
      it is stored, converted back to the local charset when needed.

  Finally every collected string is added to result with parameters
  copied from uword->Param by UdmWideWordParamCopySpell().

  Returns NULL without doing anything when the "sp" variable is 0,
  otherwise returns result.
*/
static UDM_WIDEWORDLIST *UdmAllForms1(UDM_AGENT *Indexer,
                                      UDM_WIDEWORDLIST *result,
                                      const UDM_WIDEWORD *uword)
{
  UDM_SPELLLISTLIST *spells= &Indexer->Conf->Spells;
  UDM_AFFIXLISTLIST *affixes= &Indexer->Conf->Affixes;
  UDM_SYNONYMLISTLIST *synonyms= &Indexer->Conf->Synonym;
  UDM_CHARSET *lcs= Indexer->Conf->lcs;
  char *forms[UDM_MAX_FORMS];
  char **forms_end= forms + UDM_MAX_FORMS;
  char **forms_cur= forms;
  char **form;
  UDM_AFFIXLIST *al, *al_end;
  UDM_WIDEWORD_PARAM Param;
  int use_synonyms= UdmVarListFindInt(&Indexer->Conf->Vars, "sy", 1);
  int use_spell= UdmVarListFindInt(&Indexer->Conf->Vars, "sp", 1);

  if (!use_spell)
    return NULL;

  al_end= affixes->Item + affixes->nitems;
  for (al= affixes->Item; al < al_end; al++)
  {
    UDM_SPELLLIST *sl;
    UDM_SPELLLIST *sl_end= spells->Item + spells->nitems;

    for (sl= spells->Item; sl < sl_end; sl++)
    {
      UDM_SPELL norms[UDM_MAX_NORMS];
      UDM_SPELL *norms_end= norms + UDM_MAX_NORMS;
      UDM_SPELL *norms_cur= norms;
      UDM_SPELL *norm;
      UDM_CONV to_lcs, from_lcs;
      char buf[256];
      char *word= uword->Word.str;

      /* Only matching language + charset pairs are processed */
      if (strcasecmp(al->lang, sl->lang) || strcasecmp(al->cset, sl->cset))
        continue;

      UdmConvInit(&to_lcs, sl->cs, lcs);
      UdmConvInit(&from_lcs, lcs, sl->cs);

      if (lcs != sl->cs)
      {
        /* Dictionary uses another charset: look up a converted copy */
        UdmConvHTML(&from_lcs, buf, sizeof(buf), word, strlen(word) + 1);
        word= buf;
      }

      norms_cur+= UdmSpellNormalize(sl, al, word, norms_cur, norms_end - norms_cur);

      if (use_synonyms && synonyms->nitems)
      {
        UDM_WIDEWORD key;
        UdmWideWordInit(&key);
        key.Param.order= uword->Param.order;
        key.Word.str= buf;
        /*
          Find synonyms for each normal form
          and add the found synonyms into the normalized
          list for further denormalization.
        */
        for (norm= norms; norm < norms_cur; norm++)
        {
          UDM_WIDEWORDLIST *found;
          UDM_WIDEWORD *syn, *syn_end;

          /* buf is the lookup key here, and is reused as scratch below */
          key.Word.length= UdmConvHTML(&to_lcs, buf, sizeof(buf),
                                       norm->word, strlen(norm->word) + 1) - 1;
          key.Param.origin= uword->Param.origin;

          found= UdmSynonymListListFind(synonyms, &key);
          if (!found)
            continue;

          syn_end= found->Word + found->nwords;
          for (syn= found->Word; syn < syn_end; syn++)
          {
            UdmConvHTML(&from_lcs, buf, sizeof(buf),
                        syn->Word.str, syn->Word.length + 1);
            if (norms_cur < norms_end)
              norms_cur+= UdmSpellNormalize(sl, al, buf,
                                            norms_cur, norms_end - norms_cur);
          }
          UdmWideWordListFree(found);
          UdmFree(found);
        }
      }

      /* Expand every normal form; stop as soon as the output array is full */
      for (norm= norms; norm < norms_cur && forms_cur < forms_end; norm++)
      {
        size_t nforms= 1; /* the normal form itself */
        *forms_cur= UdmStrdup(norm->word);
        nforms+= UdmSpellDenormalize(sl, al, norm,
                                     forms_cur + 1, forms_end - forms_cur - 1);
        if (lcs != sl->cs)
        {
          size_t i;
          for (i= 0; i < nforms; i++)
          {
            /* Replace each stored string with its local-charset version */
            UdmConvHTML(&to_lcs, buf, sizeof(buf),
                        forms_cur[i], strlen(forms_cur[i]) + 1);
            UdmFree(forms_cur[i]);
            forms_cur[i]= UdmStrdup(buf);
          }
        }
        forms_cur+= nforms;
      }

      /* Free normalized forms */
      for (norm= norms; norm < norms_cur; norm++)
      {
        UdmFree(norm->word);
        UdmFree(norm->flags);
      }
    }
  }

  UdmWideWordParamInit(&Param);
  for (form= forms; form < forms_cur; form++)
  {
    UdmWideWordParamCopySpell(&Param, &uword->Param);
    UdmWideWordListAddLike(result, &Param, *form);
    UdmFree(*form);
  }
  return result;
}
182 
183 
/*
  Cyrillic -> Latin transliteration table.
  Entry [i] is the Latin replacement for code point 0x430 + i,
  covering the range 0x430..0x451 (34 entries) used by tr_cyr_lat.
  The entry for 0x450 is an empty string.
  The table is read-only, so both the strings and the pointer
  slots are const.
*/
static const char * const translit_cyr_lat[]=
{
  "a",  "b",  "v",  "g",  "d",  "e",  "zh", "z",
  "i",  "j",  "k",  "l",  "m",  "n",  "o",  "p",
  "r",  "s",  "t",  "u",  "f",  "h",  "c",  "ch",
  "sh", "sch","`",  "y",  "'",  "`e", "yu", "ya",
  "",   "yo"
};
192 
193 
/*
  Latin -> Cyrillic transliteration table.
  Entry [i] is the replacement for the letter 'a' + i (0x61..0x7A,
  26 entries, the range used by tr_lat_cyr). Replacements are written
  as HTML numeric character references; UdmAllFormsTranslit() passes
  them through UdmConvHTML(). One Latin letter may expand into
  more than one Cyrillic letter (see the entry for 'x').
  The table is read-only, so both the strings and the pointer
  slots are const.
*/
static const char * const translit_lat_cyr[]=
{
  "&#x430;", "&#x431;", "&#x446;", "&#x434;",       /* a b c d */
  "&#x435;", "&#x444;", "&#x433;", "&#x445;",       /* e f g h */
  "&#x438;", "&#x439;", "&#x43a;", "&#x43b;",       /* i j k l */
  "&#x43c;", "&#x43d;", "&#x43e;", "&#x43f;",       /* m n o p */
  "&#x433;", "&#x440;", "&#x441;", "&#x442;",       /* q r s t */
  "&#x443;", "&#x432;", "&#x432;", "&#x43a;&#x441;",/* u v w x */
  "&#x44b;", "&#x437;"                              /* y z     */
};
204 
205 
/*
  One multi-character transliteration rule:
  the character sequence "from" is replaced with "to".
*/
typedef struct udm_translit_complex_subst_st
{
  const char *from; /* Source sequence; NULL terminates a rule array */
  const char *to;   /* Replacement text                              */
} UDM_TRANSLIT_COMPLEX_SUBST;


/*
  Multi-letter Latin -> Cyrillic rules. UdmAllFormsTranslit() tries
  them in array order before falling back to the per-letter table.
  The array ends with a {NULL, NULL} entry.

  Both optional groups are guarded with #ifdef NOT_YET, so that they
  are switched on and off together whatever value NOT_YET is given.
*/
static UDM_TRANSLIT_COMPLEX_SUBST translit_lat_cyr_complex[]=
{
#ifdef NOT_YET
  {"`"  , "&#x44a;"},
  {"'"  , "&#x44c;"},
  {"`e" , "&#x44d;"},
#endif
  {"ch" , "&#x447;"},
  {"sch", "&#x449;"},
  {"ya" , "&#x44f;"},
  {"zh" , "&#x436;"},
  {"yo" , "&#x451;"},
  {"kh" , "&#x445;"},
  {"sh" , "&#x448;"},
#ifdef NOT_YET
  {"yu" , "&#x44e;"}, /* ambiguous: YERU + U, or YU   */
#endif
  {NULL, NULL}
};
232 
233 
234 typedef struct udm_translit_st
235 {
236   size_t from;
237   size_t to;
238   const char * const *translit;
239   UDM_TRANSLIT_COMPLEX_SUBST *complex;
240 } UDM_TRANSLIT_TABLE;
241 
242 
243 static UDM_TRANSLIT_TABLE tr_cyr_lat=
244 {
245   0x430, 0x451, translit_cyr_lat, NULL
246 };
247 
248 
249 static UDM_TRANSLIT_TABLE tr_lat_cyr=
250 {
251   0x61, 0x7A, translit_lat_cyr, translit_lat_cyr_complex
252 };
253 
254 
255 static udm_rc_t
UdmAllFormsTranslit(UDM_AGENT * A,UDM_WIDEWORDLIST * result,const UDM_WIDEWORD * uword,const UDM_TRANSLIT_TABLE * tr)256 UdmAllFormsTranslit(UDM_AGENT *A, UDM_WIDEWORDLIST *result,
257                     const UDM_WIDEWORD *uword,
258                     const UDM_TRANSLIT_TABLE *tr)
259 {
260   int *wrd, tword[128], tmp[128], *t, *te= tword + 128 - 2;
261   int subst= 0;
262   UDM_CHARSET *latin1= &udm_charset_latin1;
263   UDM_CONV l1_uni, lcs_uni;
264   UdmConvInit(&l1_uni, latin1, &udm_charset_sys_int);
265   UdmConvInit(&lcs_uni, A->Conf->lcs, &udm_charset_sys_int);
266   UdmConvHTML(&lcs_uni, (char*) tmp, sizeof(tmp),
267                         uword->Word.str, strlen(uword->Word.str) + 1);
268 
269   for (wrd= tmp, t= tword; wrd[0] && t < te; )
270   {
271     if (*wrd >= tr->from && *wrd <= tr->to)
272     {
273       const char *repl= NULL;
274       size_t len;
275       UDM_TRANSLIT_COMPLEX_SUBST *cmpl;
276       for (cmpl= tr->complex; cmpl && cmpl->from; cmpl++)
277       {
278         size_t pos;
279         const char *from= cmpl->from;
280         for (pos=0; from[pos] && from[pos] == wrd[pos]; pos++);
281         if (!from[pos])
282         {
283           repl= cmpl->to;
284           wrd+= pos;
285           break;
286         }
287       }
288       if (!repl)
289       {
290         repl= tr->translit[*wrd - tr->from];
291         wrd++;
292       }
293 
294       len= strlen(repl);
295       len= UdmConvHTML(&l1_uni, (char*) t, (te - t) * sizeof(*t), repl, len);
296       t+= len / sizeof(*t);
297       *t= 0;
298       subst++;
299     }
300     else
301     {
302       *t++= *wrd++;
303     }
304   }
305   *t= 0;
306   if (subst)
307   {
308     UDM_WIDEWORD_PARAM Param;
309     char lcsword[128];
310     UDM_CONV uni_lcs;
311     size_t nbytes= (t - tword + 1) * sizeof(*t);
312     UdmConvInit(&uni_lcs, &udm_charset_sys_int, A->Conf->lcs);
313     UdmConvHTML(&uni_lcs, lcsword, sizeof(lcsword), (const char *) tword, nbytes);
314     UdmWideWordParamInit(&Param);
315     UdmWideWordParamCopySynonym(&Param, &uword->Param);
316     UdmWideWordListAddLike(result, &Param, lcsword);
317   }
318   return UDM_OK;
319 }
320 
321 
322 static udm_rc_t
UdmAllForms2(UDM_AGENT * Indexer,UDM_WIDEWORDLIST * result,UDM_WIDEWORD * uw)323 UdmAllForms2(UDM_AGENT *Indexer,
324              UDM_WIDEWORDLIST *result,
325              UDM_WIDEWORD *uw)
326 {
327   UdmAllForms1(Indexer, result, uw); /* TODO34: UdmAllForms1 to return rc */
328   return UDM_OK;
329 }
330 
331 
332 /*
333   Similar to UdmWideWordAddLike()
334   but changes count, origin, and weight.
335 */
336 static udm_rc_t
UdmWideWordListAddForDehyphenate(UDM_WIDEWORDLIST * result,const UDM_WIDEWORD * uword,char * tmpword)337 UdmWideWordListAddForDehyphenate(UDM_WIDEWORDLIST *result,
338                                  const UDM_WIDEWORD *uword,
339                                  char *tmpword)
340 {
341   UDM_WIDEWORD_PARAM Param;
342   /* TOD34: simplify here, and the caller, to use UDM_CONST_STR */
343   UdmWideWordParamInit(&Param);
344   UdmWideWordParamCopySynonym(&Param, &uword->Param);
345   return UdmWideWordListAddLike(result, &Param, tmpword) ? UDM_OK : UDM_ERROR;
346 }
347 
348 
349 static udm_rc_t
UdmAllFormsDehyphenate(UDM_AGENT * A,UDM_WIDEWORDLIST * result,UDM_WIDEWORD * uword)350 UdmAllFormsDehyphenate(UDM_AGENT *A, UDM_WIDEWORDLIST *result, UDM_WIDEWORD *uword)
351 {
352   char tmpword[128], *s, *d;
353   udm_snprintf(tmpword, sizeof(tmpword), "%s", uword->Word.str);
354   for (s= uword->Word.str, d= tmpword; ; s++)
355   {
356     *d++= *s;
357     if (UdmAutoPhraseChar((unsigned char) *s))
358       d--;
359 
360     if (!*d)
361       break;
362   }
363   return UdmWideWordListAddForDehyphenate(result, uword, tmpword);
364 }
365 
366 
367 /*
368   Adding hyphenated alnumeric forms: utf8 -> utf-8
369 */
370 static udm_rc_t
UdmAllFormsHyphenateNumbers(UDM_AGENT * A,UDM_WIDEWORDLIST * result,UDM_WIDEWORD * uword)371 UdmAllFormsHyphenateNumbers(UDM_AGENT *A, UDM_WIDEWORDLIST *result, UDM_WIDEWORD *uword)
372 {
373   char tmpword[128], *s, *d;
374   int prev_ctype= UDM_UNI_SEPAR, have_hyphen= 0;
375   if (strlen(uword->Word.str) + 1 >= sizeof(tmpword))
376     return UDM_OK;
377 
378   for (s= uword->Word.str, d= tmpword; *s ; *d++= *s++)
379   {
380     int next_ctype= (*s >= '0' && *s <= '9') ? UDM_UNI_DIGIT :
381                     UdmAutoPhraseChar(*s)    ? UDM_UNI_SEPAR :
382                     UDM_UNI_LETTER;
383     if ((prev_ctype == UDM_UNI_LETTER && next_ctype == UDM_UNI_DIGIT) ||
384         (prev_ctype == UDM_UNI_DIGIT  && next_ctype == UDM_UNI_LETTER))
385     {
386       have_hyphen= 1;
387       *d++= '-';
388     }
389     prev_ctype= next_ctype;
390   }
391 
392   if (!have_hyphen)
393     return UDM_OK;
394   *d++= '\0';
395 
396   return UdmWideWordListAddForDehyphenate(result, uword, tmpword);
397 }
398 
399 
400 udm_rc_t
UdmAllForms(UDM_AGENT * Indexer,UDM_WIDEWORDLIST * result,UDM_WIDEWORD * uword)401 UdmAllForms(UDM_AGENT *Indexer, UDM_WIDEWORDLIST *result, UDM_WIDEWORD *uword)
402 {
403   udm_rc_t rc;
404   UDM_WIDEWORDLIST *uwordsyn;
405 
406   /*
407     Generate all possible word forms for uword.
408   */
409   if (UDM_OK != (rc= UdmAllForms2(Indexer, result, uword)))
410     return rc;
411 
412   if (UdmVarListFindBool(&Indexer->Conf->Vars, "tl", UDM_FALSE))
413   {
414     UDM_TRANSLIT_TABLE *tbl[]= {&tr_cyr_lat, &tr_lat_cyr, NULL}, **cur;
415     for (cur= tbl; *cur; cur++)
416     {
417       UDM_WIDEWORDLIST translit;
418       UdmWideWordListInit(&translit);
419       UdmAllFormsTranslit(Indexer, &translit, uword, *cur);
420       if (translit.nwords)
421       {
422         UDM_WIDEWORD *ww= &translit.Word[0];
423         UdmWideWordListAdd(result, ww);
424         UdmAllForms2(Indexer, result, ww);
425       }
426       UdmWideWordListFree(&translit);
427     }
428   }
429 
430   if (UdmVarListFindBool(&Indexer->Conf->Vars, "Dehyphenate", UDM_FALSE))
431     UdmAllFormsDehyphenate(Indexer, result, uword);
432 
433 
434   if (UdmVarListFindBool(&Indexer->Conf->Vars, "HyphenateNumbers", UDM_FALSE))
435     UdmAllFormsHyphenateNumbers(Indexer, result, uword);
436 
437 
438   if (!UdmVarListFindInt(&Indexer->Conf->Vars, "sy", 1))
439     return rc;
440   /*
441      Combination one: uword is possibly a normalized form.
442      Find all uword synonyms and then process then through
443      ispell to generate all word forms for the synonyms.
444   */
445   if ((uwordsyn= UdmSynonymListListFind(&Indexer->Conf->Synonym, uword)))
446   {
447     UDM_WIDEWORD *ww;
448     for (ww= uwordsyn->Word; ww < &uwordsyn->Word[uwordsyn->nwords]; ww++)
449     {
450       UdmWideWordListAdd(result, ww);
451       UdmAllForms2(Indexer, result, ww);
452     }
453     UdmWideWordListFree(uwordsyn);
454     UdmFree(uwordsyn);
455   }
456 
457   return rc;
458 }
459 
460 
461 /*
462   Appends synonym parts into string, recursively
463 
464   SL                    - List to find synonyms in
465   strbeg                - The very beginning of the complex synonym
466   str                   - The beginnig of the next part, we'll write here
467   str_reminder_size     - Space available
468   WWL                   - We'll add found synonyms here
469   nwords                - number of words to check (to avoid endless loop)
470   order                 - Word "order" to start building phrase at
471   phrase_length_limit   - maximum possible complex synonym length
472   phrase_length_current - current length of synonym which was
473                           collected on the previous recursion step.
474 */
475 static udm_rc_t
UdmComplexSynonymAdd(UDM_AGENT * A,UDM_SYNONYMLIST * SL,char * strbeg,char * str,size_t str_reminder_size,UDM_WIDEWORDLIST * WWL,size_t nwords,size_t order,size_t phrase_length_limit,size_t phrase_length_current)476 UdmComplexSynonymAdd(UDM_AGENT *A, UDM_SYNONYMLIST *SL,
477                      char *strbeg, char *str, size_t str_reminder_size,
478                      UDM_WIDEWORDLIST *WWL, size_t nwords,
479                      size_t order,
480                      size_t phrase_length_limit,
481                      size_t phrase_length_current)
482 {
483   size_t i;
484   for (i= 0; i < nwords; i++)
485   {
486     UDM_WIDEWORD W= WWL->Word[i];
487     if (W.Param.order == order)
488     {
489       int need_more= (phrase_length_limit > 0);
490       size_t len= udm_snprintf(str, str_reminder_size, "%s%s",
491                                phrase_length_current > 0 ? " " : "", W.Word.str);
492       if (need_more)
493       {
494         UdmComplexSynonymAdd(A, SL,
495                              strbeg, str + len, str_reminder_size - len,
496                              WWL, nwords, order + 1,
497                              phrase_length_limit - 1,
498                              phrase_length_current + 1);
499         str[len]= '\0'; /* Remove trailing space and trailing parts */
500       }
501 
502       if (phrase_length_current > 0) /* Skip single word (on first level) */
503       {
504         UDM_WIDEWORDLIST Tmp;
505         UDM_WIDEWORD WW;
506         UdmWideWordListInit(&Tmp);
507         WW= W;
508         WW.Word.str= strbeg;
509         WW.Word.length= strlen(strbeg);
510         UdmSynonymListFind(&Tmp, SL, &WW);
511         /*
512         UdmLog(A, UDM_LOG_DEBUG, "ComplexSynonym: phrlen=%d: '%s' nfound=%d",
513                phrase_length_current, strbeg, Tmp.nwords);
514         */
515         {
516           size_t wrd;
517           for (wrd= 0; wrd < Tmp.nwords; wrd++)
518           {
519             if ((WW.Param.order_extra_width= UdmMultiWordPhraseLength(WW.Word.str)))
520             {
521               UDM_ASSERT(W.Param.order >= phrase_length_current);
522               WW.Param.order= W.Param.order - phrase_length_current;
523               WW.Param.order_extra_width++;
524             }
525             else
526               WW.Param.order= order; /* Should not really happen */
527             /*
528             UdmLog(A, UDM_LOG_DEBUG, "FOUND: '%s' width=%d",
529                    Tmp.Word[wrd].word, WW.order_width);
530             */
531             UdmWideWordListAddLike(WWL, &WW.Param, Tmp.Word[wrd].Word.str);
532           }
533         }
534         UdmWideWordListFree(&Tmp);
535       }
536     }
537   }
538   return UDM_OK;
539 }
540 
541 
542 
543 /*
544   Add many-to-one and many-to-many synonyms
545 */
546 udm_rc_t
UdmComplexSynonyms(UDM_AGENT * A,UDM_WIDEWORDLIST * WWL)547 UdmComplexSynonyms(UDM_AGENT *A, UDM_WIDEWORDLIST *WWL)
548 {
549   size_t nwords= WWL->nwords; /* Remember nwords, to avoid endless loop */
550   size_t i;
551   UDM_SYNONYMLISTLIST *SSL= &A->Conf->Synonym;
552   for (i= 0; i <  SSL->nitems; i++)
553   {
554     UDM_SYNONYMLIST *SL= &SSL->Item[i];
555     char str[256]= "";
556     if (SL->max_phrase_length > 0)
557     {
558       size_t order;
559       for (order= 0; order < WWL->nuniq; order++)
560         UdmComplexSynonymAdd(A, SL, str, str, sizeof(str),
561                              WWL, nwords, order, SL->max_phrase_length, 0);
562     }
563   }
564   return UDM_OK;
565 }
566