mnogosearch-3.4.1/src/fuzzy.c

/* Copyright (C) 2000-2015 Lavtech.com corp. All rights reserved.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/

#include "udm_config.h"

#include <stdlib.h>
#include <fcntl.h>
#include <string.h>
#include <sys/types.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#ifdef HAVE_IO_H
#include <io.h>
#endif
#include <sys/stat.h>
#include <stdio.h>


#include "udm_common.h"
#include "udm_utils.h"
#include "udm_unidata.h"
#include "udm_searchtool.h"
#include "udm_spell.h"
#include "udm_word.h"
#include "udm_vars.h"
#include "udm_db.h"
#include "udm_synonym.h"


/* Capacity of the array of generated word forms (Res) in UdmAllForms1() */
#define UDM_MAX_FORMS 256
/* Capacity of the array of normalized forms (Norm) in UdmAllForms1() */
#define UDM_MAX_NORMS 64

/*
  All the following combinations should
  work and get as many uword forms as possible:

  1. uword doesn't exist in ispell, its synonym doesn't exist in ispell.
     This combination should also work if no ispell dictionaries are loaded.
     Just copy all synonyms into result.
  2. DONE: both norm(uword) and its synonym exist in ispell
  3. norm(uword) exists in ispell, its synonym doesn't exist in ispell.
  4. uword doesn't exist in ispell, its synonym exists in ispell.
*/

/*
  Collect spelling forms of uword and append them to result.

  For every affix list / spell list pair that has the same language and
  the same character set name, the word is passed to UdmSpellNormalize().
  When the "sy" variable is non-zero and synonym lists are loaded, the
  synonyms of every normal form are normalized too. Each normal form is
  then expanded with UdmSpellDenormalize(). All collected strings are
  added to result with parameters produced by UdmWideWordParamCopySpell().

  Indexer - agent; Conf->Spells, Conf->Affixes, Conf->Synonym,
            Conf->Vars and Conf->lcs are read
  result  - list the generated forms are appended to
  uword   - the source word

  Returns NULL when the "sp" variable is 0 (nothing is generated),
  otherwise returns result.
*/
static UDM_WIDEWORDLIST *UdmAllForms1(UDM_AGENT *Indexer,
                                      UDM_WIDEWORDLIST *result,
                                      const UDM_WIDEWORD *uword)
{
  UDM_SPELLLISTLIST *SLL= &Indexer->Conf->Spells;
  UDM_AFFIXLISTLIST *ALL= &Indexer->Conf->Affixes;
  UDM_SYNONYMLISTLIST *SYN= &Indexer->Conf->Synonym;
  /* Heap-allocated result strings; freed at the end of the function */
  char *Res[UDM_MAX_FORMS];
  char **ResCur= Res;
  char **ResEnd= Res + UDM_MAX_FORMS;
  char **R;
  UDM_AFFIXLIST *Al;
  UDM_WIDEWORD_PARAM Param;
  UDM_CHARSET *lcs= Indexer->Conf->lcs;
  /* Both switches default to 1 when the variable is not set */
  int sy= UdmVarListFindInt(&Indexer->Conf->Vars, "sy", 1);
  int sp= UdmVarListFindInt(&Indexer->Conf->Vars, "sp", 1);

  /* "sp" == 0: spelling forms are switched off */
  if (!sp)
    return NULL;

  for (Al= ALL->Item; Al < &ALL->Item[ALL->nitems]; Al++)
  {
    UDM_SPELLLIST *Sl;
    for (Sl= SLL->Item; Sl < &SLL->Item[SLL->nitems]; Sl++)
    {
      /* Only combine an affix list with a dictionary of the same lang/cset */
      if (!strcasecmp(Al->lang, Sl->lang) && !strcasecmp(Al->cset, Sl->cset))
      {
        UDM_SPELL Norm[UDM_MAX_NORMS];
        UDM_SPELL *NormEnd= Norm + UDM_MAX_NORMS;
        UDM_SPELL *NormCur= Norm;
        UDM_SPELL *N;
        /* Scratch buffer shared by all character set conversions below */
        char tmp[256];
        char *word= uword->Word.str;
        UDM_CONV scs_lcs, lcs_scs;

        /* scs_lcs: dictionary charset -> lcs; lcs_scs: lcs -> dictionary charset */
        UdmConvInit(&scs_lcs, Sl->cs, lcs);
        UdmConvInit(&lcs_scs, lcs, Sl->cs);

        /* Convert the word into the dictionary charset when it differs from lcs */
        if (lcs != Sl->cs)
        {
          size_t len= strlen(word);
          UdmConvHTML(&lcs_scs, tmp, sizeof(tmp), word, len + 1);
          word= tmp;
        }

        NormCur+= UdmSpellNormalize(Sl, Al, word, NormCur, NormEnd-NormCur);

        if (sy && SYN->nitems)
        {
          UDM_WIDEWORD ww;
          UDM_WIDEWORDLIST *syn;
          UdmWideWordInit(&ww);
          ww.Param.order= uword->Param.order;
          /* ww borrows tmp as its string; tmp is rewritten on every iteration */
          ww.Word.str= tmp;
          /*
            Find synonyms for each normal form
            and add the found synonyms into normalized
            list for further denormalization.
          */
          /*
            NormCur can grow inside this loop, so the forms appended
            for synonyms are visited by the loop as well.
          */
          for (N= Norm; N < NormCur; N++)
          {
            ww.Word.length= UdmConvHTML(&scs_lcs,tmp,sizeof(tmp),N->word,strlen(N->word)+1) - 1;
            ww.Param.origin= uword->Param.origin;
            if ((syn= UdmSynonymListListFind(SYN, &ww)))
            {
              UDM_WIDEWORD *W;
              for (W= syn->Word; W < syn->Word + syn->nwords; W++)
              {
                size_t nbytes= W->Word.length + 1;
                /* Synonym goes back into the dictionary charset, overwriting tmp */
                UdmConvHTML(&lcs_scs, tmp, sizeof(tmp), W->Word.str, nbytes);
                if (NormCur < NormEnd)
                {
                  NormCur+= UdmSpellNormalize(Sl, Al, tmp, NormCur, NormEnd-NormCur);
                }
              }
              /* The list returned by UdmSynonymListListFind() is released here */
              UdmWideWordListFree(syn);
              UdmFree(syn);
            }
          }
        }

        for (N= Norm ; N < NormCur; N++)
        {
          if (ResCur < ResEnd)
          {
            /* One slot for the normal form itself ... */
            size_t cres= 1;
            *ResCur= UdmStrdup(N->word);
            /* ... plus the value returned by UdmSpellDenormalize() for the rest */
            cres+= UdmSpellDenormalize(Sl, Al, N, ResCur+1, ResEnd-ResCur-1);
            if (lcs != Sl->cs)
            {
              size_t i;
              /* Replace every new entry with its lcs-converted copy */
              for (i=0; i < cres; i++)
              {
                UdmConvHTML(&scs_lcs, tmp, sizeof(tmp),
                            ResCur[i], strlen(ResCur[i])+1);
                UdmFree(ResCur[i]);
                ResCur[i]= UdmStrdup(tmp);
              }
            }
            ResCur+= cres;
          }
        }

        /* Free normalized forms*/
        for (N= Norm;  N < NormCur; N++)
        {
          UdmFree(N->word);
          UdmFree(N->flags);
        }
      }
    }
  }

  /* Hand the collected strings over to result and release the local copies */
  UdmWideWordParamInit(&Param);
  for (R=Res; R < ResCur; R++)
  {
    UdmWideWordParamCopySpell(&Param, &uword->Param);
    UdmWideWordListAddLike(result, &Param, *R);
    UdmFree(*R);
  }
  return result;
}


/*
  Cyrillic -> Latin replacement strings, used through tr_cyr_lat.
  Entry N is the replacement for code point 0x430 + N, so the
  34 entries cover 0x430..0x451. Entry 32 (0x450) is an empty string.
*/
static const char *translit_cyr_lat[]=
{
  "a",  "b",  "v",  "g",  "d",  "e",  "zh", "z",
  "i",  "j",  "k",  "l",  "m",  "n",  "o",  "p",
  "r",  "s",  "t",  "u",  "f",  "h",  "c",  "ch",
  "sh", "sch","`",  "y",  "'",  "`e", "yu", "ya",
  "",   "yo"
};


/*
  Latin -> Cyrillic replacement strings, used through tr_lat_cyr.
  Entry N is the replacement for code point 0x61 + N ('a'..'z', 26 entries).
  Replacements are written as numeric character references; "x" (entry 23)
  maps to two of them.
*/
static const char *translit_lat_cyr[]=
{
  "&#x430;", "&#x431;", "&#x446;", "&#x434;",
  "&#x435;", "&#x444;", "&#x433;", "&#x445;",
  "&#x438;", "&#x439;", "&#x43a;", "&#x43b;",
  "&#x43c;", "&#x43d;", "&#x43e;", "&#x43f;",
  "&#x433;", "&#x440;", "&#x441;", "&#x442;",
  "&#x443;", "&#x432;", "&#x432;", "&#x43a;&#x441;",
  "&#x44b;", "&#x437;"
};


/*
  One multi-character transliteration rule:
  the character sequence "from" is replaced by the string "to".
*/
typedef struct udm_translit_complex_subst_st
{
  const char *from;
  const char *to;
} UDM_TRANSLIT_COMPLEX_SUBST;


/*
  Multi-character Latin -> Cyrillic rules, used through tr_lat_cyr.
  Rules are tried in table order and the first full match wins.
  The table is terminated by a {NULL, NULL} entry.

  Rules that are not enabled yet are all guarded the same way,
  with "#ifdef NOT_YET".
*/
static UDM_TRANSLIT_COMPLEX_SUBST translit_lat_cyr_complex[]=
{
#ifdef NOT_YET
  {"`"  , "&#x44a;"},
  {"'"  , "&#x44c;"},
  {"`e" , "&#x44d;"},
#endif
  {"ch" , "&#x447;"},
  {"sch", "&#x449;"},
  {"ya" , "&#x44f;"},
  {"zh" , "&#x436;"},
  {"yo" , "&#x451;"},
  {"kh" , "&#x445;"},
  {"sh" , "&#x448;"},
#ifdef NOT_YET
  {"yu" , "&#x44e;"}, /* ambiguous: YERU + U, or YU   */
#endif
  {NULL, NULL}
};


/*
  Description of one transliteration direction.
*/
typedef struct udm_translit_st
{
  size_t from;                         /* First code point handled by the table */
  size_t to;                           /* Last code point handled (inclusive)   */
  const char * const *translit;        /* Replacements, indexed by code - from  */
  UDM_TRANSLIT_COMPLEX_SUBST *complex; /* Multi-character rules, or NULL        */
} UDM_TRANSLIT_TABLE;


/* Cyrillic -> Latin: code points 0x430..0x451, no multi-character rules */
static UDM_TRANSLIT_TABLE tr_cyr_lat=
{
  0x430, 0x451, translit_cyr_lat, NULL
};


/* Latin -> Cyrillic: code points 0x61..0x7A ('a'..'z') plus multi-character rules */
static UDM_TRANSLIT_TABLE tr_lat_cyr=
{
  0x61, 0x7A, translit_lat_cyr, translit_lat_cyr_complex
};


/*
  Transliterate uword with table tr and, if at least one character
  was substituted, append the transliterated word to result.

  A      - agent; A->Conf->lcs is used as the character set of the word
  result - list the transliterated form is appended to
  uword  - the source word
  tr     - transliteration table: code points in [tr->from, tr->to]
           are replaced, all other code points are copied unchanged

  Always returns UDM_OK.
*/
static udm_rc_t
UdmAllFormsTranslit(UDM_AGENT *A, UDM_WIDEWORDLIST *result,
                    const UDM_WIDEWORD *uword,
                    const UDM_TRANSLIT_TABLE *tr)
{
  /*
    tmp   - the source word as an array of int code points
    tword - the transliterated word being built
    te    - write limit, two cells before the end of tword
  */
  int *wrd, tword[128], tmp[128], *t, *te= tword + 128 - 2;
  /* Number of replacements done; zero means nothing is added to result */
  int subst= 0;
  UDM_CHARSET *latin1= &udm_charset_latin1;
  UDM_CONV l1_uni, lcs_uni;
  UdmConvInit(&l1_uni, latin1, &udm_charset_sys_int);
  UdmConvInit(&lcs_uni, A->Conf->lcs, &udm_charset_sys_int);
  /*
    NOTE(review): the loop below relies on tmp being 0-terminated;
    the return value of this conversion is not checked - confirm that
    UdmConvHTML() terminates the output for words that do not fit.
  */
  UdmConvHTML(&lcs_uni, (char*) tmp, sizeof(tmp),
                        uword->Word.str, strlen(uword->Word.str) + 1);

  for (wrd= tmp, t= tword; wrd[0] && t < te; )
  {
    if (*wrd >= tr->from && *wrd <= tr->to)
    {
      const char *repl= NULL;
      size_t len;
      UDM_TRANSLIT_COMPLEX_SUBST *cmpl;
      /* Try multi-character rules first, in table order; first full match wins */
      for (cmpl= tr->complex; cmpl && cmpl->from; cmpl++)
      {
        size_t pos;
        const char *from= cmpl->from;
        /* Empty loop body: just advance pos while the rule matches the input */
        for (pos=0; from[pos] && from[pos] == wrd[pos]; pos++);
        if (!from[pos])
        {
          repl= cmpl->to;
          /* Skip all input characters consumed by the rule */
          wrd+= pos;
          break;
        }
      }
      /* No multi-character rule matched: use the single-character table */
      if (!repl)
      {
        repl= tr->translit[*wrd - tr->from];
        wrd++;
      }

      /*
        The replacement text is converted from latin1 into the space
        left before te. Presumably UdmConvHTML() decodes the "&#x...;"
        references found in the tables and returns the number of bytes
        written - TODO confirm.
      */
      len= strlen(repl);
      len= UdmConvHTML(&l1_uni, (char*) t, (te - t) * sizeof(*t), repl, len);
      t+= len / sizeof(*t);
      *t= 0;
      subst++;
    }
    else
    {
      /* Outside of the table range: copy the code point unchanged */
      *t++= *wrd++;
    }
  }
  *t= 0;
  if (subst)
  {
    UDM_WIDEWORD_PARAM Param;
    char lcsword[128];
    UDM_CONV uni_lcs;
    /* Size of the result in bytes, including the terminating 0 cell */
    size_t nbytes= (t - tword + 1) * sizeof(*t);
    UdmConvInit(&uni_lcs, &udm_charset_sys_int, A->Conf->lcs);
    UdmConvHTML(&uni_lcs, lcsword, sizeof(lcsword), (const char *) tword, nbytes);
    UdmWideWordParamInit(&Param);
    UdmWideWordParamCopySynonym(&Param, &uword->Param);
    UdmWideWordListAddLike(result, &Param, lcsword);
  }
  return UDM_OK;
}


/*
  Wrapper that gives UdmAllForms1() an udm_rc_t interface.
  The pointer returned by UdmAllForms1() is discarded
  and UDM_OK is always reported.
*/
static udm_rc_t
UdmAllForms2(UDM_AGENT *Indexer,
             UDM_WIDEWORDLIST *result,
             UDM_WIDEWORD *uw)
{
  /* TODO34: UdmAllForms1 to return rc */
  (void) UdmAllForms1(Indexer, result, uw);
  return UDM_OK;
}


/*
  Append tmpword to result as a variant of uword.

  Works like UdmWideWordListAddLike(), but the parameters are first
  derived from uword->Param with UdmWideWordParamCopySynonym()
  (per the original note here, this changes count, origin, and weight).

  Returns UDM_OK when UdmWideWordListAddLike() reports success
  (a non-zero value), UDM_ERROR otherwise.
*/
static udm_rc_t
UdmWideWordListAddForDehyphenate(UDM_WIDEWORDLIST *result,
                                 const UDM_WIDEWORD *uword,
                                 char *tmpword)
{
  UDM_WIDEWORD_PARAM synonym_param;

  /* TODO34: simplify here, and the caller, to use UDM_CONST_STR */
  UdmWideWordParamInit(&synonym_param);
  UdmWideWordParamCopySynonym(&synonym_param, &uword->Param);

  if (!UdmWideWordListAddLike(result, &synonym_param, tmpword))
    return UDM_ERROR;
  return UDM_OK;
}


/*
  Add the form of uword with every character for which
  UdmAutoPhraseChar() is true removed.

  A      - agent (not used here)
  result - list the new form is appended to
  uword  - the source word

  Words that do not fit into the local buffer are skipped and UDM_OK is
  returned, the same way UdmAllFormsHyphenateNumbers() handles them.
  Otherwise returns the result of UdmWideWordListAddForDehyphenate().

  The copy loop stops on the terminator of the SOURCE string. Testing
  the next cell of the destination buffer instead made the loop run past
  the end of the source when two or more characters had been removed,
  and left the destination unbounded for long words.
*/
static udm_rc_t
UdmAllFormsDehyphenate(UDM_AGENT *A, UDM_WIDEWORDLIST *result, UDM_WIDEWORD *uword)
{
  char tmpword[128], *d= tmpword;
  const char *s;

  /* The output is never longer than the input, so this check is sufficient */
  if (strlen(uword->Word.str) >= sizeof(tmpword))
    return UDM_OK;

  for (s= uword->Word.str; *s; s++)
  {
    if (!UdmAutoPhraseChar((unsigned char) *s))
      *d++= *s;
  }
  *d= '\0';

  return UdmWideWordListAddForDehyphenate(result, uword, tmpword);
}


/*
  Adding hyphenated alphanumeric forms: utf8 -> utf-8

  A '-' is inserted at every letter/digit and digit/letter boundary.
  Characters for which UdmAutoPhraseChar() is true count as separators
  and are copied unchanged.

  A      - agent (not used here)
  result - list the new form is appended to
  uword  - the source word

  Returns UDM_OK without adding anything when no hyphen was inserted or
  when the hyphenated form does not fit into the local buffer. Otherwise
  returns the result of UdmWideWordListAddForDehyphenate().

  The output can be almost twice as long as the input, so every write
  is checked against the end of the buffer; checking only the input
  length is not enough.
*/
static udm_rc_t
UdmAllFormsHyphenateNumbers(UDM_AGENT *A, UDM_WIDEWORDLIST *result, UDM_WIDEWORD *uword)
{
  char tmpword[128], *d= tmpword;
  char *dend= tmpword + sizeof(tmpword) - 1; /* Keep one byte for '\0' */
  const char *s;
  int prev_ctype= UDM_UNI_SEPAR, have_hyphen= 0;

  for (s= uword->Word.str; *s; s++)
  {
    int next_ctype= (*s >= '0' && *s <= '9')           ? UDM_UNI_DIGIT :
                    UdmAutoPhraseChar((unsigned char) *s) ? UDM_UNI_SEPAR :
                    UDM_UNI_LETTER;
    if ((prev_ctype == UDM_UNI_LETTER && next_ctype == UDM_UNI_DIGIT) ||
        (prev_ctype == UDM_UNI_DIGIT  && next_ctype == UDM_UNI_LETTER))
    {
      if (d >= dend)
        return UDM_OK; /* No room for the hyphen: skip this word */
      have_hyphen= 1;
      *d++= '-';
    }
    if (d >= dend)
      return UDM_OK;   /* No room for the character: skip this word */
    *d++= *s;
    prev_ctype= next_ctype;
  }

  if (!have_hyphen)
    return UDM_OK;
  *d= '\0';

  return UdmWideWordListAddForDehyphenate(result, uword, tmpword);
}


/*
  Generate all forms of uword and append them to result.

  Steps, in this order:
    1. forms produced by UdmAllForms2() for uword itself;
    2. when "tl" is enabled: the Cyrillic->Latin and Latin->Cyrillic
       transliterations of uword, each together with its own forms;
    3. when "Dehyphenate" is enabled: the dehyphenated form;
    4. when "HyphenateNumbers" is enabled: the hyphenated form;
    5. unless "sy" is 0: every synonym of uword together with its forms.

  Returns the code of step 1. When that code is not UDM_OK the other
  steps are skipped. Results of the other steps are not checked.
*/
udm_rc_t
UdmAllForms(UDM_AGENT *Indexer, UDM_WIDEWORDLIST *result, UDM_WIDEWORD *uword)
{
  UDM_WIDEWORDLIST *synonyms;
  udm_rc_t rc= UdmAllForms2(Indexer, result, uword);

  if (rc != UDM_OK)
    return rc;

  if (UdmVarListFindBool(&Indexer->Conf->Vars, "tl", UDM_FALSE))
  {
    UDM_TRANSLIT_TABLE *tables[]= {&tr_cyr_lat, &tr_lat_cyr};
    size_t t;
    for (t= 0; t < sizeof(tables) / sizeof(tables[0]); t++)
    {
      UDM_WIDEWORDLIST translit;
      UdmWideWordListInit(&translit);
      UdmAllFormsTranslit(Indexer, &translit, uword, tables[t]);
      if (translit.nwords)
      {
        /* Only the first transliterated word is used */
        UDM_WIDEWORD *first= &translit.Word[0];
        UdmWideWordListAdd(result, first);
        UdmAllForms2(Indexer, result, first);
      }
      UdmWideWordListFree(&translit);
    }
  }

  if (UdmVarListFindBool(&Indexer->Conf->Vars, "Dehyphenate", UDM_FALSE))
  {
    UdmAllFormsDehyphenate(Indexer, result, uword);
  }

  if (UdmVarListFindBool(&Indexer->Conf->Vars, "HyphenateNumbers", UDM_FALSE))
  {
    UdmAllFormsHyphenateNumbers(Indexer, result, uword);
  }

  /* Synonyms are switched off: nothing more to do */
  if (!UdmVarListFindInt(&Indexer->Conf->Vars, "sy", 1))
    return rc;

  /*
    uword is possibly a normalized form itself: find its synonyms
    and run every synonym through ispell to get its word forms.
  */
  synonyms= UdmSynonymListListFind(&Indexer->Conf->Synonym, uword);
  if (synonyms)
  {
    size_t i;
    for (i= 0; i < synonyms->nwords; i++)
    {
      UDM_WIDEWORD *syn= &synonyms->Word[i];
      UdmWideWordListAdd(result, syn);
      UdmAllForms2(Indexer, result, syn);
    }
    UdmWideWordListFree(synonyms);
    UdmFree(synonyms);
  }

  return rc;
}


/*
  Appends synonym parts into string, recursively

  A                     - Agent (only referenced by the disabled logging)
  SL                    - List to find synonyms in
  strbeg                - The very beginning of the complex synonym
  str                   - The beginning of the next part, we'll write here
  str_reminder_size     - Space available at str, in bytes
  WWL                   - We'll add found synonyms here
  nwords                - number of words to check (to avoid endless loop)
  order                 - Word "order" to start building phrase at
  phrase_length_limit   - maximum possible complex synonym length
  phrase_length_current - current length of synonym which was
                          collected on the previous recursion step.

  Always returns UDM_OK.
*/
static udm_rc_t
UdmComplexSynonymAdd(UDM_AGENT *A, UDM_SYNONYMLIST *SL,
                     char *strbeg, char *str, size_t str_reminder_size,
                     UDM_WIDEWORDLIST *WWL, size_t nwords,
                     size_t order,
                     size_t phrase_length_limit,
                     size_t phrase_length_current)
{
  size_t i;
  /* Only the first nwords entries are scanned; entries appended below are not */
  for (i= 0; i < nwords; i++)
  {
    /*
      The entry is copied by value; WWL is appended to further down.
      Presumably the copy protects against WWL->Word being moved - confirm.
    */
    UDM_WIDEWORD W= WWL->Word[i];
    if (W.Param.order == order)
    {
      int need_more= (phrase_length_limit > 0);
      /* Append the word, with a leading space for every part but the first */
      size_t len= udm_snprintf(str, str_reminder_size, "%s%s",
                               phrase_length_current > 0 ? " " : "", W.Word.str);
      if (need_more)
      {
        /* First try all longer phrases that continue with the next order */
        UdmComplexSynonymAdd(A, SL,
                             strbeg, str + len, str_reminder_size - len,
                             WWL, nwords, order + 1,
                             phrase_length_limit - 1,
                             phrase_length_current + 1);
        str[len]= '\0'; /* Remove trailing space and trailing parts */
      }

      if (phrase_length_current > 0) /* Skip single word (on first level) */
      {
        UDM_WIDEWORDLIST Tmp;
        UDM_WIDEWORD WW;
        UdmWideWordListInit(&Tmp);
        /* Look up the whole phrase collected so far, starting at strbeg */
        WW= W;
        WW.Word.str= strbeg;
        WW.Word.length= strlen(strbeg);
        UdmSynonymListFind(&Tmp, SL, &WW);
        /*
        UdmLog(A, UDM_LOG_DEBUG, "ComplexSynonym: phrlen=%d: '%s' nfound=%d",
               phrase_length_current, strbeg, Tmp.nwords);
        */
        {
          size_t wrd;
          for (wrd= 0; wrd < Tmp.nwords; wrd++)
          {
            if ((WW.Param.order_extra_width= UdmMultiWordPhraseLength(WW.Word.str)))
            {
              /* The synonym is anchored at the order of the first phrase word */
              UDM_ASSERT(W.Param.order >= phrase_length_current);
              WW.Param.order= W.Param.order - phrase_length_current;
              WW.Param.order_extra_width++;
            }
            else
              WW.Param.order= order; /* Should not really happen */
            /*
            UdmLog(A, UDM_LOG_DEBUG, "FOUND: '%s' width=%d",
                   Tmp.Word[wrd].word, WW.order_width);
            */
            /* The found synonym text is added with the parameters prepared in WW */
            UdmWideWordListAddLike(WWL, &WW.Param, Tmp.Word[wrd].Word.str);
          }
        }
        UdmWideWordListFree(&Tmp);
      }
    }
  }
  return UDM_OK;
}


/*
  Add many-to-one and many-to-many synonyms.

  For every synonym list of the agent configuration that has a positive
  max_phrase_length, UdmComplexSynonymAdd() is started once for each
  word order below WWL->nuniq.

  A   - agent; A->Conf->Synonym is the set of synonym lists
  WWL - word list that is scanned and that receives the found synonyms

  Always returns UDM_OK.
*/
udm_rc_t
UdmComplexSynonyms(UDM_AGENT *A, UDM_WIDEWORDLIST *WWL)
{
  /* Only the words present on entry are scanned, to avoid an endless loop */
  const size_t initial_nwords= WWL->nwords;
  UDM_SYNONYMLISTLIST *lists= &A->Conf->Synonym;
  size_t idx= 0;

  while (idx < lists->nitems)
  {
    UDM_SYNONYMLIST *list= &lists->Item[idx++];
    char phrase[256]= "";
    size_t order;

    /* Lists without multi-word phrases have nothing to contribute */
    if (!(list->max_phrase_length > 0))
      continue;

    for (order= 0; order < WWL->nuniq; order++)
    {
      UdmComplexSynonymAdd(A, list, phrase, phrase, sizeof(phrase),
                           WWL, initial_nwords, order,
                           list->max_phrase_length, 0);
    }
  }
  return UDM_OK;
}