1 /* Copyright (C) 2000-2015 Lavtech.com corp. All rights reserved.
2 
3    This program is free software; you can redistribute it and/or modify
4    it under the terms of the GNU General Public License as published by
5    the Free Software Foundation; either version 2 of the License, or
6    (at your option) any later version.
7 
8    This program is distributed in the hope that it will be useful,
9    but WITHOUT ANY WARRANTY; without even the implied warranty of
10    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11    GNU General Public License for more details.
12 
13    You should have received a copy of the GNU General Public License
14    along with this program; if not, write to the Free Software
15    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
16 */
17 
18 #include "udm_config.h"
19 
20 #include <stdio.h>
21 #include <stdlib.h>
22 #include <string.h>
23 #include <sys/types.h>
24 #include <errno.h>
25 
26 #include "udm_common.h"
27 #include "udm_utils.h"
28 #include "udm_stopwords.h"
29 
30 
31 /*
32 UDM_STOPWORD
33 *UdmStopListFind(UDM_STOPLIST *List, const char *word, const char *lang)
34 {
35   int low= 0;
36   int high= List->nstopwords - 1;
37 
38   if(!List->StopWord)
39     return(0);
40 
41   while (low <= high)
42   {
43     int middle = (low + high) / 2;
44     int match = strcmp(List->StopWord[middle].word,word);
45     if (match < 0)
46     {
47       low = middle + 1;
48     }
49     else if (match > 0)
50     {
51       high = middle - 1;
52     }
53     else
54     {
55       if (lang==NULL || *lang=='\0' ||
56           !strncasecmp(List->StopWord[middle].lang, lang,
57                        strlen(List->StopWord[middle].lang)))
58         return(&List->StopWord[middle]);
59       return NULL;
60     }
61   }
62   return NULL;
63 }
64 */
65 
66 static int
cmpstop(const void * s1,const void * s2)67 cmpstop(const void *s1,const void *s2)
68 {
69   return(strcmp(((const UDM_STOPWORD*)s1)->word,((const UDM_STOPWORD*)s2)->word));
70 }
71 
72 
73 static void
UdmStopListInit(UDM_STOPLIST * List)74 UdmStopListInit(UDM_STOPLIST *List)
75 {
76   bzero((void*) List, sizeof(UDM_STOPLIST));
77 }
78 
79 
80 void
UdmStopListSort(UDM_STOPLIST * List)81 UdmStopListSort(UDM_STOPLIST *List)
82 {
83   /* Sort stoplist to run binary search later */
84   UdmSort(List->StopWord,List->nstopwords,sizeof(UDM_STOPWORD),cmpstop);
85 }
86 
87 
88 udm_rc_t
UdmStopListAdd(UDM_STOPLIST * List,UDM_STOPWORD * stopword)89 UdmStopListAdd(UDM_STOPLIST *List,UDM_STOPWORD * stopword)
90 {
91   size_t j;
92 
93   /*
94     If the word is already in list, we will not add it again
95     But mark it as "international word", i.e. a word without language
96     It will allow to avoid troubles with language guesser
97   */
98 
99   for (j= 0; j < List->nstopwords; j++)
100   {
101     if (!strcmp(List->StopWord[j].word, stopword->word))
102     {
103       return UDM_OK;
104     }
105   }
106 
107   List->StopWord= (UDM_STOPWORD *)UdmRealloc(List->StopWord,(List->nstopwords+1)*sizeof(UDM_STOPWORD));
108   List->StopWord[List->nstopwords].word= (char*)UdmStrdup(stopword->word);
109   List->nstopwords++;
110 
111   return UDM_ERROR;
112 }
113 
114 
115 void
UdmStopListFree(UDM_STOPLIST * List)116 UdmStopListFree(UDM_STOPLIST *List)
117 {
118   size_t i;
119   for(i= 0; i < List->nstopwords; i++)
120   {
121     UDM_FREE(List->StopWord[i].word);
122   }
123   UDM_FREE(List->StopWord);
124   List->nstopwords= 0;
125 }
126 
127 
128 UDM_API(udm_rc_t)
UdmStopListLoad(UDM_ENV * Conf,const char * fname)129 UdmStopListLoad(UDM_ENV *Conf,const char *fname)
130 {
131   char str[1024];
132   char *lasttok, *lwrd;
133   FILE *stopfile;
134   UDM_STOPWORD stopword;
135   UDM_CHARSET *cs= NULL;
136   UDM_CONV cnv;
137   UDM_STOPLIST StopList;
138 
139   UdmStopListInit(&StopList);
140 
141   if (!(stopfile=fopen(fname,"r")))
142   {
143     udm_snprintf(Conf->errstr, sizeof(Conf->errstr), "Can't open stopwords file '%s' (%s)", fname, strerror(errno));
144     return UDM_ERROR;
145   }
146   if ((lwrd = (char*)UdmMalloc(Conf->WordParam.max_word_len + 1)) == NULL)
147     return UDM_ERROR;
148 
149   bzero((void*)&stopword, sizeof(stopword));
150 
151   while (fgets(str,sizeof(str),stopfile))
152   {
153     if(!str[0])continue;
154     if(str[0]=='#')continue;
155 
156     if(!strncmp(str,"Charset:",8))
157     {
158       char *charset= udm_strtok_r(str + 8, " \t\n\r", &lasttok);
159       if (charset)
160         udm_snprintf(StopList.cset, UDM_STOPLIST_CSETLEN, "%s", charset);
161     }
162     else if (!strncmp(str,"Language:",9))
163     {
164       char *lang= udm_strtok_r(str + 9, " \t\n\r", &lasttok);
165       if (lang)
166         udm_snprintf(StopList.lang, UDM_STOPLIST_LANGLEN, "%s", lang);
167     }
168     else if ((stopword.word= udm_strtok_r(str, "\t\n\r", &lasttok)))
169     {
170 
171       if(!cs)
172       {
173         if (!StopList.cset[0])
174         {
175           sprintf(Conf->errstr,"No charset definition in stopwords file '%s'", fname);
176           UDM_FREE(lwrd);
177           return UDM_ERROR;
178         }
179         else
180         {
181           if(!(cs= UdmGetCharSet(StopList.cset)))
182           {
183             udm_snprintf(Conf->errstr, sizeof(Conf->errstr),
184                          "Unknown charset '%s' in stopwords file '%s'",
185                          StopList.cset, fname);
186             UDM_FREE(lwrd);
187             return UDM_ERROR;
188           }
189           UdmConvInit(&cnv, cs, Conf->lcs);
190         }
191       }
192 
193       UdmConvHTML(&cnv, lwrd, Conf->WordParam.max_word_len,
194                   stopword.word, strlen(stopword.word) + 1);
195       lwrd[Conf->WordParam.max_word_len]= '\0';
196       stopword.word= lwrd;
197       UdmStopListAdd(&StopList, &stopword);
198     }
199   }
200   fclose(stopfile);
201   UdmStopListSort(&StopList);
202   udm_snprintf(StopList.fname,UDM_STOPLIST_FILELEN, "%s", fname);
203   UDM_FREE(lwrd);
204   return UdmStopListListAdd(&Conf->StopWord, &StopList);
205 }
206 
207 
208 void
UdmStopListListInit(UDM_STOPLISTLIST * Lst)209 UdmStopListListInit(UDM_STOPLISTLIST *Lst)
210 {
211   bzero((void*)Lst, sizeof(Lst[0]));
212 }
213 
214 
215 void
UdmStopListListFree(UDM_STOPLISTLIST * Lst)216 UdmStopListListFree(UDM_STOPLISTLIST *Lst)
217 {
218   size_t i;
219   for (i= 0; i < Lst->nitems; i++)
220     UdmStopListFree(&Lst->Item[i]);
221   UDM_FREE(Lst->Item);
222 }
223 
224 
225 udm_rc_t
UdmStopListListAdd(UDM_STOPLISTLIST * Lst,UDM_STOPLIST * Item)226 UdmStopListListAdd(UDM_STOPLISTLIST *Lst, UDM_STOPLIST *Item)
227 {
228   size_t nbytes= (Lst->nitems + 1) * sizeof(UDM_STOPLIST);
229   if (!(Lst->Item= (UDM_STOPLIST*) UdmRealloc(Lst->Item, nbytes)))
230     return UDM_ERROR;
231   Lst->Item[Lst->nitems++]= Item[0];
232   return UDM_OK;
233 }
234 
235 
236 UDM_STOPWORD*
UdmStopListListFind(UDM_STOPLISTLIST * SLL,const char * word,const char * lang)237 UdmStopListListFind(UDM_STOPLISTLIST *SLL,
238                     const char *word, const char *lang)
239 {
240   UDM_STOPWORD Key;
241   size_t i;
242   char tmp[128];
243   Key.word= tmp;
244 
245   udm_snprintf(tmp, sizeof(tmp), "%s", word);
246 
247   for (i= 0; i < SLL->nitems; i++)
248   {
249     UDM_STOPLIST *SL= &SLL->Item[i];
250     UDM_STOPWORD *S;
251     if (lang && lang[0] && strcmp(SL->lang, lang))
252       continue;
253     if ((S= (UDM_STOPWORD*) UdmBSearch(&Key, SL->StopWord, SL->nstopwords,
254                                        sizeof(UDM_STOPWORD), cmpstop)))
255       return S;
256   }
257 
258   return NULL;
259 }
260