1 /* Copyright (C) 2000-2015 Lavtech.com corp. All rights reserved.
2
3 This program is free software; you can redistribute it and/or modify
4 it under the terms of the GNU General Public License as published by
5 the Free Software Foundation; either version 2 of the License, or
6 (at your option) any later version.
7
8 This program is distributed in the hope that it will be useful,
9 but WITHOUT ANY WARRANTY; without even the implied warranty of
10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 GNU General Public License for more details.
12
13 You should have received a copy of the GNU General Public License
14 along with this program; if not, write to the Free Software
15 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
16 */
17
18 #include "udm_config.h"
19
20 #include <stdio.h>
21 #include <stdlib.h>
22 #include <string.h>
23 #include <sys/types.h>
24 #include <errno.h>
25
26 #include "udm_common.h"
27 #include "udm_utils.h"
28 #include "udm_stopwords.h"
29
30
31 /*
32 UDM_STOPWORD
33 *UdmStopListFind(UDM_STOPLIST *List, const char *word, const char *lang)
34 {
35 int low= 0;
36 int high= List->nstopwords - 1;
37
38 if(!List->StopWord)
39 return(0);
40
41 while (low <= high)
42 {
43 int middle = (low + high) / 2;
44 int match = strcmp(List->StopWord[middle].word,word);
45 if (match < 0)
46 {
47 low = middle + 1;
48 }
49 else if (match > 0)
50 {
51 high = middle - 1;
52 }
53 else
54 {
55 if (lang==NULL || *lang=='\0' ||
56 !strncasecmp(List->StopWord[middle].lang, lang,
57 strlen(List->StopWord[middle].lang)))
58 return(&List->StopWord[middle]);
59 return NULL;
60 }
61 }
62 return NULL;
63 }
64 */
65
66 static int
cmpstop(const void * s1,const void * s2)67 cmpstop(const void *s1,const void *s2)
68 {
69 return(strcmp(((const UDM_STOPWORD*)s1)->word,((const UDM_STOPWORD*)s2)->word));
70 }
71
72
73 static void
UdmStopListInit(UDM_STOPLIST * List)74 UdmStopListInit(UDM_STOPLIST *List)
75 {
76 bzero((void*) List, sizeof(UDM_STOPLIST));
77 }
78
79
80 void
UdmStopListSort(UDM_STOPLIST * List)81 UdmStopListSort(UDM_STOPLIST *List)
82 {
83 /* Sort stoplist to run binary search later */
84 UdmSort(List->StopWord,List->nstopwords,sizeof(UDM_STOPWORD),cmpstop);
85 }
86
87
88 udm_rc_t
UdmStopListAdd(UDM_STOPLIST * List,UDM_STOPWORD * stopword)89 UdmStopListAdd(UDM_STOPLIST *List,UDM_STOPWORD * stopword)
90 {
91 size_t j;
92
93 /*
94 If the word is already in list, we will not add it again
95 But mark it as "international word", i.e. a word without language
96 It will allow to avoid troubles with language guesser
97 */
98
99 for (j= 0; j < List->nstopwords; j++)
100 {
101 if (!strcmp(List->StopWord[j].word, stopword->word))
102 {
103 return UDM_OK;
104 }
105 }
106
107 List->StopWord= (UDM_STOPWORD *)UdmRealloc(List->StopWord,(List->nstopwords+1)*sizeof(UDM_STOPWORD));
108 List->StopWord[List->nstopwords].word= (char*)UdmStrdup(stopword->word);
109 List->nstopwords++;
110
111 return UDM_ERROR;
112 }
113
114
115 void
UdmStopListFree(UDM_STOPLIST * List)116 UdmStopListFree(UDM_STOPLIST *List)
117 {
118 size_t i;
119 for(i= 0; i < List->nstopwords; i++)
120 {
121 UDM_FREE(List->StopWord[i].word);
122 }
123 UDM_FREE(List->StopWord);
124 List->nstopwords= 0;
125 }
126
127
128 UDM_API(udm_rc_t)
UdmStopListLoad(UDM_ENV * Conf,const char * fname)129 UdmStopListLoad(UDM_ENV *Conf,const char *fname)
130 {
131 char str[1024];
132 char *lasttok, *lwrd;
133 FILE *stopfile;
134 UDM_STOPWORD stopword;
135 UDM_CHARSET *cs= NULL;
136 UDM_CONV cnv;
137 UDM_STOPLIST StopList;
138
139 UdmStopListInit(&StopList);
140
141 if (!(stopfile=fopen(fname,"r")))
142 {
143 udm_snprintf(Conf->errstr, sizeof(Conf->errstr), "Can't open stopwords file '%s' (%s)", fname, strerror(errno));
144 return UDM_ERROR;
145 }
146 if ((lwrd = (char*)UdmMalloc(Conf->WordParam.max_word_len + 1)) == NULL)
147 return UDM_ERROR;
148
149 bzero((void*)&stopword, sizeof(stopword));
150
151 while (fgets(str,sizeof(str),stopfile))
152 {
153 if(!str[0])continue;
154 if(str[0]=='#')continue;
155
156 if(!strncmp(str,"Charset:",8))
157 {
158 char *charset= udm_strtok_r(str + 8, " \t\n\r", &lasttok);
159 if (charset)
160 udm_snprintf(StopList.cset, UDM_STOPLIST_CSETLEN, "%s", charset);
161 }
162 else if (!strncmp(str,"Language:",9))
163 {
164 char *lang= udm_strtok_r(str + 9, " \t\n\r", &lasttok);
165 if (lang)
166 udm_snprintf(StopList.lang, UDM_STOPLIST_LANGLEN, "%s", lang);
167 }
168 else if ((stopword.word= udm_strtok_r(str, "\t\n\r", &lasttok)))
169 {
170
171 if(!cs)
172 {
173 if (!StopList.cset[0])
174 {
175 sprintf(Conf->errstr,"No charset definition in stopwords file '%s'", fname);
176 UDM_FREE(lwrd);
177 return UDM_ERROR;
178 }
179 else
180 {
181 if(!(cs= UdmGetCharSet(StopList.cset)))
182 {
183 udm_snprintf(Conf->errstr, sizeof(Conf->errstr),
184 "Unknown charset '%s' in stopwords file '%s'",
185 StopList.cset, fname);
186 UDM_FREE(lwrd);
187 return UDM_ERROR;
188 }
189 UdmConvInit(&cnv, cs, Conf->lcs);
190 }
191 }
192
193 UdmConvHTML(&cnv, lwrd, Conf->WordParam.max_word_len,
194 stopword.word, strlen(stopword.word) + 1);
195 lwrd[Conf->WordParam.max_word_len]= '\0';
196 stopword.word= lwrd;
197 UdmStopListAdd(&StopList, &stopword);
198 }
199 }
200 fclose(stopfile);
201 UdmStopListSort(&StopList);
202 udm_snprintf(StopList.fname,UDM_STOPLIST_FILELEN, "%s", fname);
203 UDM_FREE(lwrd);
204 return UdmStopListListAdd(&Conf->StopWord, &StopList);
205 }
206
207
208 void
UdmStopListListInit(UDM_STOPLISTLIST * Lst)209 UdmStopListListInit(UDM_STOPLISTLIST *Lst)
210 {
211 bzero((void*)Lst, sizeof(Lst[0]));
212 }
213
214
215 void
UdmStopListListFree(UDM_STOPLISTLIST * Lst)216 UdmStopListListFree(UDM_STOPLISTLIST *Lst)
217 {
218 size_t i;
219 for (i= 0; i < Lst->nitems; i++)
220 UdmStopListFree(&Lst->Item[i]);
221 UDM_FREE(Lst->Item);
222 }
223
224
225 udm_rc_t
UdmStopListListAdd(UDM_STOPLISTLIST * Lst,UDM_STOPLIST * Item)226 UdmStopListListAdd(UDM_STOPLISTLIST *Lst, UDM_STOPLIST *Item)
227 {
228 size_t nbytes= (Lst->nitems + 1) * sizeof(UDM_STOPLIST);
229 if (!(Lst->Item= (UDM_STOPLIST*) UdmRealloc(Lst->Item, nbytes)))
230 return UDM_ERROR;
231 Lst->Item[Lst->nitems++]= Item[0];
232 return UDM_OK;
233 }
234
235
236 UDM_STOPWORD*
UdmStopListListFind(UDM_STOPLISTLIST * SLL,const char * word,const char * lang)237 UdmStopListListFind(UDM_STOPLISTLIST *SLL,
238 const char *word, const char *lang)
239 {
240 UDM_STOPWORD Key;
241 size_t i;
242 char tmp[128];
243 Key.word= tmp;
244
245 udm_snprintf(tmp, sizeof(tmp), "%s", word);
246
247 for (i= 0; i < SLL->nitems; i++)
248 {
249 UDM_STOPLIST *SL= &SLL->Item[i];
250 UDM_STOPWORD *S;
251 if (lang && lang[0] && strcmp(SL->lang, lang))
252 continue;
253 if ((S= (UDM_STOPWORD*) UdmBSearch(&Key, SL->StopWord, SL->nstopwords,
254 sizeof(UDM_STOPWORD), cmpstop)))
255 return S;
256 }
257
258 return NULL;
259 }
260