/**
 * @file rule.c (auto surname & areaname & special group)
 * @author Hightman Mar
 * @editor set number ; syntax on ; set autoindent ; set tabstop=4 (vim)
 * $Id$
 */
7
8 #ifdef HAVE_CONFIG_H
9 # include "config.h"
10 #endif
11
12 #ifdef WIN32
13 # include "config_win32.h"
14 #endif
15
16 #include "rule.h"
17 #include <stdio.h>
18 #include <stdlib.h>
19 #include <string.h>
20
/**
 * Look up a rule slot by name (case-insensitive).
 *
 * Slots are scanned in order; the first slot whose name is empty ends the
 * scan, and at most SCWS_RULE_MAX slots are examined.
 *
 * @param r    rule set whose items[] table is searched
 * @param name section name to look for
 * @return index of the matching slot, or -1 when there is none
 */
static inline int _rule_index_get(rule_t r, const char *name)
{
	int idx = 0;

	while (idx < SCWS_RULE_MAX && r->items[idx].name[0] != '\0')
	{
		if (strcasecmp(r->items[idx].name, name) == 0)
			return idx;
		idx++;
	}
	return -1;
}
34
scws_rule_new(const char * fpath,unsigned char * mblen)35 rule_t scws_rule_new(const char *fpath, unsigned char *mblen)
36 {
37 FILE *fp;
38 rule_t r;
39 rule_item_t cr;
40 int i, j, rbl, aflag;
41 rule_attr_t a, rtail;
42 unsigned char buf[512], *str, *ptr, *qtr;
43
44 /* loaded or open file failed */
45 if ((fp = fopen(fpath, "r")) == NULL)
46 return NULL;
47
48 /* alloc the memory */
49 r = (rule_t) malloc(sizeof(rule_st));
50 memset(r, 0, sizeof(rule_st));
51 r->ref = 1;
52
53 /* quick scan to add the name to list */
54 i = j = rbl = aflag = 0;
55 while (fgets(buf, sizeof(buf) - 1, fp))
56 {
57 if (buf[0] != '[' || !(ptr = strchr(buf, ']')))
58 continue;
59
60 str = buf + 1;
61 *ptr = '\0';
62 if (ptr == str || (ptr - str) > 15 || !strcasecmp(str, "attrs"))
63 continue;
64
65 if (_rule_index_get(r, str) >= 0)
66 continue;
67
68 strcpy(r->items[i].name, str);
69 r->items[i].tf = 5.0;
70 r->items[i].idf = 3.5;
71 strncpy(r->items[i].attr, "un", 2);
72 if (!strcasecmp(str, "special"))
73 r->items[i].bit = SCWS_RULE_SPECIAL;
74 else if (!strcasecmp(str, "nostats"))
75 r->items[i].bit = SCWS_RULE_NOSTATS;
76 else
77 {
78 r->items[i].bit = (1 << j);
79 j++;
80 }
81
82 if (++i >= SCWS_RULE_MAX)
83 break;
84 }
85 rewind(fp);
86
87 /* load the tree data */
88 if ((r->tree = xtree_new(0, 1)) == NULL)
89 {
90 free(r);
91 return NULL;
92 }
93 cr = NULL;
94 while (fgets(buf, sizeof(buf) - 1, fp))
95 {
96 if (buf[0] == ';')
97 continue;
98
99 if (buf[0] == '[')
100 {
101 cr = NULL;
102 str = buf + 1;
103 aflag = 0;
104 if ((ptr = strchr(str, ']')) != NULL)
105 {
106 *ptr = '\0';
107 if (!strcasecmp(str, "attrs"))
108 {
109 aflag = 1;
110 }
111 else if ((i = _rule_index_get(r, str)) >= 0)
112 {
113 rbl = 1; /* default read by line = yes */
114 cr = &r->items[i];
115 }
116 }
117 continue;
118 }
119
120 /* attr flag open? */
121 if (aflag == 1)
122 {
123 /* parse the attr line */
124 str = buf;
125 while (*str == ' ' || *str == '\t') str++;
126 if ((ptr = strchr(str, '+')) == NULL) continue;
127 *ptr++ = '\0';
128 if ((qtr = strchr(ptr, '=')) == NULL) continue;
129 *qtr++ = '\0';
130
131 /* create new memory */
132 a = (rule_attr_t) malloc(sizeof(struct scws_rule_attr));
133 memset(a, 0, sizeof(struct scws_rule_attr));
134
135 /* get ratio */
136 while (*qtr == ' ' || *qtr == '\t') qtr++;
137 a->ratio = (short) atoi(qtr);
138 if (a->ratio < 1)
139 a->ratio = 1;
140 a->npath[0] = a->npath[1] = 0xff;
141
142 /* read attr1 & npath1? */
143 a->attr1[0] = *str++;
144 if (*str && *str != '(' && *str != ' ' && *str != '\t')
145 a->attr1[1] = *str++;
146 while (*str && *str != '(') str++;
147 if (*str == '(')
148 {
149 str++;
150 if ((qtr = strchr(str, ')')) != NULL)
151 {
152 *qtr = '\0';
153 a->npath[0] = (unsigned char) atoi(str);
154 if (a->npath[0] > 0)
155 a->npath[0]--;
156 else
157 a->npath[0] = 0xff;
158 }
159 }
160
161 /* read attr1 & npath2? */
162 str = ptr;
163 while (*str == ' ' || *str == '\t') str++;
164 a->attr2[0] = *str++;
165 if (*str && *str != '(' && *str != ' ' && *str != '\t')
166 a->attr2[1] = *str++;
167 while (*str && *str != '(') str++;
168 if (*str == '(')
169 {
170 str++;
171 if ((qtr = strchr(str, ')')) != NULL)
172 {
173 *qtr = '\0';
174 a->npath[1] = (unsigned char) atoi(str);
175 if (a->npath[1] > 0)
176 a->npath[1]--;
177 else
178 a->npath[1] = 0xff;
179 }
180 }
181
182 //printf("%c%c(%d)+%c%c(%d)=%d\n", a->attr1[0], a->attr1[1] ? a->attr1[1] : ' ', a->npath[0],
183 // a->attr2[0], a->attr2[1] ? a->attr2[1] : ' ', a->npath[1], a->ratio);
184
185 /* append to the chain list */
186 if (r->attr == NULL)
187 r->attr = rtail = a;
188 else
189 {
190 rtail->next = a;
191 rtail = a;
192 }
193
194 continue;
195 }
196
197 if (cr == NULL)
198 continue;
199
200 /* param set: line|znum|include|exclude|type|tf|idf|attr */
201 if (buf[0] == ':')
202 {
203 str = buf + 1;
204 if (!(ptr = strchr(str, '=')))
205 continue;
206 while (*str == ' ' || *str == '\t') str++;
207
208 qtr = ptr + 1;
209 while (ptr > str && (ptr[-1] == ' ' || ptr[-1] == '\t')) ptr--;
210 *ptr = '\0';
211 ptr = str;
212 str = qtr;
213 while (*str == ' ' || *str == '\t') str++;
214
215 if (!strcmp(ptr, "line"))
216 rbl = (*str == 'N' || *str == 'n') ? 0 : 1;
217 else if (!strcmp(ptr, "tf"))
218 cr->tf = (float) atof(str);
219 else if (!strcmp(ptr, "idf"))
220 cr->idf = (float) atof(str);
221 else if (!strcmp(ptr, "attr"))
222 strncpy(cr->attr, str, 2);
223 else if (!strcmp(ptr, "znum"))
224 {
225 if ((ptr = strchr(str, ',')) != NULL)
226 {
227 *ptr++ = '\0';
228 while (*ptr == ' ' || *ptr == '\t') ptr++;
229 cr->zmax = atoi(ptr);
230 cr->flag |= SCWS_ZRULE_RANGE;
231 }
232 cr->zmin = atoi(str);
233 }
234 else if (!strcmp(ptr, "type"))
235 {
236 if (!strncmp(str, "prefix", 6))
237 cr->flag |= SCWS_ZRULE_PREFIX;
238 else if (!strncmp(str, "suffix", 6))
239 cr->flag |= SCWS_ZRULE_SUFFIX;
240 }
241 else if (!strcmp(ptr, "include") || !strcmp(ptr, "exclude"))
242 {
243 unsigned int *clude;
244
245 if (!strcmp(ptr, "include"))
246 {
247 clude = &cr->inc;
248 cr->flag |= SCWS_ZRULE_INCLUDE;
249 }
250 else
251 {
252 clude = &cr->exc;
253 cr->flag |= SCWS_ZRULE_EXCLUDE;
254 }
255
256 while ((ptr = strchr(str, ',')) != NULL)
257 {
258 while (ptr > str && (ptr[-1] == '\t' || ptr[-1] == ' ')) ptr--;
259 *ptr = '\0';
260 if ((i = _rule_index_get(r, str)) >= 0)
261 *clude |= r->items[i].bit;
262
263 str = ptr + 1;
264 while (*str == ' ' || *str == '\t' || *str == ',') str++;
265 }
266
267 ptr = strlen(str) + str;
268 while (ptr > str && strchr(" \t\r\n", ptr[-1])) ptr--;
269 *ptr = '\0';
270 if (ptr > str && (i = _rule_index_get(r, str)))
271 *clude |= r->items[i].bit;
272 }
273 continue;
274 }
275
276 /* read the entries */
277 str = buf;
278 while (*str == ' ' || *str == '\t') str++;
279 ptr = str + strlen(str);
280 while (ptr > str && strchr(" \t\r\n", ptr[-1])) ptr--;
281 *ptr = '\0';
282
283 /* emptry line */
284 if (ptr == str)
285 continue;
286
287 if (rbl)
288 xtree_nput(r->tree, cr, sizeof(struct scws_rule_item), str, ptr - str);
289 else
290 {
291 while (str < ptr)
292 {
293 j = mblen[(*str)];
294
295 #ifdef DEBUG
296 /* try to check repeat */
297 if ((i = (int) xtree_nget(r->tree, str, j, NULL)) != 0)
298 fprintf(stderr, "Reapeat word on %s|%s: %.*s\n", cr->name, ((rule_item_t) i)->name, j, str);
299 #endif
300 xtree_nput(r->tree, cr, sizeof(struct scws_rule_item), str, j);
301 str += j;
302 }
303 }
304 }
305 fclose(fp);
306
307 /* optimize the tree */
308 xtree_optimize(r->tree);
309 return r;
310 }
311
312 /* fork rule */
scws_rule_fork(rule_t r)313 rule_t scws_rule_fork(rule_t r)
314 {
315 if (r != NULL)
316 r->ref++;
317 return r;
318 }
319
320 /* free rule */
scws_rule_free(rule_t r)321 void scws_rule_free(rule_t r)
322 {
323 if (r)
324 {
325 r->ref--;
326 if (r->ref == 0)
327 {
328 rule_attr_t a, b;
329
330 xtree_free(r->tree);
331 a = r->attr;
332 while (a != NULL)
333 {
334 b = a;
335 a = b->next;
336 free(b);
337 }
338 free(r);
339 }
340 }
341 }
342
343 /* get the rule */
scws_rule_get(rule_t r,const char * str,int len)344 rule_item_t scws_rule_get(rule_t r, const char *str, int len)
345 {
346 if (!r)
347 return NULL;
348
349 return((rule_item_t) xtree_nget(r->tree, str, len, NULL));
350 }
351
352 /* check the bit with str */
scws_rule_checkbit(rule_t r,const char * str,int len,unsigned int bit)353 int scws_rule_checkbit(rule_t r, const char *str, int len, unsigned int bit)
354 {
355 rule_item_t ri;
356
357 if (!r)
358 return 0;
359
360 ri = (rule_item_t) xtree_nget(r->tree, str, len, NULL);
361 if ((ri != NULL) && (ri->bit & bit))
362 return 1;
363
364 return 0;
365 }
366
367 /* get rule attr x */
368 #define EQUAL_RULE_ATTR(x,y) ((y[0]=='*'||y[0]==x[0])&&(y[1]=='\0'||y[1]==x[1]))
369 #define EQUAL_RULE_NPATH(x,y) ((y[0]==0xff||y[0]==x[0])&&(y[1]==0xff||y[1]==x[1]))
370
scws_rule_attr_ratio(rule_t r,const char * attr1,const char * attr2,const unsigned char * npath)371 int scws_rule_attr_ratio(rule_t r, const char *attr1, const char *attr2, const unsigned char *npath)
372 {
373 rule_attr_t a;
374 int ret = 1;
375
376 if (!r || (a = r->attr) == NULL)
377 return ret;
378
379 while (a != NULL)
380 {
381 if (EQUAL_RULE_ATTR(attr1, a->attr1) && EQUAL_RULE_ATTR(attr2, a->attr2) && EQUAL_RULE_NPATH(npath, a->npath))
382 {
383 ret = (int) a->ratio;
384 break;
385 }
386 a = a->next;
387 }
388 return ret;
389 }
390
391 #undef EQUAL_RULE_ATTR
392 #undef EQUAL_RULE_NPATH
393
394 /* check the rule */
scws_rule_check(rule_t r,rule_item_t cr,const char * str,int len)395 int scws_rule_check(rule_t r, rule_item_t cr, const char *str, int len)
396 {
397 if (!r)
398 return 0;
399
400 if ((cr->flag & SCWS_ZRULE_INCLUDE) && !scws_rule_checkbit(r, str, len, cr->inc))
401 return 0;
402
403 if ((cr->flag & SCWS_ZRULE_EXCLUDE) && scws_rule_checkbit(r, str, len, cr->exc))
404 return 0;
405
406 return 1;
407 }
408