1 /**
 * @file rule.c (auto surname & areaname & special group)
3  * @author Hightman Mar
4  * @editor set number ; syntax on ; set autoindent ; set tabstop=4 (vim)
5  * $Id$
6  */
7 
8 #ifdef HAVE_CONFIG_H
9 #    include "config.h"
10 #endif
11 
12 #ifdef WIN32
13 #    include "config_win32.h"
14 #endif
15 
16 #include "rule.h"
17 #include <stdio.h>
18 #include <stdlib.h>
19 #include <string.h>
20 
/**
 * Find the slot in r->items whose name equals `name`, ignoring case.
 *
 * The scan walks the item table from the start and stops at the first slot
 * with an empty name, which marks the end of the used part of the table,
 * or after SCWS_RULE_MAX slots.
 *
 * @param r     rule set to search (must not be NULL)
 * @param name  section name to look for
 * @return the zero-based slot index, or -1 when no used slot matches
 */
static inline int _rule_index_get(rule_t r, const char *name)
{
	int idx = 0;

	while (idx < SCWS_RULE_MAX && r->items[idx].name[0] != '\0')
	{
		if (strcasecmp(r->items[idx].name, name) == 0)
			return idx;
		idx++;
	}
	return -1;
}
34 
scws_rule_new(const char * fpath,unsigned char * mblen)35 rule_t scws_rule_new(const char *fpath, unsigned char *mblen)
36 {
37 	FILE *fp;
38 	rule_t r;
39 	rule_item_t cr;
40 	int i, j, rbl, aflag;
41 	rule_attr_t a, rtail;
42 	unsigned char buf[512], *str, *ptr, *qtr;
43 
44 	/* loaded or open file failed */
45 	if ((fp = fopen(fpath, "r")) == NULL)
46 		return NULL;
47 
48 	/* alloc the memory */
49 	r = (rule_t) malloc(sizeof(rule_st));
50 	memset(r, 0, sizeof(rule_st));
51 	r->ref = 1;
52 
53 	/* quick scan to add the name to list */
54 	i = j = rbl = aflag = 0;
55 	while (fgets(buf, sizeof(buf) - 1, fp))
56 	{
57 		if (buf[0] != '[' || !(ptr = strchr(buf, ']')))
58 			continue;
59 
60 		str = buf + 1;
61 		*ptr = '\0';
62 		if (ptr == str || (ptr - str) > 15 || !strcasecmp(str, "attrs"))
63 			continue;
64 
65 		if (_rule_index_get(r, str) >= 0)
66 			continue;
67 
68 		strcpy(r->items[i].name, str);
69 		r->items[i].tf = 5.0;
70 		r->items[i].idf = 3.5;
71 		strncpy(r->items[i].attr, "un", 2);
72 		if (!strcasecmp(str, "special"))
73 			r->items[i].bit = SCWS_RULE_SPECIAL;
74 		else if (!strcasecmp(str, "nostats"))
75 			r->items[i].bit = SCWS_RULE_NOSTATS;
76 		else
77 		{
78 			r->items[i].bit = (1 << j);
79 			j++;
80 		}
81 
82 		if (++i >= SCWS_RULE_MAX)
83 			break;
84 	}
85 	rewind(fp);
86 
87 	/* load the tree data */
88 	if ((r->tree = xtree_new(0, 1)) == NULL)
89 	{
90 		free(r);
91 		return NULL;
92 	}
93 	cr = NULL;
94 	while (fgets(buf, sizeof(buf) - 1, fp))
95 	{
96 		if (buf[0] == ';')
97 			continue;
98 
99 		if (buf[0] == '[')
100 		{
101 			cr = NULL;
102 			str = buf + 1;
103 			aflag = 0;
104 			if ((ptr = strchr(str, ']')) != NULL)
105 			{
106 				*ptr = '\0';
107 				if (!strcasecmp(str, "attrs"))
108 				{
109 					aflag = 1;
110 				}
111 				else if ((i = _rule_index_get(r, str)) >= 0)
112 				{
113 					rbl = 1; /* default read by line = yes */
114 					cr = &r->items[i];
115 				}
116 			}
117 			continue;
118 		}
119 
120 		/* attr flag open? */
121 		if (aflag == 1)
122 		{
123 			/* parse the attr line */
124 			str = buf;
125 			while (*str == ' ' || *str == '\t') str++;
126 			if ((ptr = strchr(str, '+')) == NULL) continue;
127 			*ptr++ = '\0';
128 			if ((qtr = strchr(ptr, '=')) == NULL) continue;
129 			*qtr++ = '\0';
130 
131 			/* create new memory */
132 			a = (rule_attr_t) malloc(sizeof(struct scws_rule_attr));
133 			memset(a, 0, sizeof(struct scws_rule_attr));
134 
135 			/* get ratio */
136 			while (*qtr == ' ' || *qtr == '\t') qtr++;
137 			a->ratio = (short) atoi(qtr);
138 			if (a->ratio < 1)
139 				a->ratio = 1;
140 			a->npath[0] = a->npath[1] = 0xff;
141 
142 			/* read attr1 & npath1? */
143 			a->attr1[0] = *str++;
144 			if (*str && *str != '(' && *str != ' ' && *str != '\t')
145 				a->attr1[1] = *str++;
146 			while (*str && *str != '(') str++;
147 			if (*str == '(')
148 			{
149 				str++;
150 				if ((qtr = strchr(str, ')')) != NULL)
151 				{
152 					*qtr = '\0';
153 					a->npath[0] = (unsigned char) atoi(str);
154 					if (a->npath[0] > 0)
155 						a->npath[0]--;
156 					else
157 						a->npath[0] = 0xff;
158 				}
159 			}
160 
161 			/* read attr1 & npath2? */
162 			str = ptr;
163 			while (*str == ' ' || *str == '\t') str++;
164 			a->attr2[0] = *str++;
165 			if (*str && *str != '(' && *str != ' ' && *str != '\t')
166 				a->attr2[1] = *str++;
167 			while (*str && *str != '(') str++;
168 			if (*str == '(')
169 			{
170 				str++;
171 				if ((qtr = strchr(str, ')')) != NULL)
172 				{
173 					*qtr = '\0';
174 					a->npath[1] = (unsigned char) atoi(str);
175 					if (a->npath[1] > 0)
176 						a->npath[1]--;
177 					else
178 						a->npath[1] = 0xff;
179 				}
180 			}
181 
182 			//printf("%c%c(%d)+%c%c(%d)=%d\n", a->attr1[0], a->attr1[1] ? a->attr1[1] : ' ', a->npath[0],
183 			//	a->attr2[0], a->attr2[1] ? a->attr2[1] : ' ', a->npath[1], a->ratio);
184 
185 			/* append to the chain list */
186 			if (r->attr == NULL)
187 				r->attr = rtail = a;
188 			else
189 			{
190 				rtail->next = a;
191 				rtail = a;
192 			}
193 
194 			continue;
195 		}
196 
197 		if (cr == NULL)
198 			continue;
199 
200 		/* param set: line|znum|include|exclude|type|tf|idf|attr */
201 		if (buf[0] == ':')
202 		{
203 			str = buf + 1;
204 			if (!(ptr = strchr(str, '=')))
205 				continue;
206 			while (*str == ' ' || *str == '\t') str++;
207 
208 			qtr = ptr + 1;
209 			while (ptr > str && (ptr[-1] == ' ' || ptr[-1] == '\t')) ptr--;
210 			*ptr = '\0';
211 			ptr = str;
212 			str = qtr;
213 			while (*str == ' ' || *str == '\t') str++;
214 
215 			if (!strcmp(ptr, "line"))
216 				rbl = (*str == 'N' || *str == 'n') ? 0 : 1;
217 			else if (!strcmp(ptr, "tf"))
218 				cr->tf = (float) atof(str);
219 			else if (!strcmp(ptr, "idf"))
220 				cr->idf = (float) atof(str);
221 			else if (!strcmp(ptr, "attr"))
222 				strncpy(cr->attr, str, 2);
223 			else if (!strcmp(ptr, "znum"))
224 			{
225 				if ((ptr = strchr(str, ',')) != NULL)
226 				{
227 					*ptr++ = '\0';
228 					while (*ptr == ' ' || *ptr == '\t') ptr++;
229 					cr->zmax = atoi(ptr);
230 					cr->flag |= SCWS_ZRULE_RANGE;
231 				}
232 				cr->zmin = atoi(str);
233 			}
234 			else if (!strcmp(ptr, "type"))
235 			{
236 				if (!strncmp(str, "prefix", 6))
237 					cr->flag |= SCWS_ZRULE_PREFIX;
238 				else if (!strncmp(str, "suffix", 6))
239 					cr->flag |= SCWS_ZRULE_SUFFIX;
240 			}
241 			else if (!strcmp(ptr, "include") || !strcmp(ptr, "exclude"))
242 			{
243 				unsigned int *clude;
244 
245 				if (!strcmp(ptr, "include"))
246 				{
247 					clude = &cr->inc;
248 					cr->flag |= SCWS_ZRULE_INCLUDE;
249 				}
250 				else
251 				{
252 					clude = &cr->exc;
253 					cr->flag |= SCWS_ZRULE_EXCLUDE;
254 				}
255 
256 				while ((ptr = strchr(str, ',')) != NULL)
257 				{
258 					while (ptr > str && (ptr[-1] == '\t' || ptr[-1] == ' ')) ptr--;
259 					*ptr = '\0';
260 					if ((i = _rule_index_get(r, str)) >= 0)
261 						*clude |= r->items[i].bit;
262 
263 					str = ptr + 1;
264 					while (*str == ' ' || *str == '\t' || *str == ',') str++;
265 				}
266 
267 				ptr = strlen(str) + str;
268 				while (ptr > str && strchr(" \t\r\n", ptr[-1])) ptr--;
269 				*ptr = '\0';
270 				if (ptr > str && (i = _rule_index_get(r, str)))
271 					*clude |= r->items[i].bit;
272 			}
273 			continue;
274 		}
275 
276 		/* read the entries */
277 		str = buf;
278 		while (*str == ' ' || *str == '\t') str++;
279 		ptr = str + strlen(str);
280 		while (ptr > str && strchr(" \t\r\n", ptr[-1])) ptr--;
281 		*ptr = '\0';
282 
283 		/* emptry line */
284 		if (ptr == str)
285 			continue;
286 
287 		if (rbl)
288 			xtree_nput(r->tree, cr, sizeof(struct scws_rule_item), str, ptr - str);
289 		else
290 		{
291 			while (str < ptr)
292 			{
293 				j = mblen[(*str)];
294 
295 #ifdef DEBUG
296 				/* try to check repeat */
297 				if ((i = (int) xtree_nget(r->tree, str, j, NULL)) != 0)
298 					fprintf(stderr, "Reapeat word on %s|%s: %.*s\n", cr->name, ((rule_item_t) i)->name, j, str);
299 #endif
300 				xtree_nput(r->tree, cr, sizeof(struct scws_rule_item), str, j);
301 				str += j;
302 			}
303 		}
304 	}
305 	fclose(fp);
306 
307 	/* optimize the tree */
308 	xtree_optimize(r->tree);
309 	return r;
310 }
311 
312 /* fork rule */
scws_rule_fork(rule_t r)313 rule_t scws_rule_fork(rule_t r)
314 {
315 	if (r != NULL)
316 		r->ref++;
317 	return r;
318 }
319 
320 /* free rule */
scws_rule_free(rule_t r)321 void scws_rule_free(rule_t r)
322 {
323 	if (r)
324 	{
325 		r->ref--;
326 		if (r->ref == 0)
327 		{
328 			rule_attr_t a, b;
329 
330 			xtree_free(r->tree);
331 			a = r->attr;
332 			while (a != NULL)
333 			{
334 				b = a;
335 				a = b->next;
336 				free(b);
337 			}
338 			free(r);
339 		}
340 	}
341 }
342 
343 /* get the rule */
scws_rule_get(rule_t r,const char * str,int len)344 rule_item_t scws_rule_get(rule_t r, const char *str, int len)
345 {
346 	if (!r)
347 		return NULL;
348 
349 	return((rule_item_t) xtree_nget(r->tree, str, len, NULL));
350 }
351 
352 /* check the bit with str */
scws_rule_checkbit(rule_t r,const char * str,int len,unsigned int bit)353 int scws_rule_checkbit(rule_t r, const char *str, int len, unsigned int bit)
354 {
355 	rule_item_t ri;
356 
357 	if (!r)
358 		return 0;
359 
360 	ri = (rule_item_t) xtree_nget(r->tree, str, len, NULL);
361 	if ((ri != NULL) && (ri->bit & bit))
362 		return 1;
363 
364 	return 0;
365 }
366 
367 /* get rule attr x */
368 #define	EQUAL_RULE_ATTR(x,y)	((y[0]=='*'||y[0]==x[0])&&(y[1]=='\0'||y[1]==x[1]))
369 #define	EQUAL_RULE_NPATH(x,y)	((y[0]==0xff||y[0]==x[0])&&(y[1]==0xff||y[1]==x[1]))
370 
scws_rule_attr_ratio(rule_t r,const char * attr1,const char * attr2,const unsigned char * npath)371 int scws_rule_attr_ratio(rule_t r, const char *attr1, const char *attr2, const unsigned char *npath)
372 {
373 	rule_attr_t a;
374 	int ret = 1;
375 
376 	if (!r || (a = r->attr) == NULL)
377 		return ret;
378 
379 	while (a != NULL)
380 	{
381 		if (EQUAL_RULE_ATTR(attr1, a->attr1) && EQUAL_RULE_ATTR(attr2, a->attr2) && EQUAL_RULE_NPATH(npath, a->npath))
382 		{
383 			ret = (int) a->ratio;
384 			break;
385 		}
386 		a = a->next;
387 	}
388 	return ret;
389 }
390 
391 #undef EQUAL_RULE_ATTR
392 #undef EQUAL_RULE_NPATH
393 
394 /* check the rule */
scws_rule_check(rule_t r,rule_item_t cr,const char * str,int len)395 int scws_rule_check(rule_t r, rule_item_t cr, const char *str, int len)
396 {
397 	if (!r)
398 		return 0;
399 
400 	if ((cr->flag & SCWS_ZRULE_INCLUDE) && !scws_rule_checkbit(r, str, len, cr->inc))
401 		return 0;
402 
403 	if ((cr->flag & SCWS_ZRULE_EXCLUDE) && scws_rule_checkbit(r, str, len, cr->exc))
404 		return 0;
405 
406 	return 1;
407 }
408