1 /*
2  * @file scws.c (core segment functions)
3  * @author Hightman Mar
4  * @editor set number ; syntax on ; set autoindent ; set tabstop=4 (vim)
5  * $Id  $
6  */
7 
8 #ifdef HAVE_CONFIG_H
9 #	include "config.h"
10 #endif
11 
12 #ifdef WIN32
13 #	include "config_win32.h"
14 #endif
15 
16 #include "scws.h"
17 #include "xdict.h"
18 #include "rule.h"
19 #include "charset.h"
20 #include "darray.h"
21 #include "xtree.h"
22 #include <stdio.h>
23 #include <math.h>
24 #include <stdlib.h>
25 #include <string.h>
26 
/*
 * Shorthand macros used throughout this file.
 * NOTE: SCWS_IS_SPECIAL, SCWS_IS_NOSTATS and SCWS_CHARLEN expand to
 * expressions that reference a variable named `s` (the engine handle),
 * so they can only be used where such a variable is in scope.
 */
/* test the SPECIAL / NOSTATS rule bit for the l-byte string x via s->r */
#define	SCWS_IS_SPECIAL(x,l)	scws_rule_checkbit(s->r,x,l,SCWS_RULE_SPECIAL)
#define	SCWS_IS_NOSTATS(x,l)	scws_rule_checkbit(s->r,x,l,SCWS_RULE_NOSTATS)
/* look up lead byte x in the engine's s->mblen table (used as a character byte length) */
#define	SCWS_CHARLEN(x)			s->mblen[(x)]
/* ASCII class tests: 48-57 = '0'-'9', 65-90 = 'A'-'Z', 97-122 = 'a'-'z' */
#define	SCWS_IS_ALNUM(x)		(((x)>=48&&(x)<=57)||((x)>=65&&(x)<=90)||((x)>=97&&(x)<=122))
#define	SCWS_IS_ALPHA(x)		(((x)>=65&&(x)<=90)||((x)>=97&&(x)<=122))
#define	SCWS_IS_UALPHA(x)		((x)>=65&&(x)<=90)
#define	SCWS_IS_DIGIT(x)		((x)>=48&&(x)<=57)
/* flag tests on a word/char flag value x */
#define	SCWS_IS_WHEAD(x)		((x) & SCWS_ZFLAG_WHEAD)
#define	SCWS_IS_ECHAR(x)		((x) & SCWS_ZFLAG_ENGLISH)
/*
 * True when flag value x excludes a char from auto-rule checking: it is a
 * symbol or single-byte run, or it is a word head without the NR2 flag.
 */
#define	SCWS_NO_RULE1(x)		(((x) & (SCWS_ZFLAG_SYMBOL|SCWS_ZFLAG_ENGLISH))||(((x) & (SCWS_ZFLAG_WHEAD|SCWS_ZFLAG_NR2)) == SCWS_ZFLAG_WHEAD))
/* former, disabled definition of SCWS_NO_RULE2 kept for reference: */
///#define	SCWS_NO_RULE2(x)		(((x) & SCWS_ZFLAG_ENGLISH)||(((x) & (SCWS_ZFLAG_WHEAD|SCWS_ZFLAG_N2)) == SCWS_ZFLAG_WHEAD))
/* SCWS_NO_RULE2 is currently just an alias of SCWS_NO_RULE1 */
#define	SCWS_NO_RULE2			SCWS_NO_RULE1
/* length cap for a single-byte word; also the size of the chunk table in _scws_alnum_multi */
#define	SCWS_MAX_EWLEN			33
/* hightman.070706: single-byte characters that are always emitted as a token of their own */
#define	SCWS_CHAR_TOKEN(x)		((x)=='('||(x)==')'||(x)=='['||(x)==']'||(x)=='{'||(x)=='}'||(x)==':'||(x)=='"')
/* hightman.070814: upper bound on the number of characters collected into one segment */
#define	SCWS_MAX_ZLEN			128
/* idf assigned to a single-byte word of length x: 2.5 * ln(x) */
#define	SCWS_EN_IDF(x)			(float)(2.5*logf(x))

/* two-letter attribute strings used in this file */
static const char *attr_en = "en";
static const char *attr_un = "un";
static const char *attr_nr = "nr";
/* dictionary entries carrying this attribute are not added to the word map */
static const char *attr_na = "!";
51 
52 /* create scws engine */
scws_new()53 scws_t scws_new()
54 {
55 	scws_t s;
56 	s = (scws_t) malloc(sizeof(scws_st));
57     if (s == NULL)
58         return s;
59 	memset(s, 0, sizeof(scws_st));
60 	s->mblen = charset_table_get(NULL);
61 	s->off = s->len = 0;
62 	s->wend = -1;
63 
64 	return s;
65 }
66 
67 /* hightman.110320: fork scws */
scws_fork(scws_t p)68 scws_t scws_fork(scws_t p)
69 {
70 	scws_t s = scws_new();
71 
72 	if (p != NULL && s != NULL)
73 	{
74 		s->mblen = p->mblen;
75 		s->mode = p->mode;
76 		// fork dict/rules
77 		s->r = scws_rule_fork(p->r);
78 		s->d = xdict_fork(p->d);
79 	}
80 
81 	return s;
82 }
83 
84 /* close & free the engine */
scws_free(scws_t s)85 void scws_free(scws_t s)
86 {
87 	if (s->d)
88 	{
89 		xdict_close(s->d);
90 		s->d = NULL;
91 	}
92 	if (s->r)
93 	{
94 		scws_rule_free(s->r);
95 		s->r = NULL;
96 	}
97 	free(s);
98 }
99 
100 /* add a dict into scws */
scws_add_dict(scws_t s,const char * fpath,int mode)101 int scws_add_dict(scws_t s, const char *fpath, int mode)
102 {
103 	xdict_t xx;
104 	if (mode & SCWS_XDICT_SET)
105 	{
106 		xdict_close(s->d);
107 		mode ^= SCWS_XDICT_SET;
108 		s->d = NULL;
109 	}
110 	xx = s->d;
111 	s->d = xdict_add(s->d, fpath, mode, s->mblen);
112 	return (xx == s->d ? -1 : 0);
113 }
114 
115 /* set the dict & open it */
scws_set_dict(scws_t s,const char * fpath,int mode)116 int scws_set_dict(scws_t s, const char *fpath, int mode)
117 {
118 	return scws_add_dict(s, fpath, mode | SCWS_XDICT_SET);
119 }
120 
/*
 * Select the character-length table for charset name `cs`.
 * Only s->mblen is replaced here; a dictionary or rule set that was loaded
 * earlier received the previous table as an argument and is not touched.
 */
void scws_set_charset(scws_t s, const char *cs)
{
	s->mblen = charset_table_get(cs);
}
125 
/*
 * Load the rule set from `fpath`, freeing any rule set that was attached
 * before.  s->r receives whatever scws_rule_new() returns.
 */
void scws_set_rule(scws_t s, const char *fpath)
{
	if (s->r)
	{
		scws_rule_free(s->r);
	}

	s->r = scws_rule_new(fpath, s->mblen);
}
133 
134 /* set ignore symbol or multi segments */
scws_set_ignore(scws_t s,int yes)135 void scws_set_ignore(scws_t s, int yes)
136 {
137 	if (yes == SCWS_YEA)
138 		s->mode |= SCWS_IGN_SYMBOL;
139 
140 	if (yes == SCWS_NA)
141 		s->mode &= ~SCWS_IGN_SYMBOL;
142 }
143 
/*
 * Select the multi-segment mode.
 *
 * The current SCWS_MULTI_MASK bits are cleared and replaced with the
 * SCWS_MULTI_MASK bits of `mode`.  Bits of `mode` outside the mask are
 * ignored so that a stray value cannot flip unrelated engine flags.
 */
void scws_set_multi(scws_t s, int mode)
{
	s->mode &= ~SCWS_MULTI_MASK;
	s->mode |= (mode & SCWS_MULTI_MASK);
}
151 
/*
 * Turn the SCWS_DEBUG mode bit on (yes == SCWS_YEA) or off
 * (yes == SCWS_NA).  Any other value leaves the mode untouched.
 */
void scws_set_debug(scws_t s, int yes)
{
	if (yes == SCWS_YEA)
	{
		s->mode |= SCWS_DEBUG;
	}
	else if (yes == SCWS_NA)
	{
		s->mode &= ~SCWS_DEBUG;
	}
}
160 
/*
 * Turn the SCWS_DUALITY mode bit on (yes == SCWS_YEA) or off
 * (yes == SCWS_NA).  Any other value leaves the mode untouched.
 */
void scws_set_duality(scws_t s, int yes)
{
	if (yes == SCWS_YEA)
	{
		s->mode |= SCWS_DUALITY;
	}
	else if (yes == SCWS_NA)
	{
		s->mode &= ~SCWS_DUALITY;
	}
}
169 
170 /* send the text buffer & init some others */
scws_send_text(scws_t s,const char * text,int len)171 void scws_send_text(scws_t s, const char *text, int len)
172 {
173 	s->txt = (unsigned char *) text;
174 	s->len = len;
175 	s->off = 0;
176 }
177 
/*
 * SCWS_PUT_RES(o, i, l, a): append one result node to the list that is
 * being built in the engine `s` (s->res0 is the head, s->res1 the tail).
 *
 *   o - byte offset of the word in the text
 *   i - idf value
 *   l - byte length of the word
 *   a - attribute string; only its first two bytes are copied
 *
 * Requires a variable named `s` in scope.  If the node cannot be
 * allocated the result is dropped and the list is left unchanged.
 */
#define	SCWS_PUT_RES(o,i,l,a)									\
do {															\
	scws_res_t res;												\
	res = (scws_res_t) malloc(sizeof(struct scws_result));		\
	if (res == NULL)											\
		break;													\
	res->off = (o);												\
	res->idf = (i);												\
	res->len = (l);												\
	strncpy(res->attr, (a), 2);									\
	res->attr[2] = '\0';										\
	res->next = NULL;											\
	if (s->res1 == NULL)										\
		s->res1 = s->res0 = res;								\
	else														\
	{															\
		s->res1->next = res;									\
		s->res1 = res;											\
	}															\
} while(0)
197 
/*
 * single bytes segment (pure single-byte characters)
 *
 * Bit flags kept in the local `pflag` variables of the segment scanners.
 */
#define	PFLAG_WITH_MB		0x01	/* segment contains a multi-byte character */
#define	PFLAG_ALNUM			0x02	/* segment so far is letters/digits only */
#define	PFLAG_VALID			0x04	/* short single-byte run inside multi-byte text was accepted */
#define	PFLAG_DIGIT			0x08	/* current run is digits */
#define	PFLAG_ADDSYM		0x10	/* a joining symbol or class switch was already consumed */
#define	PFLAG_ALPHA			0x20	/* current run is letters */
#define	PFLAG_LONGDIGIT		0x40	/* two adjacent digits were seen */
#define	PFLAG_LONGALPHA		0x80	/* two adjacent letters were seen */
207 
/*
 * Copy the NUL-terminated string `src` into `dst`, converting ASCII
 * 'a'-'z' to upper case.  `dst` may be the same buffer as `src`.
 * The terminating NUL is NOT written to `dst`.
 */
static void _str_toupper(char *src, char *dst)
{
	for (; *src != '\0'; src++, dst++)
	{
		char c = *src;

		if (c >= 'a' && c <= 'z')
			c ^= 0x20;
		*dst = c;
	}
}
217 
/*
 * Copy the NUL-terminated string `src` into `dst`, converting ASCII
 * 'A'-'Z' to lower case.  `dst` may be the same buffer as `src`.
 * The terminating NUL is NOT written to `dst`.
 */
static void _str_tolower(char *src, char *dst)
{
	for (; *src != '\0'; src++, dst++)
	{
		char c = *src;

		if (c >= 'A' && c <= 'Z')
			c ^= 0x20;
		*dst = c;
	}
}
227 
/*
 * _mem_ndup(src, len): heap copy of the first `len` bytes of `src`,
 * followed by a terminating NUL.  The caller owns the result and must
 * free() it.  Maps to strndup() when the build provides it.
 *
 * The fallback returns NULL when `len` is negative or the allocation
 * fails, so callers must check the result before using it.
 */
#ifdef HAVE_STRNDUP
#define	_mem_ndup		strndup
#else
static inline void *_mem_ndup(const char *src, int len)
{
	char *dst;

	if (len < 0)
		return NULL;

	dst = malloc((size_t) len + 1);
	if (dst == NULL)
		return NULL;

	memcpy(dst, src, (size_t) len);
	dst[len] = '\0';
	return dst;
}
#endif
240 
/*
 * Split the single-byte word txt[start .. start+wlen) into runs of the
 * same character class (digits / letters / anything else) and emit extra
 * "en" results for those runs.
 *
 * Nothing is emitted when the whole word is a single run.  Runs longer
 * than one byte are emitted as they are; a one-byte alnum run is instead
 * joined with the neighbouring run(s); non-alnum runs are skipped.
 *
 * chunk[] holds one length per run, so `wlen` must not produce more than
 * SCWS_MAX_EWLEN runs (callers visible in this file cap the word length
 * accordingly).
 */
static void _scws_alnum_multi(scws_t s, int start, int wlen)
{
	char chunk[SCWS_MAX_EWLEN];
	int i, j, k, ch, pflag;
	unsigned char *txt;
	float idf;

	txt = s->txt;
	pflag = 0;
	/*
	 * Pass 1: record run lengths.  j = number of completed runs,
	 * k = offset (relative to start) where the current run began,
	 * pflag = class of the current run (0 before the first byte).
	 */
	for (i = j = k = 0; i < wlen; i++)
	{
		ch = txt[start + i];
		if (SCWS_IS_DIGIT(ch))
		{
			if (pflag & PFLAG_DIGIT)
				continue;
			if (pflag != 0)
			{
				chunk[j++] = (char) (i-k);
				k = i;
			}
			pflag = PFLAG_DIGIT;
		}
		else if (SCWS_IS_ALPHA(ch))
		{
			if (pflag & PFLAG_ALPHA)
				continue;
			if (pflag != 0)
			{
				chunk[j++] = (char) (i-k);
				k = i;
			}
			pflag = PFLAG_ALPHA;
		}
		else
		{
			/* any other byte: PFLAG_ADDSYM doubles as the "symbol run" class */
			if (pflag & PFLAG_ADDSYM)
				continue;
			if (pflag != 0)
			{
				chunk[j++] = (char) (i-k);
				k = i;
			}
			pflag = PFLAG_ADDSYM;
		}
	}

	/* Pass 2: only when the word really consists of more than one run */
	if (j > 0)
	{
		/* close the last run; chunk[0..j] are now valid */
		chunk[j] = (char) (i-k);
		/* from here on `ch` is the absolute offset of run i in txt */
		ch = start;
		for (i = 0; i <= j; i++)
		{
			if (!SCWS_IS_ALNUM(txt[ch]))
			{
				// just skip
			}
			else if (chunk[i] == 1)
			{
				/*
				 * One-byte run after a longer run.  The last condition is
				 * false only when i == 1 and j == 1 (exactly two runs).
				 */
				if (i > 0 && chunk[i-1] > 1 && (i != 1 || i != j))
				{
					if (!SCWS_IS_ALNUM(txt[ch-1]))
					{
						/* previous run is symbols: emit the single byte alone */
						idf = SCWS_EN_IDF(chunk[i]);
						SCWS_PUT_RES(ch, idf, chunk[i], attr_en);
					}
					else
					{
						/* emit previous run + this byte */
						idf = SCWS_EN_IDF(chunk[i-1]+1);
						SCWS_PUT_RES(ch - chunk[i-1], idf, chunk[i-1]+1, attr_en);
					}
				}
				/* One-byte run with a following run (skipped when there are exactly two runs) */
				if (i < j && (i != 0 || j != 1))
				{
					if (!SCWS_IS_ALNUM(txt[ch+1]))
					{
						/* next run is symbols: emit the single byte alone */
						idf = SCWS_EN_IDF(chunk[i]);
						SCWS_PUT_RES(ch, idf, chunk[i], attr_en);
					}
					else
					{
						/* emit this byte + next run */
						idf = SCWS_EN_IDF(chunk[i+1]+1);
						SCWS_PUT_RES(ch, idf, chunk[i+1]+1, attr_en);
					}
				}
			}
			else
			{
				/* alnum run longer than one byte: emit as-is */
				idf = SCWS_EN_IDF(chunk[i]);
				SCWS_PUT_RES(ch, idf, chunk[i], attr_en);
			}
			ch += chunk[i];
		}
	}
}
336 
/* Byte at s->txt[idx], or 0 when idx lies at or beyond the end of the text buffer. */
static inline int _scws_peek(scws_t s, int idx)
{
	return (idx < s->len) ? s->txt[idx] : 0;
}

/*
 * Segment the pure single-byte text s->txt[s->off .. end) and append the
 * results to the engine's result list.
 *
 *  1. If the whole span (upper-cased) is flagged SPECIAL by the rule set,
 *     it is emitted as one "nz" word (idf 9.5).
 *  2. If the whole span is an abbreviation of the form "A.B.C" / "A.B.",
 *     it is emitted as one "nz" word (idf 7.5).
 *  3. Otherwise the span is cut into alnum words ("en") and single
 *     symbols ("un"; suppressed when SCWS_IGN_SYMBOL is set).
 *
 * Look-ahead reads never go past s->len: beyond the end of the text the
 * look-ahead byte is treated as 0, which is what a NUL-terminated buffer
 * would have supplied.
 */
static void _scws_ssegment(scws_t s, int end)
{
	int start, wlen, ch, nx, pflag, ipflag = 0;
	unsigned char *txt;
	char *upper;
	float idf;

	start = s->off;
	wlen = end - start;

	/* check special words (need strtoupper) */
	if (wlen > 1)
	{
		upper = (char *) _mem_ndup((const char *) s->txt + start, wlen);
		/* on allocation failure the special-word lookup is simply skipped */
		if (upper != NULL)
		{
			_str_toupper(upper, upper);
			if (SCWS_IS_SPECIAL(upper, wlen))
			{
				SCWS_PUT_RES(start, 9.5, wlen, "nz");
				free(upper);
				return;
			}
			free(upper);
		}
	}

	txt = s->txt;
	/* check brief words such as S.H.E M.R. (needs at least two bytes) */
	if (wlen > 1 && SCWS_IS_ALPHA(txt[start]) && txt[start+1] == '.')
	{
		for (ch = start + 2; ch < end; ch++)
		{
			if (!SCWS_IS_ALPHA(txt[ch])) break;
			ch++;
			if (ch == end || txt[ch] != '.') break;
		}
		if (ch == end)
		{
			SCWS_PUT_RES(start, 7.5, wlen, "nz");
			return;
		}
	}

	/*
	 * Extract words and punctuation.  A number may contain one dot when the
	 * dot is followed by a digit; a word of letters may contain one
	 * apostrophe followed by a letter.
	 */
	while (start < end)
	{
		ch = txt[start++];
		/* ipflag stays set only while we are inside a digits-and-dots sequence */
		if (ipflag && ch != 0x2e && !SCWS_IS_DIGIT(ch))
			ipflag = 0;
		if (SCWS_IS_ALNUM(ch))
		{
			pflag = SCWS_IS_DIGIT(ch) ? PFLAG_DIGIT : 0;
			wlen = 1;
			while (start < end)
			{
				ch = txt[start];
				/* byte after the current one, 0 past the end of the text */
				nx = _scws_peek(s, start + 1);
				if (pflag & PFLAG_DIGIT)
				{
					if (!SCWS_IS_DIGIT(ch))
					{
						// check percent % = 0x25
						if (ch == 0x25 && !SCWS_IS_DIGIT(nx))
						{
							start++;
							wlen++;
							break;
						}
						if (ipflag)
							break;
						// special for IP address or version number? (find out all digit + dot)
						if (ch == 0x2e && (pflag & PFLAG_ADDSYM))
						{
							/* second dot: rewind to the first dot and cut the word there */
							ipflag = 1;
							while(--wlen && txt[--start] != 0x2e);
							pflag = 0;
							break;
						}
						// wlen = 1
						if (wlen == 1 && SCWS_IS_ALPHA(ch))
						{
							/* one digit followed by a letter: re-scan this byte as a letter word */
							pflag ^= PFLAG_DIGIT;
							pflag |= PFLAG_ADDSYM;
							continue;
						}
						/* only one dot is allowed, and it must be followed by a digit */
						if ((pflag & PFLAG_ADDSYM) || !(ch == 0x2e && SCWS_IS_DIGIT(nx)))
							break;
						pflag |= PFLAG_ADDSYM;
					}
				}
				else
				{
					/* hightman.110419: '-' (0x2d) and '_' (0x5f) join letters when followed by a letter */
					if ((ch == 0x2d || ch == 0x5f) && SCWS_IS_ALPHA(nx))
						pflag |= PFLAG_ADDSYM;
					else if (!SCWS_IS_ALPHA(ch))
					{
						/*
						 * Allowed once per word: an apostrophe followed by a
						 * letter, or a single digit not followed by a digit.
						 */
						if ((pflag & PFLAG_ADDSYM)
							|| !((ch == 0x27 && SCWS_IS_ALPHA(nx))
								|| (SCWS_IS_DIGIT(ch) && !SCWS_IS_DIGIT(nx))))
						{
							break;
						}
						pflag |= PFLAG_ADDSYM;
					}
				}
				start++;
				wlen++;
				if (wlen >= SCWS_MAX_EWLEN)
					break;
			}
			idf = SCWS_EN_IDF(wlen);
			SCWS_PUT_RES(start-wlen, idf, wlen, attr_en);
			/* words that contain a joiner are additionally split into their parts */
			if ((s->mode & SCWS_MULTI_DUALITY) && (pflag & PFLAG_ADDSYM))
				_scws_alnum_multi(s, start-wlen, wlen);
		}
		else if (!(s->mode & SCWS_IGN_SYMBOL))
		{
			SCWS_PUT_RES(start-1, 0.0, 1, attr_un);
		}
	}
}
456 
457 /* multibyte segment */
_scws_mget_word(scws_t s,int i,int j)458 static int _scws_mget_word(scws_t s, int i, int j)
459 {
460 	int r, k;
461 	word_t item;
462 
463 	if (!(s->wmap[i][i]->flag & SCWS_ZFLAG_WHEAD))
464 		return i;
465 
466 	for (r=i, k=i+1; k <= j; k++)
467 	{
468 		item = s->wmap[i][k];
469 		if (item && (item->flag & SCWS_WORD_FULL))
470 		{
471 			r = k;
472 			if (!(item->flag & SCWS_WORD_PART))
473 				break;
474 		}
475 	}
476 	return r;
477 }
478 
/*
 * Emit the word that spans character indexes i..j (inclusive) of the
 * current zone into the result list, plus any additional results that the
 * engine mode asks for:
 *
 *   SCWS_DUALITY        - pair up consecutive loose single characters
 *   SCWS_MULTI_SHORT    - also emit shorter dictionary words inside the word
 *   SCWS_MULTI_DUALITY  - also emit two-character pairs inside the word
 *   SCWS_MULTI_ZMAIN /
 *   SCWS_MULTI_ZALL     - also emit single characters of the word
 *
 * Nothing is emitted when wmap[i][j] is NULL, or when SCWS_IGN_SYMBOL is
 * set and the item is a non-single-byte "un" item.
 */
static void _scws_mset_word(scws_t s, int i, int j)
{
	word_t item;

	item = s->wmap[i][j];
	/* hightman.070705: item == NULL check added to guard against unsigned char overflow on over-long words (more than 255 characters) */
	if ((item == NULL) || ((s->mode & SCWS_IGN_SYMBOL)
      && !SCWS_IS_ECHAR(item->flag) && !memcmp(item->attr, attr_un, 2)))
		return;

	/* hightman.070701: automatic pairing (bigram) of loose single characters */
	if (s->mode & SCWS_DUALITY)
	{
		/* s->zis: index of the pending single character (< 0 when none), optionally OR-ed with SCWS_ZIS_USED */
		int k = s->zis;

		if (i == j && !SCWS_IS_ECHAR(item->flag) && memcmp(item->attr, attr_un, 2))
		{
			/* a loose single character: remember it, pair it with the pending one if adjacent */
			s->zis = i;
			if (k < 0)
				return;

			i = (k & ~SCWS_ZIS_USED);
			if ((i != (j-1)) || (!(k & SCWS_ZIS_USED) && s->wend == i))
			{
				/* flush the pending character on its own */
				SCWS_PUT_RES(s->zmap[i].start, s->wmap[i][i]->idf, (s->zmap[i].end - s->zmap[i].start), s->wmap[i][i]->attr);
				if (i != (j-1))
					return;
			}
			/* i now equals j-1, so the SCWS_PUT_RES below emits the pair i..j */
			s->zis |= SCWS_ZIS_USED;
		}
		else
		{
			/* not a loose character: flush the pending one if it still needs emitting */
			if ((k >= 0) && (!(k & SCWS_ZIS_USED) || (j > i)))
			{
				k &= ~SCWS_ZIS_USED;
				SCWS_PUT_RES(s->zmap[k].start, s->wmap[k][k]->idf, (s->zmap[k].end - s->zmap[k].start), s->wmap[k][k]->attr);
			}
			if (j > i)
				s->wend = j + 1;
			s->zis = -1;
		}
	}

	/* the word itself (or the pair built above); idf/attr come from `item` */
	SCWS_PUT_RES(s->zmap[i].start, item->idf, (s->zmap[j].end - s->zmap[i].start), item->attr);

	// hightman.070902: multi segment
	// step1: split to short words
	if ((j-i) > 1)
	{
		int n, k, m = i;
		if (s->mode & SCWS_MULTI_SHORT)
		{
			while (m < j)
			{
				k = m;
				// hightman.111223: multi short enhanced
				for (n = m + 1; n <= j; n++)
				{
					// 3 chars at most; the full word i..j itself is excluded
					if ((n == j && m == i) || (n - m) > 2) break;
					item = s->wmap[m][n];
					if (!item) continue;
					// first shortest or last longest word
					if ((item->flag & SCWS_WORD_FULL) && (k == m || n == j))
						k = n;
					if (!(item->flag & SCWS_WORD_PART)) break;
				}
				// short word not found, stop to find, passed to next loop
				if (k == m)
					break;

				// save the short word
				item = s->wmap[m][k];
				SCWS_PUT_RES(s->zmap[m].start, item->idf, (s->zmap[k].end - s->zmap[m].start), item->attr);
				// find the next word or go to prev for duality last word
				if ((m = k + 1) == j)
				{
					m--;
					break;
				}
			}
		}

		/* pairs m..m+1 for the part of the word not covered above; single-byte runs are emitted alone */
		if (s->mode & SCWS_MULTI_DUALITY)
		{
			while (m < j)
			{
				if (SCWS_IS_ECHAR(s->wmap[m][m]->flag))
				{
					SCWS_PUT_RES(s->zmap[m].start, s->wmap[m][m]->idf, (s->zmap[m].end - s->zmap[m].start), s->wmap[m][m]->attr);
					s->wmap[m][m]->flag |= SCWS_ZFLAG_PUT;
				}
				else if (SCWS_IS_ECHAR(s->wmap[m+1][m+1]->flag))
				{
					if (m == i)
					{
						SCWS_PUT_RES(s->zmap[m].start, s->wmap[m][m]->idf, (s->zmap[m].end - s->zmap[m].start), s->wmap[m][m]->attr);
						s->wmap[m][m]->flag |= SCWS_ZFLAG_PUT;
					}
					m++;
					SCWS_PUT_RES(s->zmap[m].start, s->wmap[m][m]->idf, (s->zmap[m].end - s->zmap[m].start), s->wmap[m][m]->attr);
					s->wmap[m][m]->flag |= SCWS_ZFLAG_PUT;
				}
				else
				{
					/* two adjacent non-single-byte characters: emit them as a pair */
					SCWS_PUT_RES(s->zmap[m].start, s->wmap[m][m]->idf, (s->zmap[m+1].end - s->zmap[m].start), s->wmap[m][m]->attr);
				}
				m++;
				if (m == j && (SCWS_IS_ECHAR(s->wmap[m][m]->flag) || SCWS_IS_ECHAR(s->wmap[m-1][m-1]->flag)))
				{
					SCWS_PUT_RES(s->zmap[m].start, s->wmap[m][m]->idf, (s->zmap[m].end - s->zmap[m].start), s->wmap[m][m]->attr);
					s->wmap[m][m]->flag |= SCWS_ZFLAG_PUT;
				}
			}
		}
	}

	// step2, split to single char
	if ((j > i) && (s->mode & (SCWS_MULTI_ZMAIN|SCWS_MULTI_ZALL)))
	{
		/* a duality pair with no dictionary entry: mark its characters as already emitted */
		if ((j - i) == 1 && !s->wmap[i][j])
		{
			if (s->wmap[i][i]->flag & SCWS_ZFLAG_PUT) i++;
			else s->wmap[i][i]->flag |= SCWS_ZFLAG_PUT;
			s->wmap[j][j]->flag |= SCWS_ZFLAG_PUT;
		}
		do
		{
			/* skip characters that were already emitted */
			if (s->wmap[i][i]->flag & SCWS_ZFLAG_PUT)
				continue;
			/* without ZALL only characters whose attr starts with 'j', 'n' or 'v' are emitted */
			if (!(s->mode & SCWS_MULTI_ZALL) && !strchr("jnv", s->wmap[i][i]->attr[0]))
				continue;
			SCWS_PUT_RES(s->zmap[i].start, s->wmap[i][i]->idf, (s->zmap[i].end - s->zmap[i].start), s->wmap[i][i]->attr);
		}
		while (++i <= j);
	}
}
616 
/*
 * Segment one zone of characters, indexes f..t (inclusive), and emit the
 * chosen words through _scws_mset_word().
 *
 * For every candidate word found inside the zone a "path" is built: the
 * candidate itself plus the words returned by _scws_mget_word() before
 * and after it.  A path is stored as a list of (end - start) values, one
 * per word, terminated by 0xff.  Each path gets a weight computed from the
 * words' tf values, their lengths and scws_rule_attr_ratio(); the path
 * with the highest weight (it must be > 0) is emitted.
 *
 * If no path ends up with a positive weight, or a path buffer cannot be
 * allocated before any path was selected, nothing is emitted for the zone.
 * Both path buffers are always released before returning.
 */
static void _scws_mseg_zone(scws_t s, int f, int t)
{
	unsigned char *mpath, *npath;
	word_t **wmap;
	int x,i,j,m,n,j2,sz;
	double weight, nweight;
	char attr1[3];

	/* mpath: best path so far; npath: scratch buffer for the path being built */
	mpath = npath = NULL;
	weight = nweight = (double) 0.0;

	wmap = s->wmap;
	j2 = 0;
	for (x = i = f; i <= t; i++)
	{
		/* when re-visiting i (see the i-- below) only look for a shorter word than last time */
		j = _scws_mget_word(s, i, (x > i ? x - 1 : t));
		if (j == i) continue;
		// skip NR in NR
		if (j < j2 && wmap[i][j]->attr[0] == 'n' && wmap[i][j]->attr[1] == 'r') continue;
		if (i > j2 && (wmap[i][j]->flag & SCWS_WORD_USED)) continue;

		/* one word only */
		if (i == f && j == t)
		{
			mpath = (unsigned char *) malloc(2);
			if (mpath == NULL)
				break;
			mpath[0] = j - i;
			mpath[1] = 0xff;
			break;
		}

		if (i != f && (wmap[i][j]->flag & SCWS_WORD_RULE))
			continue;

		/* create the new path */
		wmap[i][j]->flag |= SCWS_WORD_USED;
		nweight = (double) wmap[i][j]->tf * pow(j-i,4);

		if (npath == NULL)
		{
			npath = (unsigned char *) malloc(t-f+2);
			/* out of memory: stop searching and fall back to the best path found so far */
			if (npath == NULL)
				break;
			memset(npath, 0xff, t-f+2);
		}

		/* lookfor backward; x = number of path entries, sz = number of single characters */
		x = sz = 0;
		memset(attr1, 0, sizeof(attr1));
		for (m = f; m < i; m = n+1)
		{
			n = _scws_mget_word(s, m, i-1);
			nweight *= wmap[m][n]->tf;
			npath[x++] = n - m;
			if (n > m)
			{
				nweight *= pow(n-m,4);
				wmap[m][n]->flag |= SCWS_WORD_USED;
			}
			else sz++;

			/* attr1 holds the previous word's attr; empty for the first word */
			if (attr1[0] != '\0')
				nweight *= scws_rule_attr_ratio(s->r, attr1, wmap[m][n]->attr, &npath[x-2]);
			memcpy(attr1, wmap[m][n]->attr, 2);
		}

		/* my self */
		npath[x++] = j - i;

		if (attr1[0] != '\0')
			nweight *= scws_rule_attr_ratio(s->r, attr1, wmap[i][j]->attr, &npath[x-2]);
		memcpy(attr1, wmap[i][j]->attr, 2);

		/* lookfor forward */
		for (m = j+1; m <= t; m = n+1)
		{
			n = _scws_mget_word(s, m, t);
			nweight *= wmap[m][n]->tf;
			npath[x++] = n - m;
			if (n > m)
			{
				nweight *= pow(n-m,4);
				wmap[m][n]->flag |= SCWS_WORD_USED;
			}
			else sz++;

			nweight *= scws_rule_attr_ratio(s->r, attr1, wmap[m][n]->attr, &npath[x-2]);
			memcpy(attr1, wmap[m][n]->attr, 2);
		}

		npath[x] = 0xff;
		/* penalise paths with many pieces; single characters count twice */
		nweight /= pow(x+sz-1,5);

		/* draw the path for debug */
#ifdef DEBUG
		if (s->mode & SCWS_DEBUG)
		{
			fprintf(stderr, "PATH by keyword = %.*s, (weight=%.4f):\n",
				s->zmap[j].end - s->zmap[i].start, s->txt + s->zmap[i].start, nweight);
			for (x = 0, m = f; (n = npath[x]) != 0xff; x++)
			{
				n += m;
				fprintf(stderr, "%.*s ", s->zmap[n].end - s->zmap[m].start, s->txt + s->zmap[m].start);
				m = n + 1;
			}
			fprintf(stderr, "\n--\n");
		}
#endif

		j2 = x = j;
		/* word longer than two characters: visit i again to try a shorter word */
		if ((x - i) > 1) i--;
		/* check better path */
		if (nweight > weight)
		{
			unsigned char *swap;

			weight = nweight;
			swap = mpath;
			mpath = npath;
			npath = swap;
		}
	}

	/* set the result along the best path, if one was selected */
	if (mpath != NULL)
	{
		for (x = 0, m = f; (n = mpath[x]) != 0xff; x++)
		{
			n += m;
			_scws_mset_word(s, m, n);
			m = n + 1;
		}
	}

	/* 070808: memory leak fixed; both buffers are released on every path (free(NULL) is a no-op) */
	free(mpath);
	free(npath);
}
752 
/*
 * Loop-body fragments used by the auto-rule code in _scws_msegment().
 * Each one expands to an `if (...) break;` and relies on the local
 * variables j, zlen, wmap, zmap, txt, r1 and s of that function.
 */

/* forward scan: stop at the end of the zone or at a char excluded from rules */
#define	___ZRULE_CHECKER1___														\
if (j >= zlen || SCWS_NO_RULE2(wmap[j][j]->flag))									\
	break;

/* backward scan: stop before the start of the zone or at a char excluded from rules */
#define	___ZRULE_CHECKER2___														\
if (j < 0 || SCWS_NO_RULE2(wmap[j][j]->flag))										\
	break;

/* stop when character j does not pass scws_rule_check() for rule r1 */
#define	___ZRULE_CHECKER3___														\
if (!scws_rule_check(s->r, r1, txt + zmap[j].start, zmap[j].end - zmap[j].start))	\
	break;
765 
/*
 * Segment the text s->txt[s->off .. end), which contains multi-byte
 * characters, and append the results to the engine's result list.
 *
 * `zlen` is the caller's count of characters in the span; it sizes the
 * zlen x zlen word map and the character map and is replaced by the real
 * count once the span has been scanned.
 *
 * Steps:
 *   1. build zmap (byte range of every character; a run of single-byte
 *      characters counts as one character) and the diagonal wmap[i][i];
 *   2. query the dictionary for every span i..j and fill wmap[i][j];
 *   3. (only when built with HAVE_NAME_RULE and a rule set is loaded)
 *      add rule-derived words;
 *   4. cut the characters into zones at the points where no word crosses
 *      and hand each zone to _scws_mseg_zone().
 *
 * s->wmap and s->zmap are only valid during this call; they are freed
 * before returning.
 */
static void _scws_msegment(scws_t s, int end, int zlen)
{
	word_t **wmap, query;
	struct scws_zchar *zmap;
	unsigned char *txt;
#ifdef HAVE_NAME_RULE	/* 20150403: Remove rules, just deepend on dictionary */
	rule_item_t r1;
#endif
	int i, j, k, ch, clen, start;
	pool_t p;

	/* pool used to management some dynamic memory */
	p = pool_new();

	/* create wmap & zmap */
	wmap = s->wmap = (word_t **) darray_new(zlen, zlen, sizeof(word_t));
	zmap = s->zmap = (struct scws_zchar *) pmalloc(p, zlen * sizeof(struct scws_zchar));
	txt = s->txt;
	start = s->off;
	s->zis = -1;

	/* step 1: one zmap / wmap[i][i] entry per character */
	for (i = 0; start < end; i++)
	{
		ch = txt[start];
		clen = SCWS_CHARLEN(ch);
		if (clen == 1)
		{
			/* swallow the whole run of single-byte characters; clen becomes the run length */
			while (start++ < end)
			{
				ch = txt[start];
				if (start == end || SCWS_CHARLEN(txt[start]) > 1)
					break;
				clen++;
			}
			/* NOTE(review): relies on pmalloc_z returning zeroed memory (idf is never set, flag is OR-ed) - confirm */
			wmap[i][i] = (word_t) pmalloc_z(p, sizeof(word_st));
			wmap[i][i]->tf = 0.5;
			wmap[i][i]->flag |= SCWS_ZFLAG_ENGLISH;
			/* attr is "en" when the last byte of the run is a letter, "un" otherwise */
			strcpy(wmap[i][i]->attr, SCWS_IS_ALPHA(txt[start-1]) ? attr_en : attr_un);
		}
		else
		{
			query = xdict_query(s->d, txt + start, clen);
			wmap[i][i] = (word_t) pmalloc(p, sizeof(word_st));
			if (query == NULL)
			{
				/* character unknown to the dictionary: default values */
				wmap[i][i]->tf = 0.5;
				wmap[i][i]->idf = 0.0;
				wmap[i][i]->flag = 0;
				strcpy(wmap[i][i]->attr, attr_un);
			}
			else
			{
				/* keep the original flag in ch; the copied entry starts with flag = SCWS_WORD_FULL only */
				ch = query->flag;
				query->flag = SCWS_WORD_FULL;
				memcpy(wmap[i][i], query, sizeof(word_st));
				if (query->attr[0] == '#')
					wmap[i][i]->flag |= SCWS_ZFLAG_SYMBOL;

				/* the query result is owned by us only when it is flagged MALLOCED */
				if (ch & SCWS_WORD_MALLOCED)
					free(query);
			}
			start += clen;
		}

		zmap[i].start = start - clen;
		zmap[i].end = start;
	}

	/* fixed real zlength */
	zlen = i;

	/* create word query table */
	for (i = 0; i < zlen; i++)
	{
		/* k stays 0 until a full word starting at i is found */
		k = 0;
		for (j = i+1; j < zlen; j++)
		{
			query = xdict_query(s->d, txt + zmap[i].start, zmap[j].end - zmap[i].start);
			if (query == NULL)
				break;
			ch = query->flag;
			/* store full words, except those whose attr is "!" */
			if ((ch & SCWS_WORD_FULL) && memcmp(query->attr, attr_na, 2))
			{
				wmap[i][j] = (word_t) pmalloc(p, sizeof(word_st));
				memcpy(wmap[i][j], query, sizeof(word_st));

				wmap[i][i]->flag |= SCWS_ZFLAG_WHEAD;

				/* this loop leaves k == j + 1 */
				for (k = i+1; k <= j; k++)
					wmap[k][k]->flag |= SCWS_ZFLAG_WPART;
			}

			if (ch & SCWS_WORD_MALLOCED)
				free(query);

			/* no longer word can start with this prefix */
			if (!(ch & SCWS_WORD_PART))
				break;
		}

		/* k is non-zero when a word was stored; after the decrement it is the end index of the longest one */
		if (k--)
		{
			/* set nr2 to some short name */
			if ((k == (i+1)))
			{
				if (!memcmp(wmap[i][k]->attr, attr_nr, 2))
					wmap[i][i]->flag |= SCWS_ZFLAG_NR2;
				//if (wmap[i][k]->attr[0] == 'n')
					//wmap[i][i]->flag |= SCWS_ZFLAG_N2;
			}

			/* clean the PART flag for the last word */
			if (k < j)
				wmap[i][k]->flag ^= SCWS_WORD_PART;
		}
	}

	if (s->r == NULL)
		goto do_segment;

#ifdef HAVE_NAME_RULE	/* 20150403: Remove rules, just deepend on dictionary */
	/* auto rule set for name & zone & chinese numeric */

	/* one word auto rule check */
	for (i = 0; i < zlen; i++)
	{
		if (SCWS_NO_RULE1(wmap[i][i]->flag))
			continue;

		r1 = scws_rule_get(s->r, txt + zmap[i].start, zmap[i].end - zmap[i].start);
		if (r1 == NULL)
			continue;

		clen = r1->zmin > 0 ? r1->zmin : 1;
		if ((r1->flag & SCWS_ZRULE_PREFIX) && (i < (zlen - clen)))
		{
			/* prefix, check after (zmin~zmax) */
			// first check that all characters within zmin satisfy the rule
			// then collect further matching characters up to zmax
			// (locals in use: i, j, k, ch, clen, start)
			for (ch = 1; ch <= clen; ch++)
			{
				j = i + ch;
				___ZRULE_CHECKER1___
				___ZRULE_CHECKER3___
			}

			/* one of the mandatory characters failed */
			if (ch <= clen)
				continue;

			/* no limit znum or limit to a range */
			j = i + ch;
			while (1)
			{
				if ((!r1->zmax && r1->zmin) || (r1->zmax && (clen >= r1->zmax)))
					break;
				___ZRULE_CHECKER1___
				___ZRULE_CHECKER3___
				clen++;
				j++;
			}

			// note the case where an original 2-character person name is still 2 characters after recognition
			if (wmap[i][i]->flag & SCWS_ZFLAG_NR2)
			{
				if (clen == 1)
					continue;
				wmap[i][i+1]->flag |= SCWS_WORD_PART;
			}

			/* ok, got: i & clen */
			k = i + clen;
			wmap[i][k] = (word_t) pmalloc(p, sizeof(word_st));
			wmap[i][k]->tf = r1->tf;
			wmap[i][k]->idf = r1->idf;
			wmap[i][k]->flag = (SCWS_WORD_RULE|SCWS_WORD_FULL);
			strncpy(wmap[i][k]->attr, r1->attr, 2);

			wmap[i][i]->flag |= SCWS_ZFLAG_WHEAD;
			for (j = i+1; j <= k; j++)
				wmap[j][j]->flag |= SCWS_ZFLAG_WPART;

			/* skip over the new word unless i itself lies inside another word */
			if (!(wmap[i][i]->flag & SCWS_ZFLAG_WPART))
				i = k;

			continue;
		}

		if ((r1->flag & SCWS_ZRULE_SUFFIX) && (i >= clen))
		{
			/* suffix, check before */
			for (ch = 1; ch <= clen; ch++)
			{
				j = i - ch;
				___ZRULE_CHECKER2___
				___ZRULE_CHECKER3___
			}

			if (ch <= clen)
				continue;

			/* no limit znum or limit to a range */
			j = i - ch;
			while (1)
			{
				if ((!r1->zmax && r1->zmin) || (r1->zmax && (clen >= r1->zmax)))
					break;
				___ZRULE_CHECKER2___
				___ZRULE_CHECKER3___
				clen++;
				j--;
			}

			/* ok, got: i & clen (maybe clen=1 & [k][i] isset) */
			k = i - clen;
			if (wmap[k][i] != NULL)
				continue;

			wmap[k][i] = (word_t) pmalloc(p, sizeof(word_st));
			wmap[k][i]->tf = r1->tf;
			wmap[k][i]->idf = r1->idf;
			wmap[k][i]->flag = SCWS_WORD_FULL;
			strncpy(wmap[k][i]->attr, r1->attr, 2);

			wmap[k][k]->flag |= SCWS_ZFLAG_WHEAD;
			for (j = k+1; j <= i; j++)
			{
				wmap[j][j]->flag |= SCWS_ZFLAG_WPART;
				/* shorter words from k now continue into the new, longer word */
				if ((j != i) && (wmap[k][j] != NULL))
					wmap[k][j]->flag |= SCWS_WORD_PART;
			}
			continue;
		}
	}

	/* two words auto rule check (a two-character prefix such as a compound surname, or a two-character suffix such as a road name) */
	for (i = zlen - 2; i >= 0; i--)
	{
		/* with value ==> must be have SCWS_WORD_FULL, so needn't check it ag. */
		if ((wmap[i][i+1] == NULL) || (wmap[i][i+1]->flag & SCWS_WORD_PART))
			continue;

		k = i+1;
		r1 = scws_rule_get(s->r, txt + zmap[i].start, zmap[k].end - zmap[i].start);
		if (r1 == NULL)
			continue;

		clen = r1->zmin > 0 ? r1->zmin : 1;
		if ((r1->flag & SCWS_ZRULE_PREFIX) && (k < (zlen - clen)))
		{
			for (ch = 1; ch <= clen; ch++)
			{
				j = k + ch;
				___ZRULE_CHECKER1___
				___ZRULE_CHECKER3___
			}

			if (ch <= clen)
				continue;

			/* no limit znum or limit to a range */
			j = k + ch;
			while (1)
			{
				if ((!r1->zmax && r1->zmin) || (r1->zmax && (clen >= r1->zmax)))
					break;
				___ZRULE_CHECKER1___
				___ZRULE_CHECKER3___
				clen++;
				j++;
			}

			/* ok, got: i & clen */
			k = k + clen;
			wmap[i][k] = (word_t) pmalloc(p, sizeof(word_st));
			wmap[i][k]->tf = r1->tf;
			wmap[i][k]->idf = r1->idf;
			wmap[i][k]->flag = SCWS_WORD_FULL;
			strncpy(wmap[i][k]->attr, r1->attr, 2);

			wmap[i][i+1]->flag |= SCWS_WORD_PART;
			for (j = i+2; j <= k; j++)
				wmap[j][j]->flag |= SCWS_ZFLAG_WPART;

			i--;
			continue;
		}

		if ((r1->flag & SCWS_ZRULE_SUFFIX) && (i >= clen))
		{
			/* suffix, check before */
			for (ch = 1; ch <= clen; ch++)
			{
				j = i - ch;
				___ZRULE_CHECKER2___
				___ZRULE_CHECKER3___
			}

			if (ch <= clen)
				continue;

			/* no limit znum or limit to a range */
			j = i - ch;
			while (1)
			{
				if ((!r1->zmax && r1->zmin) || (r1->zmax && (clen >= r1->zmax)))
					break;
				___ZRULE_CHECKER2___
				___ZRULE_CHECKER3___
				clen++;
				j--;
			}

			/* ok, got: i & clen (maybe clen=1 & [k][i] isset) */
			k = i - clen;
			/* from here on i is the last character of the two-character suffix */
			i = i + 1;
			wmap[k][i] = (word_t) pmalloc(p, sizeof(word_st));
			wmap[k][i]->tf = r1->tf;
			wmap[k][i]->idf = r1->idf;
			wmap[k][i]->flag = SCWS_WORD_FULL;
			strncpy(wmap[k][i]->attr, r1->attr, 2);

			wmap[k][k]->flag |= SCWS_ZFLAG_WHEAD;
			for (j = k+1; j <= i; j++)
			{
				wmap[j][j]->flag |= SCWS_ZFLAG_WPART;
				if (wmap[k][j] != NULL)
					wmap[k][j]->flag |= SCWS_WORD_PART;
			}

			/* continue the backward scan in front of the new word */
			i -= (clen+1);
			continue;
		}
	}
#endif

	/* real do the segment */
do_segment:

	/* find the easy break point; j is the first character of the pending zone */
	for (i = 0, j = 0; i < zlen; i++)
	{
		/* characters inside a word cannot start a new zone */
		if (wmap[i][i]->flag & SCWS_ZFLAG_WPART)
			continue;

		if (i > j)
			_scws_mseg_zone(s, j, i-1);

		j = i;
		/* a character that starts no word is emitted on its own right away */
		if (!(wmap[i][i]->flag & SCWS_ZFLAG_WHEAD))
		{
			_scws_mset_word(s, i, i);
			j++;
		}
	}

	/* the lastest zone */
	if (i > j)
		_scws_mseg_zone(s, j, i-1);

	/* the last single for duality */
	if ((s->mode & SCWS_DUALITY) && (s->zis >= 0) && !(s->zis & SCWS_ZIS_USED))
	{
		i = s->zis;
		SCWS_PUT_RES(s->zmap[i].start, s->wmap[i][i]->idf, (s->zmap[i].end - s->zmap[i].start), s->wmap[i][i]->attr);
	}

	/* free the wmap & zmap */
	pool_free(p);
	darray_free((void **) wmap);
}
1136 
scws_get_result(scws_t s)1137 scws_res_t scws_get_result(scws_t s)
1138 {
1139 	int off, len, ch, clen, zlen, pflag;
1140 	unsigned char *txt;
1141 
1142 	off = s->off;
1143 	len = s->len;
1144 	txt = s->txt;
1145 	s->res0 = s->res1 = NULL;
1146 	while ((off < len) && (txt[off] <= 0x20))
1147 	{
1148 		if (txt[off] == 0x0a || txt[off] == 0x0d)
1149 		{
1150 			s->off = off + 1;
1151 			SCWS_PUT_RES(off, 0.0, 1, attr_un);
1152 			return s->res0;
1153 		}
1154 		off++;
1155 	}
1156 
1157 	if (off >= len)
1158 		return NULL;
1159 
1160 	/* try to parse the sentence */
1161 	s->off = off;
1162 	ch = txt[off];
1163 	if (SCWS_CHAR_TOKEN(ch) && !(s->mode & SCWS_IGN_SYMBOL))
1164 	{
1165 		s->off++;
1166 		SCWS_PUT_RES(off, 0.0, 1, attr_un);
1167 		return s->res0;
1168 	}
1169 	clen = SCWS_CHARLEN(ch);
1170 	zlen = 1;
1171 	pflag = (clen > 1 ? PFLAG_WITH_MB : (SCWS_IS_ALNUM(ch) ? PFLAG_ALNUM : 0));
1172 	while ((off = (off+clen)) < len)
1173 	{
1174 		ch = txt[off];
1175 		if (ch <= 0x20 || SCWS_CHAR_TOKEN(ch)) break;
1176 		clen = SCWS_CHARLEN(ch);
1177 		if (!(pflag & PFLAG_WITH_MB))
1178 		{
1179 			// pure single-byte -> multibyte (2bytes)
1180 			if (clen == 1)
1181 			{
1182 				if (pflag & PFLAG_ALNUM)
1183 				{
1184 					if (SCWS_IS_ALPHA(ch))
1185 					{
1186 						if (!(pflag & PFLAG_LONGALPHA) && SCWS_IS_ALPHA(txt[off-1]))
1187 							pflag |= PFLAG_LONGALPHA;
1188 					}
1189 					else if (SCWS_IS_DIGIT(ch))
1190 					{
1191 						if (!(pflag & PFLAG_LONGDIGIT) && SCWS_IS_DIGIT(txt[off-1]))
1192 							pflag |= PFLAG_LONGDIGIT;
1193 					}
1194 					else
1195 						pflag ^= PFLAG_ALNUM;
1196 				}
1197 			}
1198 			else
1199 			{
1200 				if (!(pflag & PFLAG_ALNUM) || zlen > 2)
1201 					break;
1202 
1203 				pflag |= PFLAG_WITH_MB;
1204 				/* zlen = 1; */
1205 			}
1206 		}
1207 		else if ((pflag & PFLAG_WITH_MB) && clen == 1)
1208 		{
1209 			int i;
1210 
1211 			// mb + single-byte. allowd: alpha+num + 中文
1212 			if (!SCWS_IS_ALNUM(ch))
1213 				break;
1214 
1215 			pflag &= ~PFLAG_VALID;
1216 			// 夹在中文间的英文数字最多允许 2 个字符 (超过2可以独立成词没啥问题)
1217 			for (i = off+1; i < (off+3); i++)
1218 			{
1219 				ch = txt[i];
1220 				if ((i >= len) || (ch <= 0x20) || (SCWS_CHARLEN(ch) > 1))
1221 				{
1222 					pflag |= PFLAG_VALID;
1223 					break;
1224 				}
1225 
1226 				if (!SCWS_IS_ALNUM(ch))
1227 					break;
1228 			}
1229 
1230 			if (!(pflag & PFLAG_VALID))
1231 				break;
1232 
1233 			clen += (i - off - 1);
1234 		}
1235 		/* hightman.070813: add max zlen limit */
1236 		if (++zlen >= SCWS_MAX_ZLEN)
1237 		    break;
1238 	}
1239 
1240 	/* hightman.070624: 处理半个字的问题 */
1241 	if ((ch = off) > len)
1242 		off -= clen;
1243 
1244 	/* do the real segment */
1245 	if (off <= s->off)
1246 		return NULL;
1247 	else if (pflag & PFLAG_WITH_MB)
1248 		_scws_msegment(s, off, zlen);
1249 	else if (!(pflag & PFLAG_ALNUM) || ((off - s->off) >= SCWS_MAX_EWLEN))
1250 		_scws_ssegment(s, off);
1251 	else
1252 	{
1253 		zlen = off - s->off;
1254 		if ((pflag & (PFLAG_LONGALPHA|PFLAG_LONGDIGIT)) == (PFLAG_LONGALPHA|PFLAG_LONGDIGIT))
1255 			_scws_alnum_multi(s, s->off, zlen);
1256 		else
1257 		{
1258 			float idf;
1259 
1260 			idf = SCWS_EN_IDF(zlen);
1261 			SCWS_PUT_RES(s->off, idf, zlen, attr_en);
1262 
1263 			/* hightman.090523: 为字母数字混合再度拆解, 纯数字, (>1 ? 纯字母 : 数字+字母) */
1264 			if ((s->mode & SCWS_MULTI_DUALITY) && zlen > 2)
1265 				_scws_alnum_multi(s, s->off, zlen);
1266 		}
1267 	}
1268 
1269 	/* reutrn the result */
1270 	s->off = (ch > len ? len : off);
1271 	if (s->res0 == NULL)
1272 		return scws_get_result(s);
1273 
1274 	return s->res0;
1275 }
1276 
1277 /* free the result retunned by scws_get_result */
scws_free_result(scws_res_t result)1278 void scws_free_result(scws_res_t result)
1279 {
1280 	scws_res_t cur;
1281 
1282 	while ((cur = result) != NULL)
1283 	{
1284 		result = cur->next;
1285 		free(cur);
1286 	}
1287 }
1288 
/* top words count */
// xattr = "~v,p,c"  : leading '~' excludes words whose attribute is v, p or c
// xattr = "v,pn,c"  : only words whose attribute is v, pn or c are kept
1292 
_tops_cmp(a,b)1293 static int _tops_cmp(a, b)
1294 	scws_top_t *a,*b;
1295 {
1296 	if ((*b)->weight > (*a)->weight)
1297 		return 1;
1298 	return -1;
1299 }
1300 
/*
 * Pre-order walk of the tree rooted at `node`.
 * Each node's value is stored in values[*start] with its `word` pointing at
 * the node key, and *start is advanced by one per visited node.
 */
static void _tops_load_node(node_t node, scws_top_t *values, int *start)
{
	scws_top_t entry;

	if (!node)
		return;

	/* record this node first, then descend into both children */
	entry = node->value;
	values[*start] = entry;
	entry->word = node->key;
	*start += 1;

	_tops_load_node(node->left, values, start);
	_tops_load_node(node->right, values, start);
}
1315 
/*
 * Collect the values of every node of `xt` into `values`.
 * Visits each of the xt->prime trees in order; `values` must have room for
 * all stored entries.
 */
static void _tops_load_all(xtree_t xt, scws_top_t *values)
{
	int slot = 0;
	int filled = 0;

	while (slot < xt->prime)
	{
		_tops_load_node(xt->trees[slot], values, &filled);
		slot++;
	}
}
1323 
/* one attribute name, NUL terminated */
typedef char word_attr[4];

/*
 * Tell whether attribute `a` is a member of the list `at`.
 * The list ends at the first entry whose first byte is '\0'.
 * An empty list matches everything. Returns 1 on match, 0 otherwise.
 */
static inline int _attr_belong(const char *a, word_attr *at)
{
	word_attr *entry;

	/* an empty list places no restriction */
	if ((*at)[0] == '\0')
		return 1;

	for (entry = at; (*entry)[0] != '\0'; entry++)
	{
		if (strcmp(a, *entry) == 0)
			return 1;
	}
	return 0;
}
1335 
/*
 * macro to parse xattr -> xmode, at
 *
 * Expects these locals in the expanding function:
 *   char *xattr, char *word, int cnt, int xmode, word_attr *at (initially NULL).
 *
 * - A leading '~' sets xmode to SCWS_YEA.
 * - The rest is a comma separated list; at most the first two characters of
 *   each item are stored in `at`, which ends with an all-zero entry.
 * - Empty items (",," or a leading / trailing comma) are skipped, so the
 *   number of stored items can never exceed the (strlen/2 + 2) allocation.
 * - `at` stays NULL when xattr is NULL or empty, when no usable item was
 *   found, or when the allocation fails. Otherwise it must be free()d.
 */
#define	__PARSE_XATTR__		do {										\
	if (xattr == NULL) break;											\
	if (*xattr == '~') { xattr++; xmode = SCWS_YEA; }					\
	if (*xattr == '\0') break;											\
	at = (word_attr *) calloc((strlen(xattr)/2) + 2, sizeof(word_attr));	\
	if (at == NULL) break;												\
	for (cnt = 0; (word = strchr(xattr, ',')); xattr = word + 1) {		\
		if (word == xattr) continue;									\
		at[cnt][0] = xattr[0];											\
		at[cnt][1] = ((word - xattr) > 1) ? xattr[1] : '\0';			\
		cnt++;															\
	}																	\
	strncpy(at[cnt], xattr, 2);											\
	if (at[0][0] == '\0') { free(at); at = NULL; }						\
} while (0)
1352 
scws_get_tops(scws_t s,int limit,char * xattr)1353 scws_top_t scws_get_tops(scws_t s, int limit, char *xattr)
1354 {
1355 	int off, cnt, xmode = SCWS_NA;
1356 	xtree_t xt;
1357 	scws_res_t res, cur;
1358 	scws_top_t top, *list, tail, base;
1359 	char *word;
1360 	word_attr *at = NULL;
1361 
1362 	if (!s || !s->txt || !(xt = xtree_new(0,1)))
1363 		return NULL;
1364 
1365 	__PARSE_XATTR__;
1366 
1367 	// save the offset.
1368 	off = s->off;
1369 	s->off = cnt = 0;
1370 	while ((cur = res = scws_get_result(s)) != NULL)
1371 	{
1372 		do
1373 		{
1374 			if (cur->idf < 0.2 || cur->attr[0] == '#')
1375 				continue;
1376 
1377 			/* check attribute filter */
1378 			if (at != NULL)
1379 			{
1380 				if ((xmode == SCWS_NA) && !_attr_belong(cur->attr, at))
1381 					continue;
1382 
1383 				if ((xmode == SCWS_YEA) && _attr_belong(cur->attr, at))
1384 					continue;
1385 			}
1386 
1387 			/* check stopwords */
1388 			if (!strncmp(cur->attr, attr_en, 2) && cur->len > 6)
1389 			{
1390 				word = _mem_ndup(s->txt + cur->off, cur->len);
1391 				_str_tolower(word, word);
1392 				if (SCWS_IS_NOSTATS(word, cur->len))
1393 				{
1394 					free(word);
1395 					continue;
1396 				}
1397 				free(word);
1398 			}
1399 
1400 			/* put to the stats */
1401 			if (!(top = xtree_nget(xt, s->txt + cur->off, cur->len, NULL)))
1402 			{
1403 				top = (scws_top_t) pmalloc_z(xt->p, sizeof(struct scws_topword));
1404 				top->weight = cur->idf;
1405 				top->times = 1;
1406 				strncpy(top->attr, cur->attr, 2);
1407 				xtree_nput(xt, top, sizeof(struct scws_topword), s->txt + cur->off, cur->len);
1408 				cnt++;
1409 			}
1410 			else
1411 			{
1412 				top->weight += cur->idf;
1413 				top->times++;
1414 			}
1415 		}
1416 		while ((cur = cur->next) != NULL);
1417 		scws_free_result(res);
1418 	}
1419 
1420 	// free at
1421 	if (at != NULL)
1422 		free(at);
1423 	top = NULL;
1424 	if (cnt > 0)
1425 	{
1426 		/* sort the list */
1427 		list = (scws_top_t *) malloc(sizeof(scws_top_t) * cnt);
1428 		_tops_load_all(xt, list);
1429 		qsort(list, cnt, sizeof(scws_top_t), _tops_cmp);
1430 
1431 		/* save to return pointer */
1432 		if (!limit || limit > cnt)
1433 			limit = cnt;
1434 
1435 		top = tail = (scws_top_t) malloc(sizeof(struct scws_topword));
1436 		memcpy(top, list[0], sizeof(struct scws_topword));
1437 		top->word = strdup(list[0]->word);
1438 		top->next = NULL;
1439 
1440 		for (cnt = 1; cnt < limit; cnt++)
1441 		{
1442 			base = (scws_top_t) malloc(sizeof(struct scws_topword));
1443 			memcpy(base, list[cnt], sizeof(struct scws_topword));
1444 			base->word = strdup(list[cnt]->word);
1445 			base->next = NULL;
1446 			tail->next = base;
1447 			tail = base;
1448 		}
1449 		free(list);
1450 	}
1451 
1452 	// restore the offset
1453 	s->off = off;
1454 	xtree_free(xt);
1455 	return top;
1456 }
1457 
1458 // word check by attr.
scws_has_word(scws_t s,char * xattr)1459 int scws_has_word(scws_t s, char *xattr)
1460 {
1461 	int off, cnt, xmode = SCWS_NA;
1462 	scws_res_t res, cur;
1463 	char *word;
1464 	word_attr *at = NULL;
1465 
1466 	if (!s || !s->txt)
1467 		return 0;
1468 
1469 	__PARSE_XATTR__;
1470 
1471 	// save the offset. (cnt -> return_value)
1472 	off = s->off;
1473 	cnt = s->off = 0;
1474 	while (!cnt && (cur = res = scws_get_result(s)) != NULL)
1475 	{
1476 		do
1477 		{
1478 			/* check attribute filter */
1479 			if (at != NULL)
1480 			{
1481 				if ((xmode == SCWS_NA) && _attr_belong(cur->attr, at))
1482 					cnt = 1;
1483 
1484 				if ((xmode == SCWS_YEA) && !_attr_belong(cur->attr, at))
1485 					cnt = 1;
1486 			}
1487 		}
1488 		while (!cnt && (cur = cur->next) != NULL);
1489 		scws_free_result(res);
1490 	}
1491 	// memory leak fixed, thanks to lauxinz
1492 	if (at != NULL)
1493 		free(at);
1494 	s->off = off;
1495 	return cnt;
1496 }
1497 
1498 // get words by attr (rand order)
scws_get_words(scws_t s,char * xattr)1499 scws_top_t scws_get_words(scws_t s, char *xattr)
1500 {
1501 	int off, cnt, xmode = SCWS_NA;
1502 	xtree_t xt;
1503 	scws_res_t res, cur;
1504 	scws_top_t top, tail, base;
1505 	char *word;
1506 	word_attr *at = NULL;
1507 
1508 	if (!s || !s->txt || !(xt = xtree_new(0,1)))
1509 		return NULL;
1510 
1511 	__PARSE_XATTR__;
1512 
1513 	// save the offset.
1514 	off = s->off;
1515 	s->off = 0;
1516 	base = tail = NULL;
1517 	while ((cur = res = scws_get_result(s)) != NULL)
1518 	{
1519 		do
1520 		{
1521 			/* check attribute filter */
1522 			if (at != NULL)
1523 			{
1524 				if ((xmode == SCWS_NA) && !_attr_belong(cur->attr, at))
1525 					continue;
1526 
1527 				if ((xmode == SCWS_YEA) && _attr_belong(cur->attr, at))
1528 					continue;
1529 			}
1530 
1531 			/* put to the stats */
1532 			if (!(top = xtree_nget(xt, s->txt + cur->off, cur->len, NULL)))
1533 			{
1534 				top = (scws_top_t) malloc(sizeof(struct scws_topword));
1535 				top->weight = cur->idf;
1536 				top->times = 1;
1537 				top->next = NULL;
1538 				top->word = (char *)_mem_ndup(s->txt + cur->off, cur->len);
1539 				strncpy(top->attr, cur->attr, 2);
1540 				// add to the chain
1541 				if (tail == NULL)
1542 					base = tail = top;
1543 				else
1544 				{
1545 					tail->next = top;
1546 					tail = top;
1547 				}
1548 				xtree_nput(xt, top, sizeof(struct scws_topword), s->txt + cur->off, cur->len);
1549 			}
1550 			else
1551 			{
1552 				top->weight += cur->idf;
1553 				top->times++;
1554 			}
1555 		}
1556 		while ((cur = cur->next) != NULL);
1557 		scws_free_result(res);
1558 	}
1559 
1560 	// free at & xtree
1561 	if (at != NULL)
1562 		free(at);
1563 	xtree_free(xt);
1564 
1565 	// restore the offset
1566 	s->off = off;
1567 	return base;
1568 }
1569 
/*
 * Release a word list: frees each node together with its `word` string.
 * A NULL list is a no-op.
 */
void scws_free_tops(scws_top_t tops)
{
	while (tops != NULL)
	{
		scws_top_t following = tops->next;

		/* free(NULL) does nothing, so a missing word needs no special case */
		free(tops->word);
		free(tops);
		tops = following;
	}
}
1582