1 /*
2 * @file scws.c (core segment functions)
3 * @author Hightman Mar
4 * @editor set number ; syntax on ; set autoindent ; set tabstop=4 (vim)
5 * $Id $
6 */
7
8 #ifdef HAVE_CONFIG_H
9 # include "config.h"
10 #endif
11
12 #ifdef WIN32
13 # include "config_win32.h"
14 #endif
15
16 #include "scws.h"
17 #include "xdict.h"
18 #include "rule.h"
19 #include "charset.h"
20 #include "darray.h"
21 #include "xtree.h"
22 #include <stdio.h>
23 #include <math.h>
24 #include <stdlib.h>
25 #include <string.h>
26
/*
 * Shorthand macros used throughout this file.
 * SCWS_IS_SPECIAL, SCWS_IS_NOSTATS and SCWS_CHARLEN expect a local variable
 * named `s` (the scws_t engine) to be in scope at the point of use.
 */

/* Rule bit tests for the byte string x of length l. */
#define SCWS_IS_SPECIAL(x,l)    scws_rule_checkbit(s->r, x, l, SCWS_RULE_SPECIAL)
#define SCWS_IS_NOSTATS(x,l)    scws_rule_checkbit(s->r, x, l, SCWS_RULE_NOSTATS)

/* Entry of the engine's charset table for byte x (used as a character byte length). */
#define SCWS_CHARLEN(x)         (s->mblen[(x)])

/* ASCII character classes (each argument is evaluated more than once). */
#define SCWS_IS_DIGIT(x)        ((x) >= '0' && (x) <= '9')
#define SCWS_IS_UALPHA(x)       ((x) >= 'A' && (x) <= 'Z')
#define SCWS_IS_ALPHA(x)        (SCWS_IS_UALPHA(x) || ((x) >= 'a' && (x) <= 'z'))
#define SCWS_IS_ALNUM(x)        (SCWS_IS_DIGIT(x) || SCWS_IS_ALPHA(x))

/* Tests on the per-character flag word. */
#define SCWS_IS_WHEAD(x)        ((x) & SCWS_ZFLAG_WHEAD)
#define SCWS_IS_ECHAR(x)        ((x) & SCWS_ZFLAG_ENGLISH)

/*
 * True when a character must not take part in the automatic rules: it is a
 * symbol or single-byte cell, or it is a word head without the NR2 mark.
 */
#define SCWS_NO_RULE1(x)        (((x) & (SCWS_ZFLAG_SYMBOL | SCWS_ZFLAG_ENGLISH)) \
                                || (((x) & (SCWS_ZFLAG_WHEAD | SCWS_ZFLAG_NR2)) == SCWS_ZFLAG_WHEAD))
/* A separate variant keyed on SCWS_ZFLAG_N2 is disabled; rule 2 reuses rule 1. */
#define SCWS_NO_RULE2           SCWS_NO_RULE1

/* Size of the run-length buffer / length cap for single-byte words. */
#define SCWS_MAX_EWLEN          33

/* hightman.070706: single characters that are always returned as their own token */
#define SCWS_CHAR_TOKEN(x)      ((x) == '(' || (x) == ')' || (x) == '[' || (x) == ']' \
                                || (x) == '{' || (x) == '}' || (x) == ':' || (x) == '"')

/* hightman.070814: maximum number of characters handed to one segment pass */
#define SCWS_MAX_ZLEN           128

/* idf assigned to a single-byte word of length x */
#define SCWS_EN_IDF(x)          ((float) (2.5 * logf(x)))

/* Attribute strings referenced in this file. */
static const char *attr_en = "en";
static const char *attr_un = "un";
static const char *attr_nr = "nr";
static const char *attr_na = "!";
51
52 /* create scws engine */
scws_new()53 scws_t scws_new()
54 {
55 scws_t s;
56 s = (scws_t) malloc(sizeof(scws_st));
57 if (s == NULL)
58 return s;
59 memset(s, 0, sizeof(scws_st));
60 s->mblen = charset_table_get(NULL);
61 s->off = s->len = 0;
62 s->wend = -1;
63
64 return s;
65 }
66
67 /* hightman.110320: fork scws */
scws_fork(scws_t p)68 scws_t scws_fork(scws_t p)
69 {
70 scws_t s = scws_new();
71
72 if (p != NULL && s != NULL)
73 {
74 s->mblen = p->mblen;
75 s->mode = p->mode;
76 // fork dict/rules
77 s->r = scws_rule_fork(p->r);
78 s->d = xdict_fork(p->d);
79 }
80
81 return s;
82 }
83
84 /* close & free the engine */
scws_free(scws_t s)85 void scws_free(scws_t s)
86 {
87 if (s->d)
88 {
89 xdict_close(s->d);
90 s->d = NULL;
91 }
92 if (s->r)
93 {
94 scws_rule_free(s->r);
95 s->r = NULL;
96 }
97 free(s);
98 }
99
100 /* add a dict into scws */
scws_add_dict(scws_t s,const char * fpath,int mode)101 int scws_add_dict(scws_t s, const char *fpath, int mode)
102 {
103 xdict_t xx;
104 if (mode & SCWS_XDICT_SET)
105 {
106 xdict_close(s->d);
107 mode ^= SCWS_XDICT_SET;
108 s->d = NULL;
109 }
110 xx = s->d;
111 s->d = xdict_add(s->d, fpath, mode, s->mblen);
112 return (xx == s->d ? -1 : 0);
113 }
114
115 /* set the dict & open it */
scws_set_dict(scws_t s,const char * fpath,int mode)116 int scws_set_dict(scws_t s, const char *fpath, int mode)
117 {
118 return scws_add_dict(s, fpath, mode | SCWS_XDICT_SET);
119 }
120
/* Select the character-length table for charset name cs (via charset_table_get). */
void scws_set_charset(scws_t s, const char *cs)
{
    s->mblen = charset_table_get(cs);
}
125
/*
 * Load the rule set from fpath, releasing any rule set the engine already holds.
 * s->r receives whatever scws_rule_new() returns.
 */
void scws_set_rule(scws_t s, const char *fpath)
{
    if (s->r != NULL)
    {
        scws_rule_free(s->r);
    }

    s->r = scws_rule_new(fpath, s->mblen);
}
133
134 /* set ignore symbol or multi segments */
scws_set_ignore(scws_t s,int yes)135 void scws_set_ignore(scws_t s, int yes)
136 {
137 if (yes == SCWS_YEA)
138 s->mode |= SCWS_IGN_SYMBOL;
139
140 if (yes == SCWS_NA)
141 s->mode &= ~SCWS_IGN_SYMBOL;
142 }
143
/*
 * Set the multi-segment options.
 * The SCWS_MULTI_MASK bits of the engine mode are replaced by the
 * corresponding bits of `mode`; bits of `mode` outside the mask are ignored
 * so that this setter cannot flip unrelated engine flags.
 */
void scws_set_multi(scws_t s, int mode)
{
    s->mode &= ~SCWS_MULTI_MASK;
    s->mode |= (mode & SCWS_MULTI_MASK);
}
151
/*
 * Turn the SCWS_DEBUG mode bit on (SCWS_YEA) or off (SCWS_NA).
 * Any other value of yes leaves the mode untouched.
 */
void scws_set_debug(scws_t s, int yes)
{
    if (yes == SCWS_YEA)
    {
        s->mode |= SCWS_DEBUG;
    }
    else if (yes == SCWS_NA)
    {
        s->mode &= ~SCWS_DEBUG;
    }
}
160
/*
 * Turn the SCWS_DUALITY mode bit (pairing of stray single characters) on
 * (SCWS_YEA) or off (SCWS_NA).  Any other value of yes leaves the mode untouched.
 */
void scws_set_duality(scws_t s, int yes)
{
    if (yes == SCWS_YEA)
    {
        s->mode |= SCWS_DUALITY;
    }
    else if (yes == SCWS_NA)
    {
        s->mode &= ~SCWS_DUALITY;
    }
}
169
170 /* send the text buffer & init some others */
scws_send_text(scws_t s,const char * text,int len)171 void scws_send_text(scws_t s, const char *text, int len)
172 {
173 s->txt = (unsigned char *) text;
174 s->len = len;
175 s->off = 0;
176 }
177
/*
 * Append one result node to the engine's current result list
 * (s->res0 is the head, s->res1 the tail).  Expects a local `s` in scope.
 *   o - byte offset of the word in the text
 *   i - idf value
 *   l - byte length of the word
 *   a - attribute string (only the first two characters are kept)
 * If the node cannot be allocated the result is dropped instead of
 * dereferencing a NULL pointer.
 */
#define SCWS_PUT_RES(o,i,l,a)                                           \
do {                                                                    \
    scws_res_t res;                                                     \
    res = (scws_res_t) malloc(sizeof(struct scws_result));              \
    if (res == NULL)                                                    \
        break;                                                          \
    res->off = o;                                                       \
    res->idf = i;                                                       \
    res->len = l;                                                       \
    strncpy(res->attr, a, 2);                                           \
    res->attr[2] = '\0';                                                \
    res->next = NULL;                                                   \
    if (s->res1 == NULL)                                                \
        s->res1 = s->res0 = res;                                        \
    else                                                                \
    {                                                                   \
        s->res1->next = res;                                            \
        s->res1 = res;                                                  \
    }                                                                   \
} while(0)
197
/* single bytes segment (pure single-byte characters): parser state flags */
#define PFLAG_WITH_MB       (1 << 0)    /* the segment contains multibyte characters */
#define PFLAG_ALNUM         (1 << 1)    /* only ASCII letters/digits seen so far */
#define PFLAG_VALID         (1 << 2)    /* short alnum run after multibyte text is acceptable */
#define PFLAG_DIGIT         (1 << 3)    /* current run is made of digits */
#define PFLAG_ADDSYM        (1 << 4)    /* a joining symbol / class change was accepted */
#define PFLAG_ALPHA         (1 << 5)    /* current run is made of letters */
#define PFLAG_LONGDIGIT     (1 << 6)    /* two adjacent digits were seen */
#define PFLAG_LONGALPHA     (1 << 7)    /* two adjacent letters were seen */
207
/*
 * Copy the NUL-terminated string src to dst, converting ASCII 'a'..'z' to
 * upper case.  src and dst may be the same buffer.  The terminating NUL is
 * not written to dst.
 */
static void _str_toupper(char *src, char *dst)
{
    for (; *src != '\0'; src++, dst++)
    {
        char c = *src;

        *dst = (c >= 'a' && c <= 'z') ? (char) (c ^ 0x20) : c;
    }
}
217
/*
 * Copy the NUL-terminated string src to dst, converting ASCII 'A'..'Z' to
 * lower case.  src and dst may be the same buffer.  The terminating NUL is
 * not written to dst.
 */
static void _str_tolower(char *src, char *dst)
{
    for (; *src != '\0'; src++, dst++)
    {
        char c = *src;

        *dst = (c >= 'A' && c <= 'Z') ? (char) (c ^ 0x20) : c;
    }
}
227
#ifdef HAVE_STRNDUP
#define _mem_ndup       strndup
#else
/*
 * Duplicate the first len bytes of src into a freshly allocated,
 * NUL-terminated buffer.
 * Returns the new buffer (to be released with free()), or NULL when src is
 * NULL, len is negative or the allocation fails.
 */
static inline void *_mem_ndup(const char *src, int len)
{
    char *dst;

    if (src == NULL || len < 0)
        return NULL;

    dst = malloc((size_t) len + 1);
    if (dst == NULL)
        return NULL;

    memcpy(dst, src, (size_t) len);
    dst[len] = '\0';
    return dst;
}
#endif
240
/*
 * Split the single-byte word txt[start .. start+wlen) into runs of the same
 * class (digits, letters, anything else) and emit extra "en" results for them.
 *
 * Nothing is emitted when the whole word is a single run.  Otherwise:
 *  - runs that do not start with a letter/digit are skipped;
 *  - a letter/digit run longer than one byte is emitted as is;
 *  - a one-byte letter/digit run is combined with its neighbours: with the
 *    preceding run (when that run is longer than one byte) and with the
 *    following run, or emitted alone when the adjacent byte is not a
 *    letter/digit.
 */
static void _scws_alnum_multi(scws_t s, int start, int wlen)
{
    char chunk[SCWS_MAX_EWLEN];     /* length of every run, in order */
    unsigned char *txt = s->txt;
    int pos, last, run_start, cur, cls, at;
    float idf;

    /* pass 1: cut the word wherever the character class changes */
    cur = 0;
    last = 0;
    run_start = 0;
    for (pos = 0; pos < wlen; pos++)
    {
        int c = txt[start + pos];

        if (SCWS_IS_DIGIT(c))
            cls = PFLAG_DIGIT;
        else if (SCWS_IS_ALPHA(c))
            cls = PFLAG_ALPHA;
        else
            cls = PFLAG_ADDSYM;

        /* still inside the current run */
        if (cur & cls)
            continue;

        /* a new run begins: close the previous one (if any) */
        if (cur != 0)
        {
            chunk[last++] = (char) (pos - run_start);
            run_start = pos;
        }
        cur = cls;
    }

    /* a single run means there is nothing to split */
    if (last <= 0)
        return;

    /* close the final run; `last` is now the index of the last run */
    chunk[last] = (char) (pos - run_start);

    /* pass 2: walk the runs, `at` being the text offset of run `pos` */
    at = start;
    for (pos = 0; pos <= last; pos++)
    {
        if (!SCWS_IS_ALNUM(txt[at]))
        {
            /* symbol run: just skip */
        }
        else if (chunk[pos] != 1)
        {
            idf = SCWS_EN_IDF(chunk[pos]);
            SCWS_PUT_RES(at, idf, chunk[pos], attr_en);
        }
        else
        {
            /* one-byte run: try to glue it to the run before it ... */
            if (pos > 0 && chunk[pos-1] > 1 && !(pos == 1 && last == 1))
            {
                if (SCWS_IS_ALNUM(txt[at-1]))
                {
                    idf = SCWS_EN_IDF(chunk[pos-1]+1);
                    SCWS_PUT_RES(at - chunk[pos-1], idf, chunk[pos-1]+1, attr_en);
                }
                else
                {
                    idf = SCWS_EN_IDF(chunk[pos]);
                    SCWS_PUT_RES(at, idf, chunk[pos], attr_en);
                }
            }
            /* ... and to the run after it */
            if (pos < last && !(pos == 0 && last == 1))
            {
                if (SCWS_IS_ALNUM(txt[at+1]))
                {
                    idf = SCWS_EN_IDF(chunk[pos+1]+1);
                    SCWS_PUT_RES(at, idf, chunk[pos+1]+1, attr_en);
                }
                else
                {
                    idf = SCWS_EN_IDF(chunk[pos]);
                    SCWS_PUT_RES(at, idf, chunk[pos], attr_en);
                }
            }
        }
        at += chunk[pos];
    }
}
336
/*
 * Segment the pure single-byte range txt[s->off .. end) and append the
 * results to the engine's result list.
 *
 * Order of checks:
 *  1. the whole range, upper-cased, is looked up as a "special" word
 *     (emitted with attribute "nz");
 *  2. dotted abbreviations such as S.H.E or M.R. are emitted as one "nz" word;
 *  3. otherwise the range is scanned byte by byte into letter/digit words
 *     ("en") and single symbols ("un", unless SCWS_IGN_SYMBOL is set).
 *
 * NOTE(review): the scanner peeks at txt[start+1]; for the last byte of the
 * range this is txt[end], which is only inside the buffer when end < s->len
 * or the text is NUL-terminated -- confirm with callers.
 */
static void _scws_ssegment(scws_t s, int end)
{
    int start, wlen, ch, pflag, ipflag = 0;
    unsigned char *txt;
    char *upper;
    float idf;

    start = s->off;
    wlen = end - start;

    /* check special words (need strtoupper) */
    if (wlen > 1)
    {
        upper = (char *) _mem_ndup((const char *) s->txt + start, wlen);
        /* if the copy cannot be allocated the special-word check is skipped */
        if (upper != NULL)
        {
            _str_toupper(upper, upper);
            if (SCWS_IS_SPECIAL(upper, wlen))
            {
                SCWS_PUT_RES(start, 9.5, wlen, "nz");
                free(upper);
                return;
            }
            free(upper);
        }
    }

    txt = s->txt;
    /* check brief words such as S.H.E M.R. (needs at least two bytes) */
    if (wlen > 1 && SCWS_IS_ALPHA(txt[start]) && txt[start+1] == '.')
    {
        for (ch = start + 2; ch < end; ch++)
        {
            if (!SCWS_IS_ALPHA(txt[ch])) break;
            ch++;
            if (ch == end || txt[ch] != '.') break;
        }
        if (ch == end)
        {
            SCWS_PUT_RES(start, 7.5, wlen, "nz");
            return;
        }
    }

    /*
     * Extract words and punctuation.  A number may contain one dot when the
     * byte after the dot is a digit (dots must not be consecutive).  Letters
     * may contain one (non-consecutive) apostrophe.
     */
    while (start < end)
    {
        ch = txt[start++];
        /* leave "IP address" mode as soon as something other than '.' or a digit shows up */
        if (ipflag && ch != 0x2e && !SCWS_IS_DIGIT(ch))
            ipflag = 0;
        if (SCWS_IS_ALNUM(ch))
        {
            pflag = SCWS_IS_DIGIT(ch) ? PFLAG_DIGIT : 0;
            wlen = 1;
            while (start < end)
            {
                ch = txt[start];
                if (pflag & PFLAG_DIGIT)
                {
                    if (!SCWS_IS_DIGIT(ch))
                    {
                        // check percent % = 0x25
                        if (ch == 0x25 && !SCWS_IS_DIGIT(txt[start+1]))
                        {
                            start++;
                            wlen++;
                            break;
                        }
                        if (ipflag)
                            break;
                        // special for IP address or version number? (find out all digit + dot)
                        if (ch == 0x2e && (pflag & PFLAG_ADDSYM))
                        {
                            ipflag = 1;
                            /* step back to the previous dot so that every part is emitted separately */
                            while(--wlen && txt[--start] != 0x2e);
                            pflag = 0;
                            break;
                        }
                        // wlen = 1: one digit followed by a letter switches to letter mode
                        if (wlen == 1 && SCWS_IS_ALPHA(ch))
                        {
                            pflag ^= PFLAG_DIGIT;
                            pflag |= PFLAG_ADDSYM;
                            continue;
                        }
                        // only one '.' followed by a digit may join a number
                        if ((pflag & PFLAG_ADDSYM) || !(ch == 0x2e && SCWS_IS_DIGIT(txt[start+1])))
                            break;
                        pflag |= PFLAG_ADDSYM;
                    }
                }
                else
                {
                    /* hightman.110419: '-' (0x2d) between letters joins them, '_' (0x5f) joins as well */
                    if ((ch == 0x2d || ch == 0x5f) && SCWS_IS_ALPHA(txt[start+1]))
                        pflag |= PFLAG_ADDSYM;
                    else if (!SCWS_IS_ALPHA(ch))
                    {
                        /* one apostrophe followed by a letter, or one digit not followed by a digit, may join */
                        if ((pflag & PFLAG_ADDSYM)
                            || !((ch == 0x27 && SCWS_IS_ALPHA(txt[start+1]))
                                || (SCWS_IS_DIGIT(ch) && !SCWS_IS_DIGIT(txt[start+1]))))
                        {
                            break;
                        }
                        pflag |= PFLAG_ADDSYM;
                    }
                }
                start++;
                wlen++;
                if (wlen >= SCWS_MAX_EWLEN)
                    break;
            }
            idf = SCWS_EN_IDF(wlen);
            SCWS_PUT_RES(start-wlen, idf, wlen, attr_en);
            if ((s->mode & SCWS_MULTI_DUALITY) && (pflag & PFLAG_ADDSYM))
                _scws_alnum_multi(s, start-wlen, wlen);
        }
        else if (!(s->mode & SCWS_IGN_SYMBOL))
        {
            SCWS_PUT_RES(start-1, 0.0, 1, attr_un);
        }
    }
}
456
457 /* multibyte segment */
_scws_mget_word(scws_t s,int i,int j)458 static int _scws_mget_word(scws_t s, int i, int j)
459 {
460 int r, k;
461 word_t item;
462
463 if (!(s->wmap[i][i]->flag & SCWS_ZFLAG_WHEAD))
464 return i;
465
466 for (r=i, k=i+1; k <= j; k++)
467 {
468 item = s->wmap[i][k];
469 if (item && (item->flag & SCWS_WORD_FULL))
470 {
471 r = k;
472 if (!(item->flag & SCWS_WORD_PART))
473 break;
474 }
475 }
476 return r;
477 }
478
/*
 * Emit the single character cell z (word-map entry [z][z]) as a result.
 * When mark is non-zero the cell is flagged SCWS_ZFLAG_PUT afterwards.
 */
static void _scws_put_zchar(scws_t s, int z, int mark)
{
    word_t cell = s->wmap[z][z];

    SCWS_PUT_RES(s->zmap[z].start, cell->idf, (s->zmap[z].end - s->zmap[z].start), cell->attr);
    if (mark)
        cell->flag |= SCWS_ZFLAG_PUT;
}

/*
 * Emit the word covering characters i..j (word-map entry [i][j]) plus any
 * extra results requested by the engine mode:
 *  - SCWS_DUALITY: stray single characters are held back (s->zis) and paired
 *    with the following single character;
 *  - SCWS_MULTI_SHORT / SCWS_MULTI_DUALITY: a long word is additionally split
 *    into shorter words / overlapping character pairs;
 *  - SCWS_MULTI_ZMAIN / SCWS_MULTI_ZALL: additionally emit single characters.
 */
static void _scws_mset_word(scws_t s, int i, int j)
{
    word_t **wmap = s->wmap;
    struct scws_zchar *zmap = s->zmap;
    word_t item = wmap[i][j];

    /* hightman.070705: item == NULL check added to guard against unsigned char
     * overflow with overly long words (more than 255 characters) */
    if (item == NULL)
        return;

    /* symbols are dropped when the engine ignores them */
    if ((s->mode & SCWS_IGN_SYMBOL) && !SCWS_IS_ECHAR(item->flag)
        && !memcmp(item->attr, attr_un, 2))
        return;

    /* hightman.070701: automatic pairing of stray single characters */
    if (s->mode & SCWS_DUALITY)
    {
        int prev = s->zis;
        int stray = (i == j && !SCWS_IS_ECHAR(item->flag) && memcmp(item->attr, attr_un, 2));

        if (stray)
        {
            /* remember this character; it may pair with the next one */
            s->zis = i;
            if (prev < 0)
                return;

            i = (prev & ~SCWS_ZIS_USED);
            if (i != (j - 1))
            {
                /* the remembered character is not adjacent: emit it alone */
                _scws_put_zchar(s, i, 0);
                return;
            }
            if (!(prev & SCWS_ZIS_USED) && s->wend == i)
                _scws_put_zchar(s, i, 0);
            s->zis |= SCWS_ZIS_USED;
        }
        else
        {
            /* flush the remembered character before a normal word */
            if ((prev >= 0) && (!(prev & SCWS_ZIS_USED) || (j > i)))
                _scws_put_zchar(s, prev & ~SCWS_ZIS_USED, 0);
            if (j > i)
                s->wend = j + 1;
            s->zis = -1;
        }
    }

    /* the word itself (or the pair formed above) */
    SCWS_PUT_RES(zmap[i].start, item->idf, (zmap[j].end - zmap[i].start), item->attr);

    // hightman.070902: multi segment
    // step1: split to short words
    if ((j - i) > 1)
    {
        int m = i;
        int n, best;

        if (s->mode & SCWS_MULTI_SHORT)
        {
            while (m < j)
            {
                word_t part;

                best = m;
                // hightman.111223: multi short enhanced
                for (n = m + 1; n <= j; n++)
                {
                    // 3 chars at most, and never the whole word again
                    if ((n == j && m == i) || (n - m) > 2)
                        break;
                    part = wmap[m][n];
                    if (part == NULL)
                        continue;
                    // first shortest or last longest word
                    if ((part->flag & SCWS_WORD_FULL) && (best == m || n == j))
                        best = n;
                    if (!(part->flag & SCWS_WORD_PART))
                        break;
                }
                // short word not found, stop to find, passed to next loop
                if (best == m)
                    break;

                // save the short word
                part = wmap[m][best];
                SCWS_PUT_RES(zmap[m].start, part->idf, (zmap[best].end - zmap[m].start), part->attr);

                // find the next word or go to prev for duality last word
                m = best + 1;
                if (m == j)
                {
                    m--;
                    break;
                }
            }
        }

        if (s->mode & SCWS_MULTI_DUALITY)
        {
            while (m < j)
            {
                if (SCWS_IS_ECHAR(wmap[m][m]->flag))
                {
                    _scws_put_zchar(s, m, 1);
                }
                else if (SCWS_IS_ECHAR(wmap[m+1][m+1]->flag))
                {
                    if (m == i)
                        _scws_put_zchar(s, m, 1);
                    m++;
                    _scws_put_zchar(s, m, 1);
                }
                else
                {
                    /* pair of characters m and m+1, with the idf/attr of character m */
                    SCWS_PUT_RES(zmap[m].start, wmap[m][m]->idf, (zmap[m+1].end - zmap[m].start), wmap[m][m]->attr);
                }
                m++;
                if (m == j && (SCWS_IS_ECHAR(wmap[m][m]->flag) || SCWS_IS_ECHAR(wmap[m-1][m-1]->flag)))
                    _scws_put_zchar(s, m, 1);
            }
        }
    }

    // step2, split to single char
    if ((j > i) && (s->mode & (SCWS_MULTI_ZMAIN|SCWS_MULTI_ZALL)))
    {
        if ((j - i) == 1 && !wmap[i][j])
        {
            if (wmap[i][i]->flag & SCWS_ZFLAG_PUT)
                i++;
            else
                wmap[i][i]->flag |= SCWS_ZFLAG_PUT;
            wmap[j][j]->flag |= SCWS_ZFLAG_PUT;
        }
        /* i <= j always holds on entry, so the body runs at least once */
        for (; i <= j; i++)
        {
            if (wmap[i][i]->flag & SCWS_ZFLAG_PUT)
                continue;
            /* without ZALL only attributes starting with j, n or v are kept */
            if (!(s->mode & SCWS_MULTI_ZALL) && !strchr("jnv", wmap[i][i]->attr[0]))
                continue;
            _scws_put_zchar(s, i, 0);
        }
    }
}
616
/*
 * Segment the zone of characters f..t (a maximal run of characters linked
 * by dictionary words) and emit the best path through it.
 *
 * Every candidate word [i][j] in the zone is taken in turn as the "keyword"
 * of a path: the characters before and after it are covered greedily with
 * _scws_mget_word(), and the path is scored from the tf of its words, their
 * length (to the 4th power), the attribute ratios from the rule set and the
 * number of pieces.  The best-scoring path is passed word by word to
 * _scws_mset_word().
 *
 * A path is an array of (word length - 1) bytes terminated by 0xff.
 * If a path buffer cannot be allocated the search stops and the best path
 * found so far (if any) is used.
 */
static void _scws_mseg_zone(scws_t s, int f, int t)
{
    unsigned char *mpath, *npath;
    word_t **wmap;
    int x,i,j,m,n,j2,sz;
    double weight, nweight;
    char attr1[3];

    mpath = npath = NULL;
    weight = nweight = (double) 0.0;

    wmap = s->wmap;
    j2 = 0;
    for (x = i = f; i <= t; i++)
    {
        j = _scws_mget_word(s, i, (x > i ? x - 1 : t));
        if (j == i) continue;
        // skip NR in NR
        if (j < j2 && wmap[i][j]->attr[0] == 'n' && wmap[i][j]->attr[1] == 'r') continue;
        if (i > j2 && (wmap[i][j]->flag & SCWS_WORD_USED)) continue;

        /* one word only */
        if (i == f && j == t)
        {
            mpath = (unsigned char *) malloc(2);
            if (mpath != NULL)
            {
                mpath[0] = j - i;
                mpath[1] = 0xff;
            }
            break;
        }

        if (i != f && (wmap[i][j]->flag & SCWS_WORD_RULE))
            continue;

        /* create the new path */
        wmap[i][j]->flag |= SCWS_WORD_USED;
        nweight = (double) wmap[i][j]->tf * pow(j-i,4);

        if (npath == NULL)
        {
            npath = (unsigned char *) malloc(t-f+2);
            if (npath == NULL)
                break;
            memset(npath, 0xff, t-f+2);
        }

        /* lookfor backward */
        x = sz = 0;
        memset(attr1, 0, sizeof(attr1));
        for (m = f; m < i; m = n+1)
        {
            n = _scws_mget_word(s, m, i-1);
            nweight *= wmap[m][n]->tf;
            npath[x++] = n - m;
            if (n > m)
            {
                nweight *= pow(n-m,4);
                wmap[m][n]->flag |= SCWS_WORD_USED;
            }
            else sz++;

            if (attr1[0] != '\0')
                nweight *= scws_rule_attr_ratio(s->r, attr1, wmap[m][n]->attr, &npath[x-2]);
            memcpy(attr1, wmap[m][n]->attr, 2);
        }

        /* my self */
        npath[x++] = j - i;

        if (attr1[0] != '\0')
            nweight *= scws_rule_attr_ratio(s->r, attr1, wmap[i][j]->attr, &npath[x-2]);
        memcpy(attr1, wmap[i][j]->attr, 2);

        /* lookfor forward */
        for (m = j+1; m <= t; m = n+1)
        {
            n = _scws_mget_word(s, m, t);
            nweight *= wmap[m][n]->tf;
            npath[x++] = n - m;
            if (n > m)
            {
                nweight *= pow(n-m,4);
                wmap[m][n]->flag |= SCWS_WORD_USED;
            }
            else sz++;

            nweight *= scws_rule_attr_ratio(s->r, attr1, wmap[m][n]->attr, &npath[x-2]);
            memcpy(attr1, wmap[m][n]->attr, 2);
        }

        npath[x] = 0xff;
        /* penalize paths with many pieces (single characters count twice) */
        nweight /= pow(x+sz-1,5);

        /* draw the path for debug */
#ifdef DEBUG
        if (s->mode & SCWS_DEBUG)
        {
            fprintf(stderr, "PATH by keyword = %.*s, (weight=%.4f):\n",
                s->zmap[j].end - s->zmap[i].start, s->txt + s->zmap[i].start, nweight);
            for (x = 0, m = f; (n = npath[x]) != 0xff; x++)
            {
                n += m;
                fprintf(stderr, "%.*s ", s->zmap[n].end - s->zmap[m].start, s->txt + s->zmap[m].start);
                m = n + 1;
            }
            fprintf(stderr, "\n--\n");
        }
#endif

        j2 = x = j;
        /* for a word longer than two characters, retry the same start with a shorter limit */
        if ((x - i) > 1) i--;
        /* check better path */
        if (nweight > weight)
        {
            unsigned char *swap;

            weight = nweight;
            swap = mpath;
            mpath = npath;
            npath = swap;
        }
    }

    /* no usable path: release the scratch path (it used to leak here) */
    if (mpath == NULL)
    {
        free(npath);
        return;
    }

    /* set the result */
    for (x = 0, m = f; (n = mpath[x]) != 0xff; x++)
    {
        n += m;
        _scws_mset_word(s, m, n);
        m = n + 1;
    }

    /* 070808: memory leak fixed. */
    free(mpath);
    free(npath);
}
752
753 /* quick define for zrule_checker in loop */
754 #define ___ZRULE_CHECKER1___ \
755 if (j >= zlen || SCWS_NO_RULE2(wmap[j][j]->flag)) \
756 break;
757
758 #define ___ZRULE_CHECKER2___ \
759 if (j < 0 || SCWS_NO_RULE2(wmap[j][j]->flag)) \
760 break;
761
762 #define ___ZRULE_CHECKER3___ \
763 if (!scws_rule_check(s->r, r1, txt + zmap[j].start, zmap[j].end - zmap[j].start)) \
764 break;
765
_scws_msegment(scws_t s,int end,int zlen)766 static void _scws_msegment(scws_t s, int end, int zlen)
767 {
768 word_t **wmap, query;
769 struct scws_zchar *zmap;
770 unsigned char *txt;
771 #ifdef HAVE_NAME_RULE /* 20150403: Remove rules, just deepend on dictionary */
772 rule_item_t r1;
773 #endif
774 int i, j, k, ch, clen, start;
775 pool_t p;
776
777 /* pool used to management some dynamic memory */
778 p = pool_new();
779
780 /* create wmap & zmap */
781 wmap = s->wmap = (word_t **) darray_new(zlen, zlen, sizeof(word_t));
782 zmap = s->zmap = (struct scws_zchar *) pmalloc(p, zlen * sizeof(struct scws_zchar));
783 txt = s->txt;
784 start = s->off;
785 s->zis = -1;
786
787 for (i = 0; start < end; i++)
788 {
789 ch = txt[start];
790 clen = SCWS_CHARLEN(ch);
791 if (clen == 1)
792 {
793 while (start++ < end)
794 {
795 ch = txt[start];
796 if (start == end || SCWS_CHARLEN(txt[start]) > 1)
797 break;
798 clen++;
799 }
800 wmap[i][i] = (word_t) pmalloc_z(p, sizeof(word_st));
801 wmap[i][i]->tf = 0.5;
802 wmap[i][i]->flag |= SCWS_ZFLAG_ENGLISH;
803 strcpy(wmap[i][i]->attr, SCWS_IS_ALPHA(txt[start-1]) ? attr_en : attr_un);
804 }
805 else
806 {
807 query = xdict_query(s->d, txt + start, clen);
808 wmap[i][i] = (word_t) pmalloc(p, sizeof(word_st));
809 if (query == NULL)
810 {
811 wmap[i][i]->tf = 0.5;
812 wmap[i][i]->idf = 0.0;
813 wmap[i][i]->flag = 0;
814 strcpy(wmap[i][i]->attr, attr_un);
815 }
816 else
817 {
818 ch = query->flag;
819 query->flag = SCWS_WORD_FULL;
820 memcpy(wmap[i][i], query, sizeof(word_st));
821 if (query->attr[0] == '#')
822 wmap[i][i]->flag |= SCWS_ZFLAG_SYMBOL;
823
824 if (ch & SCWS_WORD_MALLOCED)
825 free(query);
826 }
827 start += clen;
828 }
829
830 zmap[i].start = start - clen;
831 zmap[i].end = start;
832 }
833
834 /* fixed real zlength */
835 zlen = i;
836
837 /* create word query table */
838 for (i = 0; i < zlen; i++)
839 {
840 k = 0;
841 for (j = i+1; j < zlen; j++)
842 {
843 query = xdict_query(s->d, txt + zmap[i].start, zmap[j].end - zmap[i].start);
844 if (query == NULL)
845 break;
846 ch = query->flag;
847 if ((ch & SCWS_WORD_FULL) && memcmp(query->attr, attr_na, 2))
848 {
849 wmap[i][j] = (word_t) pmalloc(p, sizeof(word_st));
850 memcpy(wmap[i][j], query, sizeof(word_st));
851
852 wmap[i][i]->flag |= SCWS_ZFLAG_WHEAD;
853
854 for (k = i+1; k <= j; k++)
855 wmap[k][k]->flag |= SCWS_ZFLAG_WPART;
856 }
857
858 if (ch & SCWS_WORD_MALLOCED)
859 free(query);
860
861 if (!(ch & SCWS_WORD_PART))
862 break;
863 }
864
865 if (k--)
866 {
867 /* set nr2 to some short name */
868 if ((k == (i+1)))
869 {
870 if (!memcmp(wmap[i][k]->attr, attr_nr, 2))
871 wmap[i][i]->flag |= SCWS_ZFLAG_NR2;
872 //if (wmap[i][k]->attr[0] == 'n')
873 //wmap[i][i]->flag |= SCWS_ZFLAG_N2;
874 }
875
876 /* clean the PART flag for the last word */
877 if (k < j)
878 wmap[i][k]->flag ^= SCWS_WORD_PART;
879 }
880 }
881
882 if (s->r == NULL)
883 goto do_segment;
884
885 #ifdef HAVE_NAME_RULE /* 20150403: Remove rules, just deepend on dictionary */
886 /* auto rule set for name & zone & chinese numeric */
887
888 /* one word auto rule check */
889 for (i = 0; i < zlen; i++)
890 {
891 if (SCWS_NO_RULE1(wmap[i][i]->flag))
892 continue;
893
894 r1 = scws_rule_get(s->r, txt + zmap[i].start, zmap[i].end - zmap[i].start);
895 if (r1 == NULL)
896 continue;
897
898 clen = r1->zmin > 0 ? r1->zmin : 1;
899 if ((r1->flag & SCWS_ZRULE_PREFIX) && (i < (zlen - clen)))
900 {
901 /* prefix, check after (zmin~zmax) */
902 // 先检查 zmin 字内是否全部符合要求
903 // 再在 zmax 范围内取得符合要求的字
904 // int i, j, k, ch, clen, start;
905 for (ch = 1; ch <= clen; ch++)
906 {
907 j = i + ch;
908 ___ZRULE_CHECKER1___
909 ___ZRULE_CHECKER3___
910 }
911
912 if (ch <= clen)
913 continue;
914
915 /* no limit znum or limit to a range */
916 j = i + ch;
917 while (1)
918 {
919 if ((!r1->zmax && r1->zmin) || (r1->zmax && (clen >= r1->zmax)))
920 break;
921 ___ZRULE_CHECKER1___
922 ___ZRULE_CHECKER3___
923 clen++;
924 j++;
925 }
926
927 // 注意原来2字人名,识别后仍为2字的情况
928 if (wmap[i][i]->flag & SCWS_ZFLAG_NR2)
929 {
930 if (clen == 1)
931 continue;
932 wmap[i][i+1]->flag |= SCWS_WORD_PART;
933 }
934
935 /* ok, got: i & clen */
936 k = i + clen;
937 wmap[i][k] = (word_t) pmalloc(p, sizeof(word_st));
938 wmap[i][k]->tf = r1->tf;
939 wmap[i][k]->idf = r1->idf;
940 wmap[i][k]->flag = (SCWS_WORD_RULE|SCWS_WORD_FULL);
941 strncpy(wmap[i][k]->attr, r1->attr, 2);
942
943 wmap[i][i]->flag |= SCWS_ZFLAG_WHEAD;
944 for (j = i+1; j <= k; j++)
945 wmap[j][j]->flag |= SCWS_ZFLAG_WPART;
946
947 if (!(wmap[i][i]->flag & SCWS_ZFLAG_WPART))
948 i = k;
949
950 continue;
951 }
952
953 if ((r1->flag & SCWS_ZRULE_SUFFIX) && (i >= clen))
954 {
955 /* suffix, check before */
956 for (ch = 1; ch <= clen; ch++)
957 {
958 j = i - ch;
959 ___ZRULE_CHECKER2___
960 ___ZRULE_CHECKER3___
961 }
962
963 if (ch <= clen)
964 continue;
965
966 /* no limit znum or limit to a range */
967 j = i - ch;
968 while (1)
969 {
970 if ((!r1->zmax && r1->zmin) || (r1->zmax && (clen >= r1->zmax)))
971 break;
972 ___ZRULE_CHECKER2___
973 ___ZRULE_CHECKER3___
974 clen++;
975 j--;
976 }
977
978 /* ok, got: i & clen (maybe clen=1 & [k][i] isset) */
979 k = i - clen;
980 if (wmap[k][i] != NULL)
981 continue;
982
983 wmap[k][i] = (word_t) pmalloc(p, sizeof(word_st));
984 wmap[k][i]->tf = r1->tf;
985 wmap[k][i]->idf = r1->idf;
986 wmap[k][i]->flag = SCWS_WORD_FULL;
987 strncpy(wmap[k][i]->attr, r1->attr, 2);
988
989 wmap[k][k]->flag |= SCWS_ZFLAG_WHEAD;
990 for (j = k+1; j <= i; j++)
991 {
992 wmap[j][j]->flag |= SCWS_ZFLAG_WPART;
993 if ((j != i) && (wmap[k][j] != NULL))
994 wmap[k][j]->flag |= SCWS_WORD_PART;
995 }
996 continue;
997 }
998 }
999
1000 /* two words auto rule check (欧阳** , **西路) */
1001 for (i = zlen - 2; i >= 0; i--)
1002 {
1003 /* with value ==> must be have SCWS_WORD_FULL, so needn't check it ag. */
1004 if ((wmap[i][i+1] == NULL) || (wmap[i][i+1]->flag & SCWS_WORD_PART))
1005 continue;
1006
1007 k = i+1;
1008 r1 = scws_rule_get(s->r, txt + zmap[i].start, zmap[k].end - zmap[i].start);
1009 if (r1 == NULL)
1010 continue;
1011
1012 clen = r1->zmin > 0 ? r1->zmin : 1;
1013 if ((r1->flag & SCWS_ZRULE_PREFIX) && (k < (zlen - clen)))
1014 {
1015 for (ch = 1; ch <= clen; ch++)
1016 {
1017 j = k + ch;
1018 ___ZRULE_CHECKER1___
1019 ___ZRULE_CHECKER3___
1020 }
1021
1022 if (ch <= clen)
1023 continue;
1024
1025 /* no limit znum or limit to a range */
1026 j = k + ch;
1027 while (1)
1028 {
1029 if ((!r1->zmax && r1->zmin) || (r1->zmax && (clen >= r1->zmax)))
1030 break;
1031 ___ZRULE_CHECKER1___
1032 ___ZRULE_CHECKER3___
1033 clen++;
1034 j++;
1035 }
1036
1037 /* ok, got: i & clen */
1038 k = k + clen;
1039 wmap[i][k] = (word_t) pmalloc(p, sizeof(word_st));
1040 wmap[i][k]->tf = r1->tf;
1041 wmap[i][k]->idf = r1->idf;
1042 wmap[i][k]->flag = SCWS_WORD_FULL;
1043 strncpy(wmap[i][k]->attr, r1->attr, 2);
1044
1045 wmap[i][i+1]->flag |= SCWS_WORD_PART;
1046 for (j = i+2; j <= k; j++)
1047 wmap[j][j]->flag |= SCWS_ZFLAG_WPART;
1048
1049 i--;
1050 continue;
1051 }
1052
1053 if ((r1->flag & SCWS_ZRULE_SUFFIX) && (i >= clen))
1054 {
1055 /* suffix, check before */
1056 for (ch = 1; ch <= clen; ch++)
1057 {
1058 j = i - ch;
1059 ___ZRULE_CHECKER2___
1060 ___ZRULE_CHECKER3___
1061 }
1062
1063 if (ch <= clen)
1064 continue;
1065
1066 /* no limit znum or limit to a range */
1067 j = i - ch;
1068 while (1)
1069 {
1070 if ((!r1->zmax && r1->zmin) || (r1->zmax && (clen >= r1->zmax)))
1071 break;
1072 ___ZRULE_CHECKER2___
1073 ___ZRULE_CHECKER3___
1074 clen++;
1075 j--;
1076 }
1077
1078 /* ok, got: i & clen (maybe clen=1 & [k][i] isset) */
1079 k = i - clen;
1080 i = i + 1;
1081 wmap[k][i] = (word_t) pmalloc(p, sizeof(word_st));
1082 wmap[k][i]->tf = r1->tf;
1083 wmap[k][i]->idf = r1->idf;
1084 wmap[k][i]->flag = SCWS_WORD_FULL;
1085 strncpy(wmap[k][i]->attr, r1->attr, 2);
1086
1087 wmap[k][k]->flag |= SCWS_ZFLAG_WHEAD;
1088 for (j = k+1; j <= i; j++)
1089 {
1090 wmap[j][j]->flag |= SCWS_ZFLAG_WPART;
1091 if (wmap[k][j] != NULL)
1092 wmap[k][j]->flag |= SCWS_WORD_PART;
1093 }
1094
1095 i -= (clen+1);
1096 continue;
1097 }
1098 }
1099 #endif
1100
1101 /* real do the segment */
1102 do_segment:
1103
1104 /* find the easy break point */
1105 for (i = 0, j = 0; i < zlen; i++)
1106 {
1107 if (wmap[i][i]->flag & SCWS_ZFLAG_WPART)
1108 continue;
1109
1110 if (i > j)
1111 _scws_mseg_zone(s, j, i-1);
1112
1113 j = i;
1114 if (!(wmap[i][i]->flag & SCWS_ZFLAG_WHEAD))
1115 {
1116 _scws_mset_word(s, i, i);
1117 j++;
1118 }
1119 }
1120
1121 /* the lastest zone */
1122 if (i > j)
1123 _scws_mseg_zone(s, j, i-1);
1124
1125 /* the last single for duality */
1126 if ((s->mode & SCWS_DUALITY) && (s->zis >= 0) && !(s->zis & SCWS_ZIS_USED))
1127 {
1128 i = s->zis;
1129 SCWS_PUT_RES(s->zmap[i].start, s->wmap[i][i]->idf, (s->zmap[i].end - s->zmap[i].start), s->wmap[i][i]->attr);
1130 }
1131
1132 /* free the wmap & zmap */
1133 pool_free(p);
1134 darray_free((void **) wmap);
1135 }
1136
scws_get_result(scws_t s)1137 scws_res_t scws_get_result(scws_t s)
1138 {
1139 int off, len, ch, clen, zlen, pflag;
1140 unsigned char *txt;
1141
1142 off = s->off;
1143 len = s->len;
1144 txt = s->txt;
1145 s->res0 = s->res1 = NULL;
1146 while ((off < len) && (txt[off] <= 0x20))
1147 {
1148 if (txt[off] == 0x0a || txt[off] == 0x0d)
1149 {
1150 s->off = off + 1;
1151 SCWS_PUT_RES(off, 0.0, 1, attr_un);
1152 return s->res0;
1153 }
1154 off++;
1155 }
1156
1157 if (off >= len)
1158 return NULL;
1159
1160 /* try to parse the sentence */
1161 s->off = off;
1162 ch = txt[off];
1163 if (SCWS_CHAR_TOKEN(ch) && !(s->mode & SCWS_IGN_SYMBOL))
1164 {
1165 s->off++;
1166 SCWS_PUT_RES(off, 0.0, 1, attr_un);
1167 return s->res0;
1168 }
1169 clen = SCWS_CHARLEN(ch);
1170 zlen = 1;
1171 pflag = (clen > 1 ? PFLAG_WITH_MB : (SCWS_IS_ALNUM(ch) ? PFLAG_ALNUM : 0));
1172 while ((off = (off+clen)) < len)
1173 {
1174 ch = txt[off];
1175 if (ch <= 0x20 || SCWS_CHAR_TOKEN(ch)) break;
1176 clen = SCWS_CHARLEN(ch);
1177 if (!(pflag & PFLAG_WITH_MB))
1178 {
1179 // pure single-byte -> multibyte (2bytes)
1180 if (clen == 1)
1181 {
1182 if (pflag & PFLAG_ALNUM)
1183 {
1184 if (SCWS_IS_ALPHA(ch))
1185 {
1186 if (!(pflag & PFLAG_LONGALPHA) && SCWS_IS_ALPHA(txt[off-1]))
1187 pflag |= PFLAG_LONGALPHA;
1188 }
1189 else if (SCWS_IS_DIGIT(ch))
1190 {
1191 if (!(pflag & PFLAG_LONGDIGIT) && SCWS_IS_DIGIT(txt[off-1]))
1192 pflag |= PFLAG_LONGDIGIT;
1193 }
1194 else
1195 pflag ^= PFLAG_ALNUM;
1196 }
1197 }
1198 else
1199 {
1200 if (!(pflag & PFLAG_ALNUM) || zlen > 2)
1201 break;
1202
1203 pflag |= PFLAG_WITH_MB;
1204 /* zlen = 1; */
1205 }
1206 }
1207 else if ((pflag & PFLAG_WITH_MB) && clen == 1)
1208 {
1209 int i;
1210
1211 // mb + single-byte. allowd: alpha+num + 中文
1212 if (!SCWS_IS_ALNUM(ch))
1213 break;
1214
1215 pflag &= ~PFLAG_VALID;
1216 // 夹在中文间的英文数字最多允许 2 个字符 (超过2可以独立成词没啥问题)
1217 for (i = off+1; i < (off+3); i++)
1218 {
1219 ch = txt[i];
1220 if ((i >= len) || (ch <= 0x20) || (SCWS_CHARLEN(ch) > 1))
1221 {
1222 pflag |= PFLAG_VALID;
1223 break;
1224 }
1225
1226 if (!SCWS_IS_ALNUM(ch))
1227 break;
1228 }
1229
1230 if (!(pflag & PFLAG_VALID))
1231 break;
1232
1233 clen += (i - off - 1);
1234 }
1235 /* hightman.070813: add max zlen limit */
1236 if (++zlen >= SCWS_MAX_ZLEN)
1237 break;
1238 }
1239
1240 /* hightman.070624: 处理半个字的问题 */
1241 if ((ch = off) > len)
1242 off -= clen;
1243
1244 /* do the real segment */
1245 if (off <= s->off)
1246 return NULL;
1247 else if (pflag & PFLAG_WITH_MB)
1248 _scws_msegment(s, off, zlen);
1249 else if (!(pflag & PFLAG_ALNUM) || ((off - s->off) >= SCWS_MAX_EWLEN))
1250 _scws_ssegment(s, off);
1251 else
1252 {
1253 zlen = off - s->off;
1254 if ((pflag & (PFLAG_LONGALPHA|PFLAG_LONGDIGIT)) == (PFLAG_LONGALPHA|PFLAG_LONGDIGIT))
1255 _scws_alnum_multi(s, s->off, zlen);
1256 else
1257 {
1258 float idf;
1259
1260 idf = SCWS_EN_IDF(zlen);
1261 SCWS_PUT_RES(s->off, idf, zlen, attr_en);
1262
1263 /* hightman.090523: 为字母数字混合再度拆解, 纯数字, (>1 ? 纯字母 : 数字+字母) */
1264 if ((s->mode & SCWS_MULTI_DUALITY) && zlen > 2)
1265 _scws_alnum_multi(s, s->off, zlen);
1266 }
1267 }
1268
1269 /* reutrn the result */
1270 s->off = (ch > len ? len : off);
1271 if (s->res0 == NULL)
1272 return scws_get_result(s);
1273
1274 return s->res0;
1275 }
1276
1277 /* free the result retunned by scws_get_result */
scws_free_result(scws_res_t result)1278 void scws_free_result(scws_res_t result)
1279 {
1280 scws_res_t cur;
1281
1282 while ((cur = result) != NULL)
1283 {
1284 result = cur->next;
1285 free(cur);
1286 }
1287 }
1288
1289 /* top words count */
1290 // xattr = ~v,p,c
1291 // xattr = v,pn,c
1292
_tops_cmp(a,b)1293 static int _tops_cmp(a, b)
1294 scws_top_t *a,*b;
1295 {
1296 if ((*b)->weight > (*a)->weight)
1297 return 1;
1298 return -1;
1299 }
1300
/*
 * Store the value of every node of the subtree rooted at `node` into
 * values[], starting at index *start (node first, then left subtree, then
 * right subtree).  Each stored entry gets its `word` set to the node's key.
 * *start is advanced past the entries written.
 */
static void _tops_load_node(node_t node, scws_top_t *values, int *start)
{
    int slot;

    if (node == NULL)
        return;

    slot = *start;
    values[slot] = node->value;
    values[slot]->word = node->key;
    *start = slot + 1;

    _tops_load_node(node->left, values, start);
    _tops_load_node(node->right, values, start);
}
1315
/* flatten every tree of the xtree into values[], filling it from index 0 */
static void _tops_load_all(xtree_t xt, scws_top_t *values)
{
	int bucket = 0;
	int filled = 0;

	while (bucket < xt->prime)
	{
		_tops_load_node(xt->trees[bucket], values, &filled);
		bucket++;
	}
}
1323
/* fixed-size slot holding one attribute name as a NUL-terminated string */
typedef char word_attr[4];

/*
 * Tell whether attribute string `a` is listed in `at`, an array of
 * word_attr slots closed by a slot whose first byte is '\0'.
 * An empty list (the very first slot is already empty) accepts everything.
 * Returns 1 when `a` belongs to the list, 0 otherwise.
 */
static inline int _attr_belong(const char *a, word_attr *at)
{
	word_attr *entry = at;

	if (entry[0][0] == '\0')
		return 1;

	for (; entry[0][0] != '\0'; entry++)
	{
		if (strcmp(a, entry[0]) == 0)
			return 1;
	}
	return 0;
}
1335
/*
 * Macro to parse xattr -> xmode, at.
 *
 * Expands inside a function that declares the locals `xattr` (char *),
 * `xmode` (int), `cnt` (int), `word` (char *) and `at` (word_attr *,
 * initialised to NULL):
 *  - a NULL or empty xattr leaves `at` untouched (no filter);
 *  - a leading '~' is skipped and sets xmode to SCWS_YEA;
 *  - the remaining text is split on ',' and up to two bytes of each piece
 *    are stored in consecutive zero-filled word_attr slots; the list is
 *    closed by an all-zero slot. An empty piece is not skipped: its slot
 *    starts with the ',' itself.
 * After a successful parse `cnt` is the index of the last slot written and
 * the enclosing function owns `at` (release it with free()).
 *
 * The list is sized from the text length: N bytes hold at most N commas,
 * hence N + 1 pieces, plus the closing slot. The former strlen/2 + 2
 * estimate was too small for inputs with empty pieces such as ",,," and
 * let the last store run past the buffer. If the allocation fails `at`
 * stays NULL, so the filter is ignored instead of writing through NULL.
 */
#define __PARSE_XATTR__ do { \
	if (xattr == NULL) break; \
	if (*xattr == '~') { xattr++; xmode = SCWS_YEA; } \
	if (*xattr == '\0') break; \
	at = (word_attr *) calloc(strlen(xattr) + 2, sizeof(word_attr)); \
	if (at == NULL) break; \
	for (cnt = 0; (word = strchr(xattr, ',')) != NULL; cnt++) { \
		at[cnt][0] = *xattr++; \
		at[cnt][1] = (xattr == word) ? '\0' : *xattr; \
		xattr = word + 1; \
	} \
	strncpy(at[cnt], xattr, 2); \
} while (0)
1352
scws_get_tops(scws_t s,int limit,char * xattr)1353 scws_top_t scws_get_tops(scws_t s, int limit, char *xattr)
1354 {
1355 int off, cnt, xmode = SCWS_NA;
1356 xtree_t xt;
1357 scws_res_t res, cur;
1358 scws_top_t top, *list, tail, base;
1359 char *word;
1360 word_attr *at = NULL;
1361
1362 if (!s || !s->txt || !(xt = xtree_new(0,1)))
1363 return NULL;
1364
1365 __PARSE_XATTR__;
1366
1367 // save the offset.
1368 off = s->off;
1369 s->off = cnt = 0;
1370 while ((cur = res = scws_get_result(s)) != NULL)
1371 {
1372 do
1373 {
1374 if (cur->idf < 0.2 || cur->attr[0] == '#')
1375 continue;
1376
1377 /* check attribute filter */
1378 if (at != NULL)
1379 {
1380 if ((xmode == SCWS_NA) && !_attr_belong(cur->attr, at))
1381 continue;
1382
1383 if ((xmode == SCWS_YEA) && _attr_belong(cur->attr, at))
1384 continue;
1385 }
1386
1387 /* check stopwords */
1388 if (!strncmp(cur->attr, attr_en, 2) && cur->len > 6)
1389 {
1390 word = _mem_ndup(s->txt + cur->off, cur->len);
1391 _str_tolower(word, word);
1392 if (SCWS_IS_NOSTATS(word, cur->len))
1393 {
1394 free(word);
1395 continue;
1396 }
1397 free(word);
1398 }
1399
1400 /* put to the stats */
1401 if (!(top = xtree_nget(xt, s->txt + cur->off, cur->len, NULL)))
1402 {
1403 top = (scws_top_t) pmalloc_z(xt->p, sizeof(struct scws_topword));
1404 top->weight = cur->idf;
1405 top->times = 1;
1406 strncpy(top->attr, cur->attr, 2);
1407 xtree_nput(xt, top, sizeof(struct scws_topword), s->txt + cur->off, cur->len);
1408 cnt++;
1409 }
1410 else
1411 {
1412 top->weight += cur->idf;
1413 top->times++;
1414 }
1415 }
1416 while ((cur = cur->next) != NULL);
1417 scws_free_result(res);
1418 }
1419
1420 // free at
1421 if (at != NULL)
1422 free(at);
1423 top = NULL;
1424 if (cnt > 0)
1425 {
1426 /* sort the list */
1427 list = (scws_top_t *) malloc(sizeof(scws_top_t) * cnt);
1428 _tops_load_all(xt, list);
1429 qsort(list, cnt, sizeof(scws_top_t), _tops_cmp);
1430
1431 /* save to return pointer */
1432 if (!limit || limit > cnt)
1433 limit = cnt;
1434
1435 top = tail = (scws_top_t) malloc(sizeof(struct scws_topword));
1436 memcpy(top, list[0], sizeof(struct scws_topword));
1437 top->word = strdup(list[0]->word);
1438 top->next = NULL;
1439
1440 for (cnt = 1; cnt < limit; cnt++)
1441 {
1442 base = (scws_top_t) malloc(sizeof(struct scws_topword));
1443 memcpy(base, list[cnt], sizeof(struct scws_topword));
1444 base->word = strdup(list[cnt]->word);
1445 base->next = NULL;
1446 tail->next = base;
1447 tail = base;
1448 }
1449 free(list);
1450 }
1451
1452 // restore the offset
1453 s->off = off;
1454 xtree_free(xt);
1455 return top;
1456 }
1457
1458 // word check by attr.
scws_has_word(scws_t s,char * xattr)1459 int scws_has_word(scws_t s, char *xattr)
1460 {
1461 int off, cnt, xmode = SCWS_NA;
1462 scws_res_t res, cur;
1463 char *word;
1464 word_attr *at = NULL;
1465
1466 if (!s || !s->txt)
1467 return 0;
1468
1469 __PARSE_XATTR__;
1470
1471 // save the offset. (cnt -> return_value)
1472 off = s->off;
1473 cnt = s->off = 0;
1474 while (!cnt && (cur = res = scws_get_result(s)) != NULL)
1475 {
1476 do
1477 {
1478 /* check attribute filter */
1479 if (at != NULL)
1480 {
1481 if ((xmode == SCWS_NA) && _attr_belong(cur->attr, at))
1482 cnt = 1;
1483
1484 if ((xmode == SCWS_YEA) && !_attr_belong(cur->attr, at))
1485 cnt = 1;
1486 }
1487 }
1488 while (!cnt && (cur = cur->next) != NULL);
1489 scws_free_result(res);
1490 }
1491 // memory leak fixed, thanks to lauxinz
1492 if (at != NULL)
1493 free(at);
1494 s->off = off;
1495 return cnt;
1496 }
1497
1498 // get words by attr (rand order)
scws_get_words(scws_t s,char * xattr)1499 scws_top_t scws_get_words(scws_t s, char *xattr)
1500 {
1501 int off, cnt, xmode = SCWS_NA;
1502 xtree_t xt;
1503 scws_res_t res, cur;
1504 scws_top_t top, tail, base;
1505 char *word;
1506 word_attr *at = NULL;
1507
1508 if (!s || !s->txt || !(xt = xtree_new(0,1)))
1509 return NULL;
1510
1511 __PARSE_XATTR__;
1512
1513 // save the offset.
1514 off = s->off;
1515 s->off = 0;
1516 base = tail = NULL;
1517 while ((cur = res = scws_get_result(s)) != NULL)
1518 {
1519 do
1520 {
1521 /* check attribute filter */
1522 if (at != NULL)
1523 {
1524 if ((xmode == SCWS_NA) && !_attr_belong(cur->attr, at))
1525 continue;
1526
1527 if ((xmode == SCWS_YEA) && _attr_belong(cur->attr, at))
1528 continue;
1529 }
1530
1531 /* put to the stats */
1532 if (!(top = xtree_nget(xt, s->txt + cur->off, cur->len, NULL)))
1533 {
1534 top = (scws_top_t) malloc(sizeof(struct scws_topword));
1535 top->weight = cur->idf;
1536 top->times = 1;
1537 top->next = NULL;
1538 top->word = (char *)_mem_ndup(s->txt + cur->off, cur->len);
1539 strncpy(top->attr, cur->attr, 2);
1540 // add to the chain
1541 if (tail == NULL)
1542 base = tail = top;
1543 else
1544 {
1545 tail->next = top;
1546 tail = top;
1547 }
1548 xtree_nput(xt, top, sizeof(struct scws_topword), s->txt + cur->off, cur->len);
1549 }
1550 else
1551 {
1552 top->weight += cur->idf;
1553 top->times++;
1554 }
1555 }
1556 while ((cur = cur->next) != NULL);
1557 scws_free_result(res);
1558 }
1559
1560 // free at & xtree
1561 if (at != NULL)
1562 free(at);
1563 xtree_free(xt);
1564
1565 // restore the offset
1566 s->off = off;
1567 return base;
1568 }
1569
/* release a chain of top-word nodes together with the word each one holds */
void scws_free_tops(scws_top_t tops)
{
	scws_top_t following;

	for (; tops != NULL; tops = following)
	{
		following = tops->next;
		free(tops->word);	/* a NULL word is fine: free(NULL) does nothing */
		free(tops);
	}
}
1582