1 /*
2 * friso main file implemented the friso main functions.
3 * starts with friso_ in the friso header file "friso.h";
4 *
5 * @author chenxin <chenxin619315@gmail.com>
6 */
7 #include <stdio.h>
8 #include <stdlib.h>
9 #include <string.h>
10 #include <math.h>
11
12 #include "friso_API.h"
13 #include "friso_ctype.h"
14 #include "friso.h"
15
16 //-----------------------------------------------------------------
17 // friso instance about function
18 /* {{{ create a new friso configuration variable.
19 */
friso_new(void)20 FRISO_API friso_t friso_new(void) {
21 friso_t e = (friso_t)FRISO_MALLOC(sizeof(friso_entry));
22 if (e == NULL) {
23 ___ALLOCATION_ERROR___
24 }
25
26 e->dic = NULL;
27 e->charset = FRISO_UTF8; // set default charset UTF8.
28
29 return e;
30 }
31 /* }}} */
32
33 /* {{{ creat a new friso with initialize item from a configuration file.
34 *
35 * @return 1 for successfully and 0 for failed.
36 */
friso_init_from_ifile(friso_t friso,friso_config_t config,fstring __ifile)37 FRISO_API int friso_init_from_ifile(friso_t friso, friso_config_t config, fstring __ifile) {
38 FILE *__stream;
39 char __chars__[256], __key__[128], *__line__;
40 char __lexi__[160], lexpath[160];
41 uint_t i, t, __hit__ = 0, __length__;
42
43 char *slimiter = NULL;
44 uint_t flen = 0;
45
46 // get the base part of the path of the __ifile
47 if ((slimiter = strrchr(__ifile, '/')) != NULL) {
48 flen = slimiter - __ifile + 1;
49 }
50
51 // yat, start to parse the friso.ini configuration file
52 if ((__stream = fopen(__ifile, "rb")) != NULL) {
53 // initialize the entry with the value from the ifile.
54 while ((__line__ = file_get_line(__chars__, __stream)) != NULL) {
55 // comments filter.
56 if (__line__[0] == '#') continue;
57 if (__line__[0] == '\t') continue;
58 if (__line__[0] == ' ' || __line__[0] == '\0') continue;
59
60 __length__ = strlen(__line__);
61 for (i = 0; i < __length__; i++) {
62 if (__line__[i] == ' ' || __line__[i] == '\t' || __line__[i] == '=') {
63 break;
64 }
65 __key__[i] = __line__[i];
66 }
67 __key__[i] = '\0';
68
69 // position the euqals char '='.
70 if (__line__[i] == ' ' || __line__[i] == '\t') {
71 for (i++; i < __length__; i++) {
72 if (__line__[i] == '=') {
73 break;
74 }
75 }
76 }
77
78 // clear the left whitespace of the value.
79 for (i++; i < __length__ && (__line__[i] == ' ' || __line__[i] == '\t'); i++)
80 ;
81 for (t = 0; i < __length__; i++, t++) {
82 if (__line__[i] == ' ' || __line__[i] == '\t') {
83 break;
84 }
85 __line__[t] = __line__[i];
86 }
87 __line__[t] = '\0';
88
89 // printf("key=%s, value=%s\n", __key__, __line__ );
90 if (strcmp(__key__, "friso.lex_dir") == 0) {
91 /*
92 * here copy the value of the lex_dir.
93 * cause we need the value of friso.max_len to finish all
94 * the work when we call function friso_dic_load_from_ifile to
95 * initiliaze the friso dictionary.
96 */
97 if (__hit__ == 0) {
98 __hit__ = t;
99 for (t = 0; t < __hit__; t++) {
100 __lexi__[t] = __line__[t];
101 }
102 __lexi__[t] = '\0';
103 }
104 } else if (strcmp(__key__, "friso.max_len") == 0) {
105 config->max_len = (ushort_t)atoi(__line__);
106 } else if (strcmp(__key__, "friso.r_name") == 0) {
107 config->r_name = (ushort_t)atoi(__line__);
108 } else if (strcmp(__key__, "friso.mix_len") == 0) {
109 config->mix_len = (ushort_t)atoi(__line__);
110 } else if (strcmp(__key__, "friso.lna_len") == 0) {
111 config->lna_len = (ushort_t)atoi(__line__);
112 } else if (strcmp(__key__, "friso.add_syn") == 0) {
113 config->add_syn = (ushort_t)atoi(__line__);
114 } else if (strcmp(__key__, "friso.clr_stw") == 0) {
115 config->clr_stw = (ushort_t)atoi(__line__);
116 } else if (strcmp(__key__, "friso.keep_urec") == 0) {
117 config->keep_urec = (uint_t)atoi(__line__);
118 } else if (strcmp(__key__, "friso.spx_out") == 0) {
119 config->spx_out = (ushort_t)atoi(__line__);
120 } else if (strcmp(__key__, "friso.nthreshold") == 0) {
121 config->nthreshold = atoi(__line__);
122 } else if (strcmp(__key__, "friso.mode") == 0) {
123 // config->mode = ( friso_mode_t ) atoi( __line__ );
124 friso_set_mode(config, (friso_mode_t)atoi(__line__));
125 } else if (strcmp(__key__, "friso.charset") == 0) {
126 friso->charset = (friso_charset_t)atoi(__line__);
127 } else if (strcmp(__key__, "friso.en_sseg") == 0) {
128 config->en_sseg = (ushort_t)atoi(__line__);
129 } else if (strcmp(__key__, "friso.st_minl") == 0) {
130 config->st_minl = (ushort_t)atoi(__line__);
131 } else if (strcmp(__key__, "friso.kpuncs") == 0) {
132 // t is the length of the __line__.
133 memcpy(config->kpuncs, __line__, t);
134 // printf("friso_init_from_ifile#kpuncs: %s\n", config->kpuncs);
135 }
136 }
137
138 /*
139 * intialize the friso dictionary here.
140 * use the setting from the ifile parse above
141 * we copied the value in the __lexi__
142 */
143 if (__hit__ != 0) {
144 // add relative path search support
145 //@added: 2014-05-24
146 // convert the relative path to absolute path base on the path of friso.ini
147 // improved at @date: 2014-10-26
148
149 #ifdef FRISO_WINNT
150 if (__lexi__[1] != ':' && flen != 0) {
151 #else
152 if (__lexi__[0] != '/' && flen != 0) {
153 #endif
154 if ((flen + __hit__) > sizeof(lexpath) - 1) {
155 fprintf(stderr, "[Error]: Buffer is not long enough to hold the final lexicon path");
156 fprintf(stderr, " with a length of {%d} at function friso.c#friso_init_from_ifile",
157 flen + __hit__);
158 return 0;
159 }
160
161 memcpy(lexpath, __ifile, flen);
162 memcpy(lexpath + flen, __lexi__, __hit__ - 1);
163 // count the new length
164 flen = flen + __hit__ - 1;
165 if (lexpath[flen - 1] != '/') lexpath[flen] = '/';
166 lexpath[flen + 1] = '\0';
167 } else {
168 memcpy(lexpath, __lexi__, __hit__);
169 lexpath[__hit__] = '\0';
170 if (lexpath[__hit__ - 1] != '/') {
171 lexpath[__hit__] = '/';
172 lexpath[__hit__ + 1] = '\0';
173 }
174 }
175
176 friso->dic = friso_dic_new();
177 // add charset check for max word length counting
178 friso_dic_load_from_ifile(friso, config, lexpath,
179 config->max_len * (friso->charset == FRISO_UTF8 ? 3 : 2));
180 } else {
181 fprintf(stderr, "[Error]: failed get lexicon path, check lex_dir in friso.ini \n");
182 return 0;
183 }
184
185 fclose(__stream);
186 return 1;
187 }
188
189 return 0;
190 }
191 /* }}} */
192
193 /* {{{ friso free functions.
194 * here we have to free its dictionary.
195 */
196 FRISO_API void friso_free(friso_t friso) {
197 // free the dictionary
198 if (friso->dic != NULL) {
199 friso_dic_free(friso->dic);
200 }
201 FRISO_FREE(friso);
202 }
203 /* }}} */
204
205 /* {{{ set the current split mode
206 * view the friso.h#friso_mode_t
207 */
208 FRISO_API void friso_set_mode(friso_config_t config, friso_mode_t mode) {
209 config->mode = mode;
210
211 switch (config->mode) {
212 case __FRISO_SIMPLE_MODE__:
213 config->next_token = next_mmseg_token;
214 config->next_cjk = next_simple_cjk;
215 break;
216 case __FRISO_DETECT_MODE__:
217 config->next_token = next_detect_token;
218 break;
219 default:
220 config->next_token = next_mmseg_token;
221 config->next_cjk = next_complex_cjk;
222 break;
223 }
224 }
225 /* }}} */
226
227 /* {{{ create a new friso configuration entry and initialize
228 * it with default value.*/
229 FRISO_API friso_config_t friso_new_config(void) {
230 friso_config_t cfg = (friso_config_t)FRISO_MALLOC(sizeof(friso_config_entry));
231 if (cfg == NULL) {
232 ___ALLOCATION_ERROR___;
233 }
234
235 // initialize the configuration entry.
236 friso_init_config(cfg);
237
238 return cfg;
239 }
240 /* }}} */
241
242 /* {{{ initialize the specified friso config entry with default value.*/
243 FRISO_API void friso_init_config(friso_config_t cfg) {
244 cfg->max_len = DEFAULT_SEGMENT_LENGTH;
245 cfg->r_name = 1;
246 cfg->mix_len = DEFAULT_MIX_LENGTH;
247 cfg->lna_len = DEFAULT_LNA_LENGTH;
248 cfg->add_syn = 1;
249 cfg->clr_stw = 0;
250 cfg->keep_urec = 0;
251 cfg->spx_out = 0;
252 cfg->en_sseg = 1; // default start the secondary segmentaion.
253 cfg->st_minl = 1; // min length for secondary split sub token.
254 cfg->nthreshold = DEFAULT_NTHRESHOLD;
255 cfg->mode = (friso_mode_t)DEFAULT_SEGMENT_MODE;
256
257 friso_set_mode(cfg, cfg->mode);
258
259 // Zero fill the kpuncs buffer.
260 memset(cfg->kpuncs, 0x00, sizeof(cfg->kpuncs));
261 }
262 /* }}} */
263
264 /* {{{ create a new segment task entry.
265 */
266 FRISO_API friso_task_t friso_new_task() {
267 friso_task_t task = (friso_task_t)FRISO_MALLOC(sizeof(friso_task_entry));
268 if (task == NULL) {
269 ___ALLOCATION_ERROR___
270 }
271
272 // initliaze the segment.
273 task->text = NULL;
274 task->idx = 0;
275 task->length = 0;
276 task->bytes = 0;
277 task->unicode = 0;
278 task->ctrlMask = 0;
279 task->pool = new_link_list();
280 task->sbuf = new_string_buffer();
281 task->token = friso_new_token();
282
283 return task;
284 }
285 /* }}} */
286
287 /* {{{ free the specified task*/
288 FRISO_API void friso_free_task(friso_task_t task) {
289 // free the allocation of the poll link list.
290 if (task->pool != NULL) {
291 free_link_list(task->pool);
292 }
293
294 // release the allocation of the sbuff string_buffer_t.
295 if (task->sbuf != NULL) {
296 free_string_buffer(task->sbuf);
297 }
298
299 // free the allocations of the token.
300 if (task->token != NULL) {
301 friso_free_token(task->token);
302 }
303
304 FRISO_FREE(task);
305 }
306 /* }}} */
307
308 /* {{{ create a new friso token */
309 FRISO_API friso_token_t friso_new_token(void) {
310 friso_token_t token = (friso_token_t)FRISO_MALLOC(sizeof(friso_token_entry));
311 if (token == NULL) {
312 ___ALLOCATION_ERROR___
313 }
314
315 // initialize
316 token->type = (uchar_t)__LEX_OTHER_WORDS__;
317 token->length = 0;
318 token->rlen = 0;
319 token->pos = '\0';
320 token->offset = -1;
321 memset(token->word, 0x00, __HITS_WORD_LENGTH__);
322
323 return token;
324 }
325 /* }}} */
326
327 /* {{{ set the text of the current segmentation.
328 * that means we could re-use the segment.
329 * also we have to reset the idx and the length of the segmentation.
330 * and the most important one - clear the poll link list.
331 */
332 FRISO_API void friso_set_text(friso_task_t task, fstring text) {
333 task->text = text;
334 task->idx = 0; // reset the index
335 task->length = strlen(text);
336 task->pool = link_list_clear(task->pool); // clear the word poll
337 string_buffer_clear(task->sbuf); // crear the string buffer.
338 }
339 /* }}} */
340
341 //--------------------------------------------------------------------
342 // friso core part 1: simple mode tokenize handler functions
343 /* {{{ read the next word from the current position.
344 *
345 * @return int the bytes of the readed word.
346 */
347 __STATIC_API__ uint_t readNextWord(friso_t friso, // friso instance
348 friso_task_t task, // token task
349 uint_t *idx, // current index.
350 fstring __word) // work buffer.
351 {
352 if (friso->charset == FRISO_UTF8) {
353 //@reader: task->unicode = get_utf8_unicode(task->buffer) is moved insite
354 // function utf8_next_word from friso 1.6.0 .
355 return utf8_next_word(task, idx, __word);
356 } else if (friso->charset == FRISO_GBK) {
357 return gbk_next_word(task, idx, __word);
358 }
359
360 return 0; // unknow charset.
361 }
362 /* }}} */
363
364 /* {{{ get the next cjk word from the current position, with simple mode.
365 */
FRISO_API lex_entry_t next_simple_cjk(friso_t friso, friso_config_t config, friso_task_t task) {
    uint_t t, idx = task->idx, __length__;
    // seed the match buffer with the CJK char already read into task->buffer.
    string_buffer_t sb = new_string_buffer_with_string(task->buffer);
    // NOTE(review): assumes the single CJK char always exists in the CJK
    // dictionary so e is never NULL here — confirm against the lexicon rules.
    lex_entry_t e = friso_dic_get(friso->dic, __LEX_CJK_WORDS__, sb->buffer);

    /*
     * back up the initial e->length here.
     * we will use it to count the task->idx correction below.
     * for the sake of using less variables.
     */
    __length__ = e->length;

    // forward maximum matching: keep appending CJK chars (up to
    // config->max_len in total) and remember the longest dictionary word.
    for (t = 1;
         t < config->max_len && (task->bytes = readNextWord(friso, task, &idx, task->buffer)) != 0;
         t++) {
        if (friso_whitespace(friso->charset, task)) break;
        if (!friso_cn_string(friso->charset, task)) break;

        string_buffer_append(sb, task->buffer);

        // check the existence of the word by searching the dictionary.
        if (friso_dic_match(friso->dic, __LEX_CJK_WORDS__, sb->buffer)) {
            e = friso_dic_get(friso->dic, __LEX_CJK_WORDS__, sb->buffer);
        }
    }

    // correct the offset of the segment: advance the cursor by the extra
    // bytes the final match consumed beyond the first char.
    task->idx += (e->length - __length__);
    free_string_buffer(sb);  // free the buffer

    /*
     * check the stopwords dictionary,
     * make sure the current token is not a stopword.
     * @warning: friso.clr_stw must be open in friso.ini configuration file.
     */
    if (config->clr_stw && friso_dic_match(friso->dic, __LEX_STOPWORDS__, e->word)) {
        return NULL;
    }

    return e;
}
407 /* }}} */
408
409 //-------------------------------------------------------------------
410 // friso core part 2: basic latin handler functions
411 /* {{{ basic latin segmentation*/
/* convert a full-width latin char to its half-width counterpart.
 * sets convert = 1 on conversion so convert_work_apply knows the
 * UTF-8 bytes must be rebuilt from task->unicode afterwards. */
#define convert_full_to_half(friso, task, convert) \
    do { \
        if (friso_fullwidth_en_char(friso->charset, task)) { \
            if (friso->charset == FRISO_UTF8) \
                task->unicode -= 65248; \
            else if (friso->charset == FRISO_GBK) { \
                task->buffer[0] = ((uchar_t)task->buffer[1]) - 128; \
                task->buffer[1] = '\0'; \
            } \
            convert = 1; \
        } \
    } while (0)

/* convert an uppercase char to its lowercase counterpart. */
#define convert_upper_to_lower(friso, task, convert) \
    do { \
        if (friso_uppercase_letter(friso->charset, task)) { \
            if (friso->charset == FRISO_UTF8) task->unicode += 32; \
            /* With the above logic(full to half), \
             * here we just need to check half-width*/ \
            else if (friso->charset == FRISO_GBK) \
                task->buffer[0] = task->buffer[0] + 32; \
            convert = 1; \
        } \
    } while (0)

/* re-encode task->unicode back into task->buffer as UTF-8 bytes,
 * and reset the convert flag. (only needed for FRISO_UTF8 since the
 * GBK conversions above edit task->buffer in place.) */
#define convert_work_apply(friso, task, convert) \
    do { \
        if (convert == 1 && friso->charset == FRISO_UTF8) { \
            memset(task->buffer, 0x00, 7); \
            unicode_to_utf8(task->unicode, task->buffer); \
            convert = 0; \
        } \
    } while (0)
448
449 // get the next latin word from the current position.
__STATIC_API__ lex_entry_t next_basic_latin(friso_t friso, friso_config_t config,
                                            friso_task_t task) {
    int __convert = 0, t = 0, blen = 0;
    // chkecm: a CJK char follows (try english-chinese mixed word).
    // chkunits: single-unit check is still allowed.
    // wspace: the read loop was broken by whitespace.
    int chkecm = 0, chkunits = 1, wspace = 0;

    /* cause friso will convert full-width numeric and letters
     * (Not punctuations) to half-width ones. so, here we need
     * wlen to record the real length of the lex_entry_t.
     * */
    uint_t wlen = task->bytes;
    uint_t idx = task->idx;
    string_buffer_t sb, tmp = NULL;
    lex_entry_t e = NULL;

    // condition controller to start the secondary segmentation.
    int ssseg = 0;
    int fdunits = 0;

    // secondary segmentation state.
    int tcount = 1;  // number of different types of char seen.
    friso_enchar_t _ctype, _TYPE;
    task_ssseg_close(task);

    // full-half width and upper-lower case exchange.
    convert_full_to_half(friso, task, __convert);
    convert_upper_to_lower(friso, task, __convert);
    convert_work_apply(friso, task, __convert);

    // create a new fstring buffer and append the task->buffer inside.
    sb = new_string_buffer_with_string(task->buffer);
    _TYPE = friso_enchar_type(friso->charset, task);

    // segmentation: accumulate latin chars until a boundary is hit.
    while ((task->bytes = readNextWord(friso, task, &idx, task->buffer)) != 0) {
        // convert full-width to half-width.
        convert_full_to_half(friso, task, __convert);
        _ctype = friso_enchar_type(friso->charset, task);

        if (_ctype == FRISO_EN_WHITESPACE) {
            wspace = 1;
            break;
        }

        if (_ctype == FRISO_EN_PUNCTUATION) {
            // clear the full-width punctuations.
            if (task->bytes > 1) break;
            // only configured "keep punctuations" stay inside the token.
            if (!friso_en_kpunc(config, task->buffer[0])) break;
        }

        /* check if is an FRISO_EN_NUMERIC, or FRISO_EN_LETTER.
         * here just need to make sure it is not FRISO_EN_UNKNOW.
         * */
        if (_ctype == FRISO_EN_UNKNOW) {
            if (friso_cn_string(friso->charset, task)) chkecm = 1;
            break;
        }

        // upper-lower case convert.
        convert_upper_to_lower(friso, task, __convert);
        convert_work_apply(friso, task, __convert);

        // cap the token at the word buffer capacity.
        //@Added: 2015-01-16 night
        if ((wlen + task->bytes) >= __HITS_WORD_LENGTH__) {
            break;
        }

        string_buffer_append(sb, task->buffer);
        wlen += task->bytes;
        task->idx += task->bytes;

        /* Char type counter.
         * make the condition to start the secondary segmentation.
         *
         * @TODO: 2013-12-22
         * */
        if (_ctype != _TYPE) {
            tcount++;
            _TYPE = _ctype;
        }
    }

    /*
     * 1. clear the useless english punctuation
     * from the end of the buffer.
     * 2. check the english and punctuation mixed word.
     *
     * set _ctype to as the status for the existence of punctuation
     * at the end of the sb cause we need to plus the tcount
     * to avoid the secondary check for work like 'c+', 'chenxin.'.
     */
    _ctype = 0;
    for (; sb->length > 0 && sb->buffer[sb->length - 1] != '%' &&
           is_en_punctuation(friso->charset, sb->buffer[sb->length - 1]);) {
        // check the english punctuation mixed word (e.g. 'c++').
        if (friso_dic_match(friso->dic, __LEX_ENPUN_WORDS__, sb->buffer)) {
            e = friso_dic_get(friso->dic, __LEX_ENPUN_WORDS__, sb->buffer);
            chkunits = 0;
            break;
        }

        // mark the end of the buffer: strip the trailing punctuation.
        sb->buffer[--sb->length] = '\0';
        wlen--;
        task->idx--;

        /*check and plus the tcount*/
        if (_ctype == 0) {
            tcount--;
            _ctype = 1;
        }
    }

    // check the condition to start the secondary segmentation.
    ssseg = (tcount > 1) && (chkunits == 1);

    // check the tokenize loop is break by whitespace.
    // no need for all the following work if it is.
    //@added 2013-11-19
    if (wspace == 1 || task->idx == task->length) {
        blen = sb->length;
        e = new_lex_entry(string_buffer_devote(sb), NULL, 0, blen, __LEX_OTHER_WORDS__);
        e->rlen = wlen;
        // set the secondary mask.
        if (ssseg) task_ssseg_open(task);
        return e;
    }

    if (chkecm != 1) {
        /*
         * check the single words unit.
         * not only the chinese word but also other kinds of word.
         * so we can recongnize the complex unit like '℉,℃'' eg..
         * @date 2013-10-14
         */
        if (chkunits && (friso_numeric_string(friso->charset, sb->buffer) ||
                         friso_decimal_string(friso->charset, sb->buffer))) {
            idx = task->idx;
            if ((task->bytes = readNextWord(friso, task, &idx, task->buffer)) != 0) {
                // check the EC dictionary: append a following unit char.
                if (friso_dic_match(friso->dic, __LEX_CJK_UNITS__, task->buffer)) {
                    fdunits = 1;
                    string_buffer_append(sb, task->buffer);
                    wlen += task->bytes;
                    task->idx += task->bytes;
                }
            }
        }

        // set the START_SS_MASK.
        if (fdunits != 1 && ssseg) {
            task_ssseg_open(task);
        }

        // create the lexicon entry and return it.
        blen = sb->length;
        e = new_lex_entry(string_buffer_devote(sb), NULL, 0, blen, __LEX_OTHER_WORDS__);
        e->rlen = wlen;

        return e;
    }

    // Try to find an english-chinese mixed word.
    tmp = new_string_buffer_with_string(sb->buffer);
    idx = task->idx;
    for (t = 0;
         t < config->mix_len && (task->bytes = readNextWord(friso, task, &idx, task->buffer)) != 0;
         t++) {
        // whitespace check (instead of a CJK-only check) so that more
        // complex mixed words can be found here, not only english-chinese.
        //@date 2013-10-14
        if (friso_whitespace(friso->charset, task)) {
            break;
        }

        string_buffer_append(tmp, task->buffer);

        // check the mixed word dictionary.
        if (friso_dic_match(friso->dic, __LEX_ECM_WORDS__, tmp->buffer)) {
            e = friso_dic_get(friso->dic, __LEX_ECM_WORDS__, tmp->buffer);
        }
    }

    free_string_buffer(tmp);

    /* e is not NULL does't mean it must be EC mixed word.
     * it could be an english and punctuation mixed word, like 'c++'
     * But we don't need to check and set the START_SS_MASK mask here.
     * */
    if (e != NULL) {
        task->idx += (e->length - sb->length);
        free_string_buffer(sb);
        return e;
    }

    // no match for mix word, try to find a single unit.
    if (chkunits && (friso_numeric_string(friso->charset, sb->buffer) ||
                     friso_decimal_string(friso->charset, sb->buffer))) {
        idx = task->idx;
        if ((task->bytes = readNextWord(friso, task, &idx, task->buffer)) != 0) {
            // check the single chinese units dictionary.
            if (friso_dic_match(friso->dic, __LEX_CJK_UNITS__, task->buffer)) {
                fdunits = 1;
                string_buffer_append(sb, task->buffer);
                wlen += task->bytes;
                task->idx += task->bytes;
            }
        }
    }

    // set the START_SS_MASK.
    if (fdunits != 1 && ssseg) {
        task_ssseg_open(task);
    }

    // create the lexicon entry and return it.
    blen = sb->length;
    e = new_lex_entry(string_buffer_devote(sb), NULL, 0, blen, __LEX_OTHER_WORDS__);
    e->rlen = wlen;

    return e;
}
677 /* }}} */
678
679 //-------------------------------------------------------------------
680 // friso core part 3: mmseg tokenize implements functions
681 // mmseg algorithm implemented functions - start
682
683 /* {{{ get the next match from the current position,
684 * throught the dictionary this will return all the matchs.
685 *
686 * @return friso_array_t that contains all the matchs.
687 */
688 __STATIC_API__ friso_array_t get_next_match(friso_t friso, friso_config_t config, friso_task_t task,
689 uint_t idx) {
690 register uint_t t;
691 string_buffer_t sb = new_string_buffer_with_string(task->buffer);
692
693 // create a match dynamic array.
694 friso_array_t match = new_array_list_with_opacity(config->max_len);
695 array_list_add(match, friso_dic_get(friso->dic, __LEX_CJK_WORDS__, task->buffer));
696
697 for (t = 1;
698 t < config->max_len && (task->bytes = readNextWord(friso, task, &idx, task->buffer)) != 0;
699 t++) {
700 if (friso_whitespace(friso->charset, task)) break;
701 if (!friso_cn_string(friso->charset, task)) break;
702
703 // append the task->buffer to the buffer.
704 string_buffer_append(sb, task->buffer);
705
706 // check the CJK dictionary.
707 if (friso_dic_match(friso->dic, __LEX_CJK_WORDS__, sb->buffer)) {
708 /*
709 * add the lex_entry_t insite.
710 * here is a key point:
711 * we use friso_dic_get function
712 * to get the address of the lex_entry_cdt
713 * that store in the dictionary,
714 * not create a new lex_entry_cdt.
715 * so :
716 * 1.we will not bother to the allocations of
717 * the newly created lex_entry_cdt.
718 * 2.more efficient of course.
719 */
720 array_list_add(match, friso_dic_get(friso->dic, __LEX_CJK_WORDS__, sb->buffer));
721 }
722 }
723
724 /*buffer allocations clear*/
725 free_string_buffer(sb);
726 // array_list_trim( match );
727
728 return match;
729 }
730 /* }}} */
731
/* {{{ chunk for mmseg defines and functions to handle them.*/
typedef struct {
    friso_array_t words;         // the lex_entry_t items that make up the chunk.
    uint_t length;               // sum of the lengths of all the words.
    float average_word_length;   // lazily counted by count_chunk_avl (-1 = not counted yet).
    float word_length_variance;  // lazily counted by count_chunk_var (-1 = not counted yet).
    float single_word_dmf;       // lazily counted by count_chunk_mdf (-1 = not counted yet).
} friso_chunk_entry;
typedef friso_chunk_entry *friso_chunk_t;
/* }}} */
742
743 /* {{{ create a new chunks*/
744 __STATIC_API__ friso_chunk_t new_chunk(friso_array_t words, uint_t length) {
745 friso_chunk_t chunk = (friso_chunk_t)FRISO_MALLOC(sizeof(friso_chunk_entry));
746 if (chunk == NULL) {
747 ___ALLOCATION_ERROR___
748 }
749
750 chunk->words = words;
751 chunk->length = length;
752 chunk->average_word_length = -1;
753 chunk->word_length_variance = -1;
754 chunk->single_word_dmf = -1;
755
756 return chunk;
757 }
758 /* }}} */
759
/* {{{ free the specified chunk.
 * @note: this only releases the chunk struct itself — the caller
 * owns chunk->words and must free it (free_array_list) separately.
 */
__STATIC_API__ void free_chunk(friso_chunk_t chunk) {
    FRISO_FREE(chunk);
}
/* }}} */
765
766 /* {{{ a static function to count the average word length
767 * of the given chunk.
768 */
769 __STATIC_API__ float count_chunk_avl(friso_chunk_t chunk) {
770 chunk->average_word_length = ((float)chunk->length) / chunk->words->length;
771 return chunk->average_word_length;
772 }
773 /* }}} */
774
775 /* {{{ a static function to count the word length variance
776 * of the given chunk.
777 */
778 __STATIC_API__ float count_chunk_var(friso_chunk_t chunk) {
779 float var = 0, tmp = 0; // snapshot
780 register uint_t t;
781 lex_entry_t e;
782
783 for (t = 0; t < chunk->words->length; t++) {
784 e = (lex_entry_t)chunk->words->items[t];
785 tmp = e->length - chunk->average_word_length;
786 var += tmp * tmp;
787 }
788
789 chunk->word_length_variance = var / chunk->words->length;
790
791 return chunk->word_length_variance;
792 }
793 /* }}} */
794
795 /* {{{ a static function to count the single word morpheme degree of freedom
796 * of the given chunk.
797 */
798 __STATIC_API__ float count_chunk_mdf(friso_chunk_t chunk) {
799 float __mdf__ = 0;
800 register uint_t t;
801 lex_entry_t e;
802
803 for (t = 0; t < chunk->words->length; t++) {
804 e = (lex_entry_t)chunk->words->items[t];
805 // single CJK(UTF-8)/chinese(GBK) word.
806 // better add a charset check here, but this will works find.
807 // all CJK words will take 3 bytes with UTF-8 encoding.
808 // all chinese words take 2 bytes with GBK encoding.
809 if (e->length == 3 || e->length == 2) {
810 __mdf__ += (float)log((float)e->fre);
811 }
812 }
813 chunk->single_word_dmf = __mdf__;
814
815 return chunk->single_word_dmf;
816 }
817 /* }}} */
818
819 /* {{{ chunk printer - use for for debug*/
/* chunk printer - use for debug only.
 * expects locals in the calling scope: uint_t t, j; friso_array_t __tmp__;
 * @fixed: removed the dangling line-continuation backslash after the last
 * statement — it silently pulled the following source line into the macro. */
#define ___CHUNK_PRINTER___(_chunks_)                               \
    for (t = 0; t < _chunks_->length; t++) {                        \
        __tmp__ = ((friso_chunk_t)_chunks_->items[t])->words;       \
        for (j = 0; j < __tmp__->length; j++) {                     \
            printf("%s/ ", ((lex_entry_t)__tmp__->items[j])->word); \
        }                                                           \
        putchar('\n');                                              \
    }                                                               \
    putchar('\n');
/* }}} */
830
831 /* {{{ mmseg algorithm core invoke
832 * here,
833 * we use four rules to filter all the chunks to get the best chunk.
834 * and this is the core of the mmseg alogrithm.
835 * 1. maximum match word length.
836 * 2. larget average word length.
837 * 3. smallest word length variance.
838 * 4. largest single word morpheme degrees of freedom.
839 */
__STATIC_API__ friso_chunk_t mmseg_core_invoke(friso_array_t chunks) {
    register uint_t t /*, j*/;
    float max;
    friso_chunk_t e;
    friso_array_t __res__, __tmp__;
    __res__ = new_array_list_with_opacity(chunks->length);

    // 1.get the maximum matched chunks.
    // count the maximum length
    max = (float)((friso_chunk_t)chunks->items[0])->length;
    for (t = 1; t < chunks->length; t++) {
        e = (friso_chunk_t)chunks->items[t];
        if (e->length > max) max = (float)e->length;
    }
    // get the chunk items that owns the maximum length;
    // the eliminated chunks (and their word lists) are freed right away.
    for (t = 0; t < chunks->length; t++) {
        e = (friso_chunk_t)chunks->items[t];
        if (e->length >= max) {
            array_list_add(__res__, e);
        } else {
            free_array_list(e->words);
            free_chunk(e);
        }
    }
    // check the left chunks
    if (__res__->length == 1) {
        e = (friso_chunk_t)__res__->items[0];
        free_array_list(__res__);
        free_array_list(chunks);
        return e;
    } else {
        // swap the two lists: chunks now holds the survivors and the
        // cleared __res__ is recycled as the result list of the next rule.
        __tmp__ = array_list_clear(chunks);
        chunks = __res__;
        __res__ = __tmp__;
    }

    // 2.get the largest average word length chunks.
    // count the maximum average word length.
    max = count_chunk_avl((friso_chunk_t)chunks->items[0]);
    for (t = 1; t < chunks->length; t++) {
        e = (friso_chunk_t)chunks->items[t];
        if (count_chunk_avl(e) > max) {
            max = e->average_word_length;
        }
    }
    // get the chunks items that own the largest average word length.
    for (t = 0; t < chunks->length; t++) {
        e = (friso_chunk_t)chunks->items[t];
        if (e->average_word_length >= max) {
            array_list_add(__res__, e);
        } else {
            free_array_list(e->words);
            free_chunk(e);
        }
    }
    // check the left chunks
    if (__res__->length == 1) {
        e = (friso_chunk_t)__res__->items[0];
        free_array_list(__res__);
        free_array_list(chunks);
        return e;
    } else {
        __tmp__ = array_list_clear(chunks);
        chunks = __res__;
        __res__ = __tmp__;
    }

    // 3.get the smallest word length variance chunks
    // count the smallest word length variance
    // (note: `max` actually tracks the minimum in this rule).
    max = count_chunk_var((friso_chunk_t)chunks->items[0]);
    for (t = 1; t < chunks->length; t++) {
        e = (friso_chunk_t)chunks->items[t];
        if (count_chunk_var(e) < max) {
            max = e->word_length_variance;
        }
    }
    // get the chunks that own the smallest word length variance.
    for (t = 0; t < chunks->length; t++) {
        e = (friso_chunk_t)chunks->items[t];
        if (e->word_length_variance <= max) {
            array_list_add(__res__, e);
        } else {
            free_array_list(e->words);
            free_chunk(e);
        }
    }
    // check the left chunks
    if (__res__->length == 1) {
        e = (friso_chunk_t)__res__->items[0];
        free_array_list(chunks);
        free_array_list(__res__);
        return e;
    } else {
        __tmp__ = array_list_clear(chunks);
        chunks = __res__;
        __res__ = __tmp__;
    }

    // 4.get the largest single word morpheme degrees of freedom.
    // count the maximum single word morpheme degreees of freedom
    max = count_chunk_mdf((friso_chunk_t)chunks->items[0]);
    for (t = 1; t < chunks->length; t++) {
        e = (friso_chunk_t)chunks->items[t];
        if (count_chunk_mdf(e) > max) {
            max = e->single_word_dmf;
        }
    }
    // get the chunks that own the largest single word word morpheme degrees of freedom.
    for (t = 0; t < chunks->length; t++) {
        e = (friso_chunk_t)chunks->items[t];
        if (e->single_word_dmf >= max) {
            array_list_add(__res__, e);
        } else {
            free_array_list(e->words);
            free_chunk(e);
        }
    }

    /*
     * there is still more than one chunks?
     * well, this rarely happen but still happens.
     * here we simple return the first chunk as the final result,
     * and we need to free the all the chunks that __res__
     * points to except the 1th one.
     * you have to do two things to totaly free a chunk:
     * 1. call free_array_list to free the allocations of a chunk's words.
     * 2. call free_chunk to the free the allocations of a chunk.
     */
    for (t = 1; t < __res__->length; t++) {
        e = (friso_chunk_t)__res__->items[t];
        free_array_list(e->words);
        free_chunk(e);
    }

    e = (friso_chunk_t)__res__->items[0];
    free_array_list(chunks);
    free_array_list(__res__);

    return e;
}
980 /* }}} */
981
982 /* {{{ get the next cjk word from the current position with complex mode.
983 * this is the core of the mmseg chinese word segemetation algorithm.
984 * we use four rules to filter the matched chunks and get the best one
985 * as the final result.
986 *
987 * @see mmseg_core_invoke( chunks );
988 */
989 FRISO_API lex_entry_t next_complex_cjk(friso_t friso, friso_config_t config, friso_task_t task) {
990 register uint_t x, y, z;
991 /*bakup the task->bytes here*/
992 uint_t __idx__ = task->bytes;
993 lex_entry_t fe, se, te;
994 friso_chunk_t e;
995 friso_array_t words, chunks;
996 friso_array_t smatch, tmatch, fmatch = get_next_match(friso, config, task, task->idx);
997
998 /*
999 * here:
1000 * if the length of the fmatch is 1, mean we don't have to
1001 * continue the following work. ( no matter what we get the same result. )
1002 */
1003 if (fmatch->length == 1) {
1004 fe = ((lex_entry_t)fmatch->items[0]);
1005 free_array_list(fmatch);
1006
1007 /*
1008 * check and clear the stop words .
1009 * @date 2013-06-13
1010 */
1011 if (config->clr_stw && friso_dic_match(friso->dic, __LEX_STOPWORDS__, fe->word)) {
1012 return NULL;
1013 }
1014
1015 return fe;
1016 }
1017
1018 chunks = new_array_list();
1019 task->idx -= __idx__;
1020
1021 for (x = 0; x < fmatch->length; x++) {
1022 /*get the word and try the second layer match*/
1023 fe = (lex_entry_t)array_list_get(fmatch, x);
1024 __idx__ = task->idx + fe->length;
1025 readNextWord(friso, task, &__idx__, task->buffer);
1026
1027 if (task->bytes != 0 && friso_cn_string(friso->charset, task) &&
1028 friso_dic_match(friso->dic, __LEX_CJK_WORDS__, task->buffer)) {
1029 // get the next matchs
1030 smatch = get_next_match(friso, config, task, __idx__);
1031 for (y = 0; y < smatch->length; y++) {
1032 /*get the word and try the third layer match*/
1033 se = (lex_entry_t)array_list_get(smatch, y);
1034 __idx__ = task->idx + fe->length + se->length;
1035 readNextWord(friso, task, &__idx__, task->buffer);
1036
1037 if (task->bytes != 0 && friso_cn_string(friso->charset, task) &&
1038 friso_dic_match(friso->dic, __LEX_CJK_WORDS__, task->buffer)) {
1039 // get the matchs.
1040 tmatch = get_next_match(friso, config, task, __idx__);
1041 for (z = 0; z < tmatch->length; z++) {
1042 te = (lex_entry_t)array_list_get(tmatch, z);
1043 words = new_array_list_with_opacity(3);
1044 array_list_add(words, fe);
1045 array_list_add(words, se);
1046 array_list_add(words, te);
1047 array_list_add(chunks, new_chunk(words, fe->length + se->length + te->length));
1048 }
1049 // free the third matched array list
1050 free_array_list(tmatch);
1051 } else {
1052 words = new_array_list_with_opacity(2);
1053 array_list_add(words, fe);
1054 array_list_add(words, se);
1055 // add the chunk
1056 array_list_add(chunks, new_chunk(words, fe->length + se->length));
1057 }
1058 }
1059 // free the second match array list
1060 free_array_list(smatch);
1061 } else {
1062 words = new_array_list_with_opacity(1);
1063 array_list_add(words, fe);
1064 array_list_add(chunks, new_chunk(words, fe->length));
1065 }
1066 }
1067 // free the first match array list
1068 free_array_list(fmatch);
1069
1070 /*
1071 * filter the chunks with the four rules of the mmseg algorithm
1072 * and get best chunk as the final result.
1073 *
1074 * @see mmseg_core_invoke( chunks );
1075 * @date 2012-12-13
1076 */
1077 if (chunks->length > 1) {
1078 e = mmseg_core_invoke(chunks);
1079 } else {
1080 e = (friso_chunk_t)chunks->items[0];
1081 }
1082
1083 fe = (lex_entry_t)e->words->items[0];
1084 task->idx += fe->length; // reset the idx of the task.
1085 free_array_list(e->words); // free the chunks words allocation
1086 free_chunk(e);
1087
1088 // clear the stop words
1089 if (config->clr_stw && friso_dic_match(friso->dic, __LEX_STOPWORDS__, fe->word)) {
1090 return NULL;
1091 }
1092
1093 return fe;
1094 }
1095 /* }}} */
1096 //----------------end of mmseg core
1097
1098 //-------------------------------------------------------------------------------------
1099 // mmseg core logic controller, output style controller and macro defines
1100 /* {{{ A macro function to check and free
1101 * the lex_entry_t with type of __LEX_OTHER_WORDS__.
1102 */
/* Free a lex_entry_t ONLY when it was newly created (type
 * __LEX_OTHER_WORDS__); dictionary-owned entries are left alone.
 * fix: parenthesize the macro argument so non-trivial expressions
 * bind correctly (CERT PRE01-C). */
#define check_free_otlex_entry(lex)               \
    do {                                          \
        if ((lex)->type == __LEX_OTHER_WORDS__) { \
            FRISO_FREE((lex)->word);              \
            free_lex_entry((lex));                \
        }                                         \
    } while (0)
1110 /* }}} */
1111
1112 /* {{{ sphinx style output synonyms words append.
1113 *
1114 * @param task
1115 * @param lex
1116 * */
1117 __STATIC_API__ void token_sphinx_output(friso_task_t task, lex_entry_t lex) {
1118 uint_t i, j, len;
1119 fstring _word;
1120 len = lex->length;
1121
1122 // append the synoyums words.
1123 for (i = 0; i < lex->syn->length; i++) {
1124 _word = (fstring)lex->syn->items[i];
1125 j = strlen(_word);
1126 if ((len + j + 1) >= __HITS_WORD_LENGTH__) break;
1127 memcpy(task->token->word + len, "|", 1);
1128 len += 1;
1129 memcpy(task->token->word + len, _word, j);
1130 len += j;
1131 }
1132
1133 // set the new end of the buffer.
1134 task->token->word[len] = '\0';
1135 }
1136 /* }}} */
1137
1138 /* {{{ normal style output synonyms words append.
1139 *
1140 * @param task
1141 * @param lex
1142 * @param front 1 for add the synoyum words from the head and
1143 * 0 for append from the tail.
1144 * */
1145 __STATIC_API__ void token_normal_output(friso_task_t task, lex_entry_t lex, int front) {
1146 uint_t i;
1147 fstring _word;
1148 lex_entry_t e;
1149
1150 for (i = 0; i < lex->syn->length; i++) {
1151 _word = (fstring)lex->syn->items[i];
1152 e = new_lex_entry(_word, NULL, 0, strlen(_word), __LEX_NCSYN_WORDS__);
1153 e->offset = lex->offset;
1154 // add to the buffer.
1155 if (front) {
1156 link_list_add_first(task->pool, e);
1157 } else {
1158 link_list_add(task->pool, e);
1159 }
1160 }
1161 }
1162 /* }}} */
1163
1164 /* {{{ do the secondary segmentation of the complex english token.
1165 *
1166 * @param friso
1167 * @param config
1168 * @param task
1169 * @param lex
1170 * @param retfw -Wether to return the first word.
1171 * @return lex_entry_t(NULL or the first sub token of the lex)
1172 */
__STATIC_API__ lex_entry_t en_second_seg(friso_t friso, friso_config_t config, friso_task_t task,
                                         lex_entry_t lex, int retfw) {
    // printf("sseg: %d\n", (task->ctrlMask & START_SS_MASK));

    // p counts whitespace chars skipped inside the current char-type run.
    int j, p = 0, start = 0;
    fstring str = lex->word;

    // fword: the first sub word (returned when retfw != 0); sword: current sub word.
    lex_entry_t fword = NULL, sword = NULL;

    // _TYPE tracks the char type of the run being accumulated in task->sbuf.
    int _ctype, _TYPE = get_enchar_type(str[0]);
    string_buffer_clear(task->sbuf);
    string_buffer_append_char(task->sbuf, str[0]);

    // walk the word and cut it wherever the char type changes
    // (e.g. letters -> digits), producing sub tokens like 'qq', '2013'.
    for (j = 1; j < lex->length; j++) {
        // get the type of the char
        _ctype = get_enchar_type(str[j]);
        if (_ctype == FRISO_EN_WHITESPACE) {
            _TYPE = FRISO_EN_WHITESPACE;
            p++;
            continue;
        }

        if (_ctype == _TYPE) {
            string_buffer_append_char(task->sbuf, str[j]);
        } else {
            // offset of the finished run inside lex->word.
            start = j - task->sbuf->length - p;

            /* If the number of chars of current type
             * is larger than config->st_minl then we will
             * create a new lex_entry_t and append it to the task->wordPool.
             * */
            if (task->sbuf->length >= config->st_minl &&
                !(config->clr_stw &&
                  friso_dic_match(friso->dic, __LEX_STOPWORDS__, task->sbuf->buffer))) {
                /* the allocation of lex_entry_t and its word
                 * should be released and the type of the lex_entry_t
                 * must be __LEX_OTHER_WORDS__.
                 * */
                sword = new_lex_entry(rm_strdup(task->sbuf->buffer), NULL, 0, task->sbuf->length,
                                      __LEX_OTHER_WORDS__);
                sword->offset = lex->offset + start;
                if (retfw && fword == NULL) {
                    fword = sword;
                } else {
                    link_list_add(task->pool, sword);
                }
            }

            // start a new run with the current char.
            string_buffer_clear(task->sbuf);
            string_buffer_append_char(task->sbuf, str[j]);
            p = 0;
            _TYPE = _ctype;
        }
    }

    // continue to check the last item.
    if (task->sbuf->length >= config->st_minl &&
        !(config->clr_stw && friso_dic_match(friso->dic, __LEX_STOPWORDS__, task->sbuf->buffer))) {
        // NOTE(review): 'start' computed here is never used, and the offset
        // below omits the 'lex->offset +' base used in the loop above —
        // looks like it should be 'lex->offset + start'; confirm against
        // upstream behavior before changing.
        start = j - task->sbuf->length;
        sword = new_lex_entry(rm_strdup(task->sbuf->buffer), NULL, 0, task->sbuf->length,
                              __LEX_OTHER_WORDS__);
        sword->offset = j - task->sbuf->length;
        if (retfw && fword == NULL) {
            fword = sword;
        } else {
            link_list_add(task->pool, sword);
        }
    }

    return fword;
}
1244 /*}}}*/
1245
1246 /* {{{ english synoyums words check and append macro define.*/
/* english synonyms check-and-append.
 * looks lex->word up in the english dictionary and, when the entry has
 * synonyms, emits them either sphinx style (inline into the token word)
 * or normal style (as pool entries; front != 0 prepends them).
 * NOTE: relies on 'friso', 'config' and 'task' being in scope at the
 * call site. fix: parenthesize the macro arguments (CERT PRE01-C). */
#define append_en_syn(lex, tmp, front)                                                      \
    do {                                                                                    \
        if (((tmp) = friso_dic_get(friso->dic, __LEX_EN_WORDS__, (lex)->word)) != NULL &&   \
            ((tmp)->syn) != NULL) {                                                         \
            if (config->spx_out == 1)                                                       \
                token_sphinx_output(task, (tmp));                                           \
            else {                                                                          \
                (tmp)->offset = (lex)->offset;                                              \
                token_normal_output(task, (tmp), (front));                                  \
            }                                                                               \
        }                                                                                   \
    } while (0)
1259 /* }}} */
1260
/* {{{ get the next segmentation token.
 * this is the main friso interface function.
 *
 * @param friso.
 * @param config.
 * @param task.
 * @return friso_token_t the next token, or NULL at the end of the text.
 */
FRISO_API friso_token_t next_mmseg_token(friso_t friso, friso_config_t config, friso_task_t task) {
    uint_t j, len = 0;
    string_buffer_t sb = NULL;
    lex_entry_t lex = NULL, tmp = NULL, sword = NULL;

    /* {{{ task word pool check */
    if (!link_list_empty(task->pool)) {
        /*
         * load word from the word poll if it is not empty.
         * this will make the next word more convenient and efficient.
         * often synonyms, newly created word will be stored in the poll.
         */
        lex = (lex_entry_t)link_list_remove_first(task->pool);
        memcpy(task->token->word, lex->word, lex->length);
        task->token->type = lex->type;
        task->token->length = lex->length;
        task->token->rlen = lex->rlen;
        task->token->offset = lex->offset;
        task->token->word[lex->length] = '\0';

        /* check and handle the english synonyms words append mask.
         * Also we have to close the mask after finish the operation.
         *
         * 1. we've check the config->add_syn before open the
         *     _LEX_APPENSYN_MASK mask.
         * 2. we should add the synonyms words of the curren
         *     lex_entry_t from the head.
         *
         * @since: 1.6.0
         * */
        if (lex_appensyn_check(lex)) {
            lex_appensyn_close(lex);
            append_en_syn(lex, tmp, 1);
        }

        /*
         * __LEX_NCSYN_WORDS__:
         * these lex_entry_t was created to store the the synonyums words.
         * and its word pointed to the lex_entry_t's synonyms word of
         *     friso->dic, so :
         * free the lex_entry_t but not its word here.
         *
         * __LEX_OTHER_WORDS__:
         * newly created lexicon entry, like the chinese and english mixed word.
         * during the invoke of function next_basic_latin.
         *
         * other type:
         * they must exist in the dictionary, so just pass them.
         */
        switch (lex->type) {
            case __LEX_OTHER_WORDS__:
                FRISO_FREE(lex->word);
                free_lex_entry(lex);
                break;
            case __LEX_NCSYN_WORDS__:
                free_lex_entry(lex);
                break;
        }

        return task->token;
    }
    /* }}} */

    while (task->idx < task->length) {
        // read the next word from the current position.
        task->bytes = readNextWord(friso, task, &task->idx, task->buffer);
        if (task->bytes == 0) break;

        // clear up the whitespace.
        if (friso_whitespace(friso->charset, task)) continue;

        /* {{{ CJK words recognize block. */
        if (friso_cn_string(friso->charset, task)) {
            /* check the dictionary.
             * and return the unrecognized CJK char as a single word.
             * NOTE(review): the unmatched char is tagged __LEX_PUNC_WORDS__ —
             * presumably a deliberate reuse of the punctuation type; verify.
             * */
            if (!friso_dic_match(friso->dic, __LEX_CJK_WORDS__, task->buffer)) {
                memcpy(task->token->word, task->buffer, task->bytes);
                task->token->type = __LEX_PUNC_WORDS__;
                task->token->length = task->bytes;
                task->token->rlen = task->bytes;
                task->token->offset = task->idx - task->bytes;
                task->token->word[(int)task->bytes] = '\0';
                return task->token;
            }

            // specifield mode split.
            // if ( config->mode == __FRISO_COMPLEX_MODE__ )
            //      lex = next_complex_cjk( friso, config, task );
            // else lex = next_simple_cjk( friso, config, task );
            lex = config->next_cjk(friso, config, task);

            if (lex == NULL) continue; // found a stopword.
            lex->offset = task->idx - lex->rlen;

            /*
             * try to find a chinese and english mixed words, like '卡拉ok'
             * keep in mind that is not english and chinese mixed words
             *     like 'x射线'.
             *
             * @reader:
             * 1. only if the char after the current word is an english char.
             * 2. if the first point meet, friso will call next_basic_latin() to
             *     get the next basic latin. (yeah, you have to handle it).
             * 3. if match a CE word, set lex to the newly match CE word.
             * 4. if no match a CE word, we will have to append the basic latin
             *     to the pool, and it should after the append of synonyms words.
             * 5. do not use the task->buffer and task->unicode as the check
             *     condition for the CE word identify.
             * 6. Add friso_numeric_letter check so can get work like '高3'
             *
             * @date 2013-09-02
             */
            if ((task->idx < task->length) && ((int)task->text[task->idx]) > 0 &&
                (friso_en_letter(friso->charset, task) || friso_numeric_letter(friso->charset, task))) {
                // create a string buffer
                sb = new_string_buffer_with_string(lex->word);

                // find the next basic latin.
                task->buffer[0] = task->text[task->idx++];
                task->buffer[1] = '\0';
                tmp = next_basic_latin(friso, config, task);
                tmp->offset = task->idx - tmp->length;
                string_buffer_append(sb, tmp->word);

                // check the CE dictionary.
                if (friso_dic_match(friso->dic, __LEX_CEM_WORDS__, sb->buffer)) {
                    j = lex->offset; // backup the offset.
                    lex = friso_dic_get(friso->dic, __LEX_CEM_WORDS__, sb->buffer);
                    lex->offset = j;
                    check_free_otlex_entry(tmp);
                    free_string_buffer(sb);
                    // tmp/sb set to NULL so the no-CE-word branch below is skipped.
                    tmp = NULL;
                    sb = NULL;
                }
            }

            /*
             * copy the lex_entry to the result token
             *
             * @reader: (bloody lesson, added 2013-08-31):
             * don't bother to handle the task->token->offset problem.
             * it has been solved perfectly above.
             */
            len = (int)lex->length;
            memcpy(task->token->word, lex->word, lex->length);
            task->token->type = lex->type;
            task->token->length = lex->length;
            task->token->rlen = lex->rlen;
            task->token->offset = lex->offset;
            task->token->word[len] = '\0';

            // check and append the synonyms words
            if (config->add_syn && lex->syn != NULL) {
                if (config->spx_out == 1) {
                    token_sphinx_output(task, lex);
                } else {
                    token_normal_output(task, lex, 0);
                }
            }

            /* {{{ here: handle the newly found basic latin created when
             * we try to find a CE word.
             *
             * @reader:
             * when tmp is not NULL and sb will not be NULL too
             *     except a CE word is found.
             *
             * @TODO: finished append the synonyms words on 2013-12-19.
             */
            if (tmp != NULL && sb != NULL) {
                // check the secondary split.
                if (config->en_sseg == 1 && task_ssseg_check(task)) {
                    en_second_seg(friso, config, task, tmp, 0);
                }

                // tmp's ownership moves to the pool; only the buffer is freed.
                free_string_buffer(sb);
                link_list_add(task->pool, tmp);

                // check if append synonyms words.
                if (config->add_syn == 1) {
                    lex_appensyn_open(tmp);
                }
            }
            /* }}} */

            return task->token;
        }
        /* }}} */

        /* {{{ basic english/latin recognize block. */
        else if (friso_halfwidth_en_char(friso->charset, task) ||
                 friso_fullwidth_en_char(friso->charset, task)) {
            /*
             * handle the english punctuation.
             *
             * @todo:
             * 1. comment all the code of the following if
             *     and uncomment the continue to clear up the punctuation directly.
             *
             * @reader:
             * 2. keep in mind that ALL the english punctuation will be handled here,
             * (when a english punctuation is found during the other process, we will
             *     reset the task->idx back to it and then back here)
             *     except the keep punctuation(define in file friso_string.c)
             *     that will make up a word with the english chars around it.
             */
            if (friso_en_punctuation(friso->charset, task)) {
                if (config->clr_stw && friso_dic_match(friso->dic, __LEX_STOPWORDS__, task->buffer)) {
                    continue;
                }

                // count the punctuation in.
                task->token->word[0] = task->buffer[0];
                task->token->type = __LEX_PUNC_WORDS__;
                task->token->length = task->bytes;
                task->token->rlen = task->bytes;
                task->token->offset = task->idx - task->bytes;
                task->token->word[1] = '\0';
                return task->token;

                // continue
            }

            // get the next basic latin word.
            lex = next_basic_latin(friso, config, task);
            lex->offset = task->idx - lex->rlen;

            /* @added: 2013-12-22
             * check and do the secondary segmentation work.
             * this will split 'qq2013' to 'qq, 2013'
             * */
            sword = NULL;
            if (config->en_sseg == 1 && task_ssseg_check(task)) {
                sword = en_second_seg(friso, config, task, lex, 1);
            }

            // check if it is a stopword.
            if (config->clr_stw && friso_dic_match(friso->dic, __LEX_STOPWORDS__, lex->word)) {
                // free the newly created lexicon entry.
                check_free_otlex_entry(lex);
                if (sword == NULL) continue;
                lex = sword;
            } else if (sword != NULL) {
                if (config->add_syn == 1) lex_appensyn_open(lex);
                link_list_add(task->pool, lex);

                /* If the sub token is not NULL:
                 * add the lex to the task->pool if it is not NULL
                 * and return the sub token instead of lex so
                 * the sub tokens will be output ahead of lex.
                 * */
                lex = sword;
            }

            // if the token is longer than __HITS_WORD_LENGTH__, drop it
            // copy the word to the task token buffer.
            // if ( lex->length >= __HITS_WORD_LENGTH__ ) continue;
            memcpy(task->token->word, lex->word, lex->length);
            task->token->type = lex->type;
            task->token->length = lex->length;
            task->token->rlen = lex->rlen;
            task->token->offset = lex->offset;
            task->token->word[lex->length] = '\0';

            /* If sword is NULL, continue to check and append
             * the synonyms words for the current lex_entry_t.
             * */
            if (sword == NULL && config->add_syn == 1) {
                append_en_syn(lex, tmp, 0);
            }

            // free the newly created lex_entry_t
            check_free_otlex_entry(lex);

            return task->token;
        }
        /* }}} */

        /* {{{ Keep the chinese punctuation.
         * @added 2013-08-31) */
        else if (friso_cn_punctuation(friso->charset, task)) {
            if (config->clr_stw && friso_dic_match(friso->dic, __LEX_STOPWORDS__, task->buffer)) {
                continue;
            }

            // count the punctuation in.
            memcpy(task->token->word, task->buffer, task->bytes);
            task->token->type = __LEX_PUNC_WORDS__;
            task->token->length = task->bytes;
            task->token->offset = task->idx - task->bytes;
            task->token->word[task->bytes] = '\0';
            return task->token;
        }
        /* }}} */
        // else if ( friso_letter_number( friso->charset, task ) )
        //{
        //}
        // else if ( friso_other_number( friso->charset, task ) )
        //{
        //}

        /* {{{ keep the unrecognized words?
        //@date 2013-10-14 */
        else if (config->keep_urec) {
            memcpy(task->token->word, task->buffer, task->bytes);
            task->token->type = __LEX_UNKNOW_WORDS__;
            task->token->length = task->bytes;
            task->token->offset = task->idx - task->bytes;
            task->token->word[task->bytes] = '\0';
            return task->token;
        }
        /* }}} */
    }

    // end of the text: nothing more to tokenize.
    return NULL;
}
1585 /* }}} */
1586
1587 //----------------------------------------------------------------------
1588 // detect core logic controller: detect tokenize mode handler functions
/** {{{ get the next split token with detect mode.
 * detect mode will only return the words in the dictionary
 * with a simple forward maximum matching algorithm.
 */
FRISO_API friso_token_t next_detect_token(friso_t friso, friso_config_t config, friso_task_t task) {
    lex_entry_t lex = NULL;
    // tbytes: total bytes scanned forward in the current attempt;
    // wbytes: byte length of the longest dictionary match found so far.
    // __convert: flag used by the convert_* macros — presumably records
    //   whether a char was converted; TODO confirm in friso.h.
    int i, __convert = 0, tbytes, wbytes;

    /* {{{ task word pool check */
    if (!link_list_empty(task->pool)) {
        /*
         * load word from the word poll if it is not empty.
         * this will make the next word more convenient and efficient.
         * often synonyms, newly created word will be stored in the poll.
         */
        lex = (lex_entry_t)link_list_remove_first(task->pool);
        memcpy(task->token->word, lex->word, lex->length);
        task->token->type = lex->type;
        task->token->length = lex->length;
        task->token->rlen = lex->rlen;
        task->token->offset = lex->offset;
        task->token->word[lex->length] = '\0';

        /*
         * __LEX_NCSYN_WORDS__:
         * these lex_entry_t was created to store the the synonyums words.
         * and its word pointed to the lex_entry_t's synonyms word of
         *     friso->dic, so :
         * free the lex_entry_t but not its word here.
         */
        if (lex->type == __LEX_NCSYN_WORDS__) {
            free_lex_entry(lex);
        }

        return task->token;
    }
    /* }}} */

    while (task->idx < task->length) {
        lex = NULL;

        // read the next word from the current position.
        task->bytes = readNextWord(friso, task, &task->idx, task->buffer);
        if (task->bytes == 0) break;

        // clear up the whitespace.
        if (friso_whitespace(friso->charset, task)) continue;

        // convert full-width to half-width
        // and uppercase to lowercase for english chars
        wbytes = 0;
        tbytes = task->bytes;
        convert_full_to_half(friso, task, __convert);
        convert_upper_to_lower(friso, task, __convert);
        convert_work_apply(friso, task, __convert);

        // seed the candidate buffer with the first char and check it alone.
        string_buffer_clear(task->sbuf);
        string_buffer_append(task->sbuf, task->buffer);
        if (friso_dic_match(friso->dic, __LEX_CJK_WORDS__, task->sbuf->buffer)) {
            lex = friso_dic_get(friso->dic, __LEX_CJK_WORDS__, task->sbuf->buffer);
            wbytes = tbytes;
        }

        // forward maximum matching: extend the candidate up to
        // config->max_len chars, remembering the longest match.
        for (i = 1; i < config->max_len; i++) {
            task->bytes = readNextWord(friso, task, &task->idx, task->buffer);
            if (task->bytes == 0) break;

            // convert full-width to half-width
            // and uppercase to lowercase for english chars
            tbytes += task->bytes;
            convert_full_to_half(friso, task, __convert);
            convert_upper_to_lower(friso, task, __convert);
            convert_work_apply(friso, task, __convert);
            string_buffer_append(task->sbuf, task->buffer);

            if (friso_dic_match(friso->dic, __LEX_CJK_WORDS__, task->sbuf->buffer)) {
                lex = friso_dic_get(friso->dic, __LEX_CJK_WORDS__, task->sbuf->buffer);
                wbytes = tbytes;
            }
        }

        /*
         * matches no word in the dictionary
         * reset the task->idx to the correct value
         * NOTE(review): this rolls back all but ONE byte, so the next
         * attempt starts one byte past the failed position — for a
         * multi-byte char that is mid-character; confirm readNextWord
         * re-synchronizes before changing anything here.
         */
        if (lex == NULL) {
            task->idx -= (tbytes - 1);
            continue;
        }

        // yat, matched an item and take it to initialize the returning token
        // also we need to push back the none-matched part by reset the task->idx
        task->idx -= (tbytes - wbytes);

        memcpy(task->token->word, lex->word, lex->length);
        task->token->type = __LEX_CJK_WORDS__;
        task->token->length = lex->length;
        task->token->rlen = wbytes;
        task->token->offset = task->idx - wbytes;
        task->token->word[(int)lex->length] = '\0';

        // check and append the synonyms words
        if (config->add_syn && lex->syn != NULL) {
            if (config->spx_out == 1) {
                token_sphinx_output(task, lex);
            } else {
                token_normal_output(task, lex, 0);
            }
        }

        return task->token;
    }

    // end of the text: nothing more to tokenize.
    return NULL;
}
1704 /* }}} */
1705