/*
 * friso main file: implements the friso main functions
 *         that start with friso_ and are declared in the friso header file "friso.h".
 *
 * @author    chenxin <chenxin619315@gmail.com>
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>

#include "friso_API.h"
#include "friso_ctype.h"
#include "friso.h"

//-----------------------------------------------------------------
// friso instance related functions
/* {{{ create a new friso instance.
 */
FRISO_API friso_t friso_new(void) {
  friso_t e = (friso_t)FRISO_MALLOC(sizeof(friso_entry));
  if (e == NULL) {
    ___ALLOCATION_ERROR___
  }

  e->dic = NULL;
  e->charset = FRISO_UTF8;  // set default charset UTF8.

  return e;
}
/* }}} */

/* {{{ create a new friso instance and initialize it from a configuration file.
 *
 * @return 1 on success and 0 on failure.
 */
FRISO_API int friso_init_from_ifile(friso_t friso, friso_config_t config, fstring __ifile) {
  FILE *__stream;
  char __chars__[256], __key__[128], *__line__;
  char __lexi__[160], lexpath[160];
  uint_t i, t, __hit__ = 0, __length__;

  char *slimiter = NULL;
  uint_t flen = 0;

  // get the base part of the path of the __ifile
  if ((slimiter = strrchr(__ifile, '/')) != NULL) {
    flen = slimiter - __ifile + 1;
  }

  // now, start to parse the friso.ini configuration file
  if ((__stream = fopen(__ifile, "rb")) != NULL) {
    // initialize the entry with the values from the ifile.
    while ((__line__ = file_get_line(__chars__, __stream)) != NULL) {
      // comments filter.
      if (__line__[0] == '#') continue;
      if (__line__[0] == '\t') continue;
      if (__line__[0] == ' ' || __line__[0] == '\0') continue;

      __length__ = strlen(__line__);
      for (i = 0; i < __length__; i++) {
        if (__line__[i] == ' ' || __line__[i] == '\t' || __line__[i] == '=') {
          break;
        }
        __key__[i] = __line__[i];
      }
      __key__[i] = '\0';

      // locate the equals char '='.
      if (__line__[i] == ' ' || __line__[i] == '\t') {
        for (i++; i < __length__; i++) {
          if (__line__[i] == '=') {
            break;
          }
        }
      }

      // trim the leading whitespace of the value.
      for (i++; i < __length__ && (__line__[i] == ' ' || __line__[i] == '\t'); i++)
        ;
      for (t = 0; i < __length__; i++, t++) {
        if (__line__[i] == ' ' || __line__[i] == '\t') {
          break;
        }
        __line__[t] = __line__[i];
      }
      __line__[t] = '\0';

      // printf("key=%s, value=%s\n", __key__, __line__ );
      if (strcmp(__key__, "friso.lex_dir") == 0) {
        /*
         * copy the value of lex_dir here,
         *        because we need the value of friso.max_len to finish all
         *    the work when we call function friso_dic_load_from_ifile to
         *    initialize the friso dictionary.
         */
        if (__hit__ == 0) {
          __hit__ = t;
          for (t = 0; t < __hit__; t++) {
            __lexi__[t] = __line__[t];
          }
          __lexi__[t] = '\0';
        }
      } else if (strcmp(__key__, "friso.max_len") == 0) {
        config->max_len = (ushort_t)atoi(__line__);
      } else if (strcmp(__key__, "friso.r_name") == 0) {
        config->r_name = (ushort_t)atoi(__line__);
      } else if (strcmp(__key__, "friso.mix_len") == 0) {
        config->mix_len = (ushort_t)atoi(__line__);
      } else if (strcmp(__key__, "friso.lna_len") == 0) {
        config->lna_len = (ushort_t)atoi(__line__);
      } else if (strcmp(__key__, "friso.add_syn") == 0) {
        config->add_syn = (ushort_t)atoi(__line__);
      } else if (strcmp(__key__, "friso.clr_stw") == 0) {
        config->clr_stw = (ushort_t)atoi(__line__);
      } else if (strcmp(__key__, "friso.keep_urec") == 0) {
        config->keep_urec = (uint_t)atoi(__line__);
      } else if (strcmp(__key__, "friso.spx_out") == 0) {
        config->spx_out = (ushort_t)atoi(__line__);
      } else if (strcmp(__key__, "friso.nthreshold") == 0) {
        config->nthreshold = atoi(__line__);
      } else if (strcmp(__key__, "friso.mode") == 0) {
        // config->mode = ( friso_mode_t ) atoi( __line__ );
        friso_set_mode(config, (friso_mode_t)atoi(__line__));
      } else if (strcmp(__key__, "friso.charset") == 0) {
        friso->charset = (friso_charset_t)atoi(__line__);
      } else if (strcmp(__key__, "friso.en_sseg") == 0) {
        config->en_sseg = (ushort_t)atoi(__line__);
      } else if (strcmp(__key__, "friso.st_minl") == 0) {
        config->st_minl = (ushort_t)atoi(__line__);
      } else if (strcmp(__key__, "friso.kpuncs") == 0) {
        // t is the length of the __line__.
        memcpy(config->kpuncs, __line__, t);
        // printf("friso_init_from_ifile#kpuncs: %s\n", config->kpuncs);
      }
    }

    /*
     * initialize the friso dictionary here,
     *        using the settings parsed from the ifile above
     *    (we copied the lex_dir value into __lexi__).
     */
    if (__hit__ != 0) {
      // add relative path search support
      //@added: 2014-05-24
      // convert the relative path to an absolute path based on the path of friso.ini
      // improved at @date: 2014-10-26

#ifdef FRISO_WINNT
      if (__lexi__[1] != ':' && flen != 0) {
#else
      if (__lexi__[0] != '/' && flen != 0) {
#endif
        if ((flen + __hit__) > sizeof(lexpath) - 1) {
          fprintf(stderr, "[Error]: Buffer is not long enough to hold the final lexicon path");
          fprintf(stderr, " with a length of {%d} at function friso.c#friso_init_from_ifile",
                  flen + __hit__);
          return 0;
        }

        memcpy(lexpath, __ifile, flen);
        memcpy(lexpath + flen, __lexi__, __hit__ - 1);
        // count the new length
        flen = flen + __hit__ - 1;
        if (lexpath[flen - 1] != '/') lexpath[flen] = '/';
        lexpath[flen + 1] = '\0';
      } else {
        memcpy(lexpath, __lexi__, __hit__);
        lexpath[__hit__] = '\0';
        if (lexpath[__hit__ - 1] != '/') {
          lexpath[__hit__] = '/';
          lexpath[__hit__ + 1] = '\0';
        }
      }

      friso->dic = friso_dic_new();
      // add charset check for max word length counting
      friso_dic_load_from_ifile(friso, config, lexpath,
                                config->max_len * (friso->charset == FRISO_UTF8 ? 3 : 2));
    } else {
      fprintf(stderr, "[Error]: failed to get the lexicon path, check lex_dir in friso.ini \n");
      return 0;
    }

    fclose(__stream);
    return 1;
  }

  return 0;
}
/* }}} */
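
/* {{{ an illustrative friso.ini sketch for the parser above.
 * The key names are exactly the ones handled by friso_init_from_ifile; the
 * values and the lexicon path below are only placeholders/assumed examples,
 * not authoritative defaults.
 *
 *   # friso configuration sketch (values are illustrative)
 *   friso.lex_dir = ./dict/UTF-8/
 *   friso.max_len = 5
 *   friso.r_name = 1
 *   friso.mix_len = 2
 *   friso.lna_len = 5
 *   friso.add_syn = 1
 *   friso.clr_stw = 0
 *   friso.keep_urec = 0
 *   friso.spx_out = 0
 *   friso.nthreshold = 1000000
 *   friso.mode = 2
 *   friso.charset = 0
 *   friso.en_sseg = 1
 *   friso.st_minl = 1
 *   friso.kpuncs = @#%
 * }}} */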

/* {{{ friso free functions.
 * here we have to free its dictionary.
 */
FRISO_API void friso_free(friso_t friso) {
  // free the dictionary
  if (friso->dic != NULL) {
    friso_dic_free(friso->dic);
  }
  FRISO_FREE(friso);
}
/* }}} */

/* {{{ set the current split mode.
 *    see friso.h#friso_mode_t
 */
FRISO_API void friso_set_mode(friso_config_t config, friso_mode_t mode) {
  config->mode = mode;

  switch (config->mode) {
    case __FRISO_SIMPLE_MODE__:
      config->next_token = next_mmseg_token;
      config->next_cjk = next_simple_cjk;
      break;
    case __FRISO_DETECT_MODE__:
      config->next_token = next_detect_token;
      break;
    default:
      config->next_token = next_mmseg_token;
      config->next_cjk = next_complex_cjk;
      break;
  }
}
/* }}} */

/* {{{ create a new friso configuration entry and initialize
 * it with default values.*/
FRISO_API friso_config_t friso_new_config(void) {
  friso_config_t cfg = (friso_config_t)FRISO_MALLOC(sizeof(friso_config_entry));
  if (cfg == NULL) {
    ___ALLOCATION_ERROR___;
  }

  // initialize the configuration entry.
  friso_init_config(cfg);

  return cfg;
}
/* }}} */

/* {{{ initialize the specified friso config entry with default values.*/
FRISO_API void friso_init_config(friso_config_t cfg) {
  cfg->max_len = DEFAULT_SEGMENT_LENGTH;
  cfg->r_name = 1;
  cfg->mix_len = DEFAULT_MIX_LENGTH;
  cfg->lna_len = DEFAULT_LNA_LENGTH;
  cfg->add_syn = 1;
  cfg->clr_stw = 0;
  cfg->keep_urec = 0;
  cfg->spx_out = 0;
  cfg->en_sseg = 1;  // enable the secondary segmentation by default.
  cfg->st_minl = 1;  // min length for a secondary split sub token.
  cfg->nthreshold = DEFAULT_NTHRESHOLD;
  cfg->mode = (friso_mode_t)DEFAULT_SEGMENT_MODE;

  friso_set_mode(cfg, cfg->mode);

  // Zero fill the kpuncs buffer.
  memset(cfg->kpuncs, 0x00, sizeof(cfg->kpuncs));
}
/* }}} */

/* {{{ create a new segment task entry.
 */
FRISO_API friso_task_t friso_new_task() {
  friso_task_t task = (friso_task_t)FRISO_MALLOC(sizeof(friso_task_entry));
  if (task == NULL) {
    ___ALLOCATION_ERROR___
  }

  // initialize the task.
  task->text = NULL;
  task->idx = 0;
  task->length = 0;
  task->bytes = 0;
  task->unicode = 0;
  task->ctrlMask = 0;
  task->pool = new_link_list();
  task->sbuf = new_string_buffer();
  task->token = friso_new_token();

  return task;
}
/* }}} */

/* {{{ free the specified task*/
FRISO_API void friso_free_task(friso_task_t task) {
  // free the allocation of the pool link list.
  if (task->pool != NULL) {
    free_link_list(task->pool);
  }

  // release the allocation of the sbuf string_buffer_t.
  if (task->sbuf != NULL) {
    free_string_buffer(task->sbuf);
  }

  // free the allocations of the token.
  if (task->token != NULL) {
    friso_free_token(task->token);
  }

  FRISO_FREE(task);
}
/* }}} */

/* {{{ create a new friso token */
FRISO_API friso_token_t friso_new_token(void) {
  friso_token_t token = (friso_token_t)FRISO_MALLOC(sizeof(friso_token_entry));
  if (token == NULL) {
    ___ALLOCATION_ERROR___
  }

  // initialize
  token->type = (uchar_t)__LEX_OTHER_WORDS__;
  token->length = 0;
  token->rlen = 0;
  token->pos = '\0';
  token->offset = -1;
  memset(token->word, 0x00, __HITS_WORD_LENGTH__);

  return token;
}
/* }}} */

/* {{{ set the text for the current segmentation task.
 *        this means we can re-use the task:
 *    we have to reset the idx and the length of the task,
 * and, most importantly, clear the pool link list.
 */
FRISO_API void friso_set_text(friso_task_t task, fstring text) {
  task->text = text;
  task->idx = 0;  // reset the index
  task->length = strlen(text);
  task->pool = link_list_clear(task->pool);  // clear the word pool
  string_buffer_clear(task->sbuf);           // clear the string buffer.
}
/* }}} */
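
/* {{{ a minimal, non-authoritative usage sketch of the functions above.
 * It only relies on functions defined in this file; the "friso.ini" path and
 * the input text are placeholders, error handling is omitted, and the final
 * FRISO_FREE(config) is an assumption (it is taken to pair with the
 * FRISO_MALLOC done in friso_new_config).
 *
 *   friso_t friso = friso_new();
 *   friso_config_t config = friso_new_config();
 *   if (friso_init_from_ifile(friso, config, "friso.ini") != 1) {
 *     // handle the initialization failure here.
 *   }
 *
 *   friso_task_t task = friso_new_task();
 *   friso_set_text(task, "a placeholder text to tokenize");
 *
 *   // config->next_token was bound by friso_set_mode and returns
 *   // NULL when the whole text has been consumed.
 *   friso_token_t token;
 *   while ((token = config->next_token(friso, config, task)) != NULL) {
 *     printf("%s[%d]\n", token->word, token->offset);
 *   }
 *
 *   friso_free_task(task);
 *   FRISO_FREE(config);
 *   friso_free(friso);
 * }}} */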

//--------------------------------------------------------------------
// friso core part 1: simple mode tokenize handler functions
/* {{{ read the next word from the current position.
 *
 * @return    int the number of bytes of the word that was read.
 */
__STATIC_API__ uint_t readNextWord(friso_t friso,      // friso instance
                                   friso_task_t task,  // token task
                                   uint_t *idx,        // current index.
                                   fstring __word)     // work buffer.
{
  if (friso->charset == FRISO_UTF8) {
    //@reader: task->unicode = get_utf8_unicode(task->buffer) was moved inside
    //    function utf8_next_word since friso 1.6.0.
    return utf8_next_word(task, idx, __word);
  } else if (friso->charset == FRISO_GBK) {
    return gbk_next_word(task, idx, __word);
  }

  return 0;  // unknown charset.
}
/* }}} */

/* {{{ get the next cjk word from the current position, with simple mode.
 */
FRISO_API lex_entry_t next_simple_cjk(friso_t friso, friso_config_t config, friso_task_t task) {
  uint_t t, idx = task->idx, __length__;
  string_buffer_t sb = new_string_buffer_with_string(task->buffer);
  lex_entry_t e = friso_dic_get(friso->dic, __LEX_CJK_WORDS__, sb->buffer);

  /*
   * back up the e->length here;
   *        we will use it later to correct the task->idx.
   */
  __length__ = e->length;

  for (t = 1;
       t < config->max_len && (task->bytes = readNextWord(friso, task, &idx, task->buffer)) != 0;
       t++) {
    if (friso_whitespace(friso->charset, task)) break;
    if (!friso_cn_string(friso->charset, task)) break;

    string_buffer_append(sb, task->buffer);

    // check the existence of the word by searching the dictionary.
    if (friso_dic_match(friso->dic, __LEX_CJK_WORDS__, sb->buffer)) {
      e = friso_dic_get(friso->dic, __LEX_CJK_WORDS__, sb->buffer);
    }
  }

  // correct the offset of the segment.
  task->idx += (e->length - __length__);
  free_string_buffer(sb);  // free the buffer

  /*
   * check the stopwords dictionary to
   *     make sure the current token is not a stopword.
   * @warning: friso.clr_stw must be enabled in the friso.ini configuration file.
   */
  if (config->clr_stw && friso_dic_match(friso->dic, __LEX_STOPWORDS__, e->word)) {
    return NULL;
  }

  return e;
}
/* }}} */

//-------------------------------------------------------------------
// friso core part 2: basic latin handler functions
/* {{{ basic latin segmentation*/
/* convert full-width char to half-width */
#define convert_full_to_half(friso, task, convert)          \
  do {                                                      \
    if (friso_fullwidth_en_char(friso->charset, task)) {    \
      if (friso->charset == FRISO_UTF8)                     \
        task->unicode -= 65248;                             \
      else if (friso->charset == FRISO_GBK) {               \
        task->buffer[0] = ((uchar_t)task->buffer[1]) - 128; \
        task->buffer[1] = '\0';                             \
      }                                                     \
      convert = 1;                                          \
    }                                                       \
  } while (0)

/* convert uppercase char to lowercase char */
#define convert_upper_to_lower(friso, task, convert)         \
  do {                                                       \
    if (friso_uppercase_letter(friso->charset, task)) {      \
      if (friso->charset == FRISO_UTF8) task->unicode += 32; \
      /* With the above logic (full to half),                \
       * here we just need to check half-width*/             \
      else if (friso->charset == FRISO_GBK)                  \
        task->buffer[0] = task->buffer[0] + 32;              \
      convert = 1;                                           \
    }                                                        \
  } while (0)

/* convert the unicode to utf-8 bytes. (FRISO_UTF8) */
#define convert_work_apply(friso, task, convert)        \
  do {                                                  \
    if (convert == 1 && friso->charset == FRISO_UTF8) { \
      memset(task->buffer, 0x00, 7);                    \
      unicode_to_utf8(task->unicode, task->buffer);     \
      convert = 0;                                      \
    }                                                   \
  } while (0)
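
/* A short worked example of the conversions above (UTF-8 path): the
 * full-width letter 'Ａ' is U+FF21 (65313); 65313 - 65248 = 65, which is the
 * half-width 'A' (U+0041). convert_upper_to_lower then adds 32, giving
 * 97 = 'a' (U+0061), and convert_work_apply re-encodes the code point back
 * into task->buffer as UTF-8 bytes. */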

// get the next latin word from the current position.
__STATIC_API__ lex_entry_t next_basic_latin(friso_t friso, friso_config_t config,
                                            friso_task_t task) {
  int __convert = 0, t = 0, blen = 0;
  int chkecm = 0, chkunits = 1, wspace = 0;

  /* because friso will convert full-width numerics and letters
   *     (not punctuations) to half-width ones, here we need
   * wlen to record the real length of the lex_entry_t.
   * */
  uint_t wlen = task->bytes;
  uint_t idx = task->idx;
  string_buffer_t sb, tmp = NULL;
  lex_entry_t e = NULL;

  // condition controller to start the secondary segmentation.
  int ssseg = 0;
  int fdunits = 0;

  // secondary segmentation.
  int tcount = 1;  // number of different types of char.
  friso_enchar_t _ctype, _TYPE;
  task_ssseg_close(task);

  // full-half width and upper-lower case exchange.
  convert_full_to_half(friso, task, __convert);
  convert_upper_to_lower(friso, task, __convert);
  convert_work_apply(friso, task, __convert);

  // create a new fstring buffer and append the task->buffer inside.
  sb = new_string_buffer_with_string(task->buffer);
  _TYPE = friso_enchar_type(friso->charset, task);

  // segmentation.
  while ((task->bytes = readNextWord(friso, task, &idx, task->buffer)) != 0) {
    // convert full-width to half-width.
    convert_full_to_half(friso, task, __convert);
    _ctype = friso_enchar_type(friso->charset, task);

    if (_ctype == FRISO_EN_WHITESPACE) {
      wspace = 1;
      break;
    }

    if (_ctype == FRISO_EN_PUNCTUATION) {
      // clear the full-width punctuations.
      if (task->bytes > 1) break;
      if (!friso_en_kpunc(config, task->buffer[0])) break;
    }

    /* check if it is an FRISO_EN_NUMERIC, or FRISO_EN_LETTER.
     *     here we just need to make sure it is not FRISO_EN_UNKNOW.
     * */
    if (_ctype == FRISO_EN_UNKNOW) {
      if (friso_cn_string(friso->charset, task)) chkecm = 1;
      break;
    }

    // upper-lower case convert
    convert_upper_to_lower(friso, task, __convert);
    convert_work_apply(friso, task, __convert);

    // sounds a little crazy, but the length of this was not limited before.
    //@Added: 2015-01-16 night
    if ((wlen + task->bytes) >= __HITS_WORD_LENGTH__) {
      break;
    }

    string_buffer_append(sb, task->buffer);
    wlen += task->bytes;
    task->idx += task->bytes;

    /* Char type counter:
     *     builds the condition used to start the secondary segmentation.
     *
     * @TODO: 2013-12-22
     * */
    if (_ctype != _TYPE) {
      tcount++;
      _TYPE = _ctype;
    }
  }

  /*
   * 1. clear the useless english punctuation
   *         from the end of the buffer.
   * 2. check for an english and punctuation mixed word.
   *
   * _ctype is reused as a status flag for the existence of punctuation
   *     at the end of sb, because we need to adjust tcount
   *     to avoid the secondary check for words like 'c+', 'chenxin.'.
   */
  _ctype = 0;
  for (; sb->length > 0 && sb->buffer[sb->length - 1] != '%' &&
         is_en_punctuation(friso->charset, sb->buffer[sb->length - 1]);) {
    // check the english punctuation mixed word.
    if (friso_dic_match(friso->dic, __LEX_ENPUN_WORDS__, sb->buffer)) {
      e = friso_dic_get(friso->dic, __LEX_ENPUN_WORDS__, sb->buffer);
      chkunits = 0;
      break;
    }

    // mark the end of the buffer.
    sb->buffer[--sb->length] = '\0';
    wlen--;
    task->idx--;

    /* check and adjust the tcount */
    if (_ctype == 0) {
      tcount--;
      _ctype = 1;
    }
  }

  // check the condition to start the secondary segmentation.
  ssseg = (tcount > 1) && (chkunits == 1);

  // check if the tokenize loop was broken by whitespace;
  //    no need for all the following work if it was.
  //@added 2013-11-19
  if (wspace == 1 || task->idx == task->length) {
    blen = sb->length;
    e = new_lex_entry(string_buffer_devote(sb), NULL, 0, blen, __LEX_OTHER_WORDS__);
    e->rlen = wlen;
    // set the secondary mask.
    if (ssseg) task_ssseg_open(task);
    return e;
  }

  if (chkecm != 1) {
    /*
     * check the single word units,
     *     not only the chinese words but also other kinds of words,
     * so we can recognize complex units like '℉', '℃' etc.
     * @date 2013-10-14
     */
    if (chkunits && (friso_numeric_string(friso->charset, sb->buffer) ||
                     friso_decimal_string(friso->charset, sb->buffer))) {
      idx = task->idx;
      if ((task->bytes = readNextWord(friso, task, &idx, task->buffer)) != 0) {
        // check the CJK units dictionary.
        if (friso_dic_match(friso->dic, __LEX_CJK_UNITS__, task->buffer)) {
          fdunits = 1;
          string_buffer_append(sb, task->buffer);
          wlen += task->bytes;
          task->idx += task->bytes;
        }
      }
    }

    // set the START_SS_MASK
    if (fdunits != 1 && ssseg) {
      task_ssseg_open(task);
    }

    // create the lexicon entry and return it.
    blen = sb->length;
    e = new_lex_entry(string_buffer_devote(sb), NULL, 0, blen, __LEX_OTHER_WORDS__);
    e->rlen = wlen;

    return e;
  }

  // Try to find an english and chinese mixed word.
  tmp = new_string_buffer_with_string(sb->buffer);
  idx = task->idx;
  for (t = 0;
       t < config->mix_len && (task->bytes = readNextWord(friso, task, &idx, task->buffer)) != 0;
       t++) {
    // if ( ! friso_cn_string( friso->charset, task ) ) {
    //    task->idx -= task->bytes;
    //    break;
    //}
    // replaced with the whitespace check:
    // more complex mixed words could be found here
    // (not only english and chinese mixed words).
    //@date 2013-10-14
    if (friso_whitespace(friso->charset, task)) {
      break;
    }

    string_buffer_append(tmp, task->buffer);

    // check the mixed word dictionary.
    if (friso_dic_match(friso->dic, __LEX_ECM_WORDS__, tmp->buffer)) {
      e = friso_dic_get(friso->dic, __LEX_ECM_WORDS__, tmp->buffer);
    }
  }

  free_string_buffer(tmp);

  /* e not being NULL doesn't mean it must be an EC mixed word;
   *     it could be an english and punctuation mixed word, like 'c++'.
   * But we don't need to check and set the START_SS_MASK mask here.
   * */
  if (e != NULL) {
    task->idx += (e->length - sb->length);
    free_string_buffer(sb);
    return e;
  }

  // no match for a mixed word, try to find a single unit.
  if (chkunits && (friso_numeric_string(friso->charset, sb->buffer) ||
                   friso_decimal_string(friso->charset, sb->buffer))) {
    idx = task->idx;
    if ((task->bytes = readNextWord(friso, task, &idx, task->buffer)) != 0) {
      // check the single chinese units dictionary.
      if (friso_dic_match(friso->dic, __LEX_CJK_UNITS__, task->buffer)) {
        fdunits = 1;
        string_buffer_append(sb, task->buffer);
        wlen += task->bytes;
        task->idx += task->bytes;
      }
    }
  }

  // set the START_SS_MASK.
  if (fdunits != 1 && ssseg) {
    task_ssseg_open(task);
  }

  // create the lexicon entry and return it.
  blen = sb->length;
  e = new_lex_entry(string_buffer_devote(sb), NULL, 0, blen, __LEX_OTHER_WORDS__);
  e->rlen = wlen;

  return e;
}
/* }}} */

//-------------------------------------------------------------------
// friso core part 3: mmseg tokenize implementation functions
// mmseg algorithm implemented functions - start

/* {{{ get the next match from the current position,
 *        through the dictionary; this will return all the matches.
 *
 * @return friso_array_t that contains all the matches.
 */
__STATIC_API__ friso_array_t get_next_match(friso_t friso, friso_config_t config, friso_task_t task,
                                            uint_t idx) {
  register uint_t t;
  string_buffer_t sb = new_string_buffer_with_string(task->buffer);

  // create a match dynamic array.
  friso_array_t match = new_array_list_with_opacity(config->max_len);
  array_list_add(match, friso_dic_get(friso->dic, __LEX_CJK_WORDS__, task->buffer));

  for (t = 1;
       t < config->max_len && (task->bytes = readNextWord(friso, task, &idx, task->buffer)) != 0;
       t++) {
    if (friso_whitespace(friso->charset, task)) break;
    if (!friso_cn_string(friso->charset, task)) break;

    // append the task->buffer to the buffer.
    string_buffer_append(sb, task->buffer);

    // check the CJK dictionary.
    if (friso_dic_match(friso->dic, __LEX_CJK_WORDS__, sb->buffer)) {
      /*
       * add the lex_entry_t inside.
       * here is a key point:
       *        we use the friso_dic_get function
       *        to get the address of the lex_entry_cdt
       *        that is stored in the dictionary,
       *        not to create a new lex_entry_cdt.
       * so:
       *        1. we will not have to bother with the allocations of
       *            a newly created lex_entry_cdt.
       *        2. it is more efficient, of course.
       */
      array_list_add(match, friso_dic_get(friso->dic, __LEX_CJK_WORDS__, sb->buffer));
    }
  }

  /*buffer allocations clear*/
  free_string_buffer(sb);
  // array_list_trim( match );

  return match;
}
/* }}} */

/* {{{ chunk for mmseg defines and functions to handle them.*/
typedef struct {
  friso_array_t words;
  uint_t length;
  float average_word_length;
  float word_length_variance;
  float single_word_dmf;
} friso_chunk_entry;
typedef friso_chunk_entry *friso_chunk_t;
/* }}} */

/* {{{ create a new chunk*/
__STATIC_API__ friso_chunk_t new_chunk(friso_array_t words, uint_t length) {
  friso_chunk_t chunk = (friso_chunk_t)FRISO_MALLOC(sizeof(friso_chunk_entry));
  if (chunk == NULL) {
    ___ALLOCATION_ERROR___
  }

  chunk->words = words;
  chunk->length = length;
  chunk->average_word_length = -1;
  chunk->word_length_variance = -1;
  chunk->single_word_dmf = -1;

  return chunk;
}
/* }}} */

/* {{{ free the specified chunk */
__STATIC_API__ void free_chunk(friso_chunk_t chunk) {
  FRISO_FREE(chunk);
}
/* }}} */

/* {{{ a static function to count the average word length
 *    of the given chunk.
 */
__STATIC_API__ float count_chunk_avl(friso_chunk_t chunk) {
  chunk->average_word_length = ((float)chunk->length) / chunk->words->length;
  return chunk->average_word_length;
}
/* }}} */

/* {{{ a static function to count the word length variance
 *    of the given chunk.
 */
__STATIC_API__ float count_chunk_var(friso_chunk_t chunk) {
  float var = 0, tmp = 0;  // snapshot
  register uint_t t;
  lex_entry_t e;

  for (t = 0; t < chunk->words->length; t++) {
    e = (lex_entry_t)chunk->words->items[t];
    tmp = e->length - chunk->average_word_length;
    var += tmp * tmp;
  }

  chunk->word_length_variance = var / chunk->words->length;

  return chunk->word_length_variance;
}
/* }}} */

/* {{{ a static function to count the single word morpheme degree of freedom
 *    of the given chunk.
 */
__STATIC_API__ float count_chunk_mdf(friso_chunk_t chunk) {
  float __mdf__ = 0;
  register uint_t t;
  lex_entry_t e;

  for (t = 0; t < chunk->words->length; t++) {
    e = (lex_entry_t)chunk->words->items[t];
    // single CJK(UTF-8)/chinese(GBK) word.
    // it would be better to add a charset check here, but this works fine:
    // all CJK words take 3 bytes with UTF-8 encoding and
    // all chinese words take 2 bytes with GBK encoding.
    if (e->length == 3 || e->length == 2) {
      __mdf__ += (float)log((float)e->fre);
    }
  }
  chunk->single_word_dmf = __mdf__;

  return chunk->single_word_dmf;
}
/* }}} */

/* {{{ chunk printer - used for debug*/
#define ___CHUNK_PRINTER___(_chunks_)                         \
  for (t = 0; t < _chunks_->length; t++) {                    \
    __tmp__ = ((friso_chunk_t)_chunks_->items[t])->words;     \
    for (j = 0; j < __tmp__->length; j++) {                   \
      printf("%s/ ", ((lex_entry_t)__tmp__->items[j])->word); \
    }                                                         \
    putchar('\n');                                            \
  }                                                           \
  putchar('\n');
/* }}} */

/* {{{ mmseg algorithm core invoke.
 * here,
 * we use four rules to filter all the chunks and get the best chunk,
 *        and this is the core of the mmseg algorithm:
 * 1. maximum matched word length.
 * 2. largest average word length.
 * 3. smallest word length variance.
 * 4. largest single word morpheme degrees of freedom.
 */
__STATIC_API__ friso_chunk_t mmseg_core_invoke(friso_array_t chunks) {
  register uint_t t /*, j*/;
  float max;
  friso_chunk_t e;
  friso_array_t __res__, __tmp__;
  __res__ = new_array_list_with_opacity(chunks->length);

  // 1.get the maximum matched chunks.
  // count the maximum length
  max = (float)((friso_chunk_t)chunks->items[0])->length;
  for (t = 1; t < chunks->length; t++) {
    e = (friso_chunk_t)chunks->items[t];
    if (e->length > max) max = (float)e->length;
  }
  // get the chunk items that own the maximum length.
  for (t = 0; t < chunks->length; t++) {
    e = (friso_chunk_t)chunks->items[t];
    if (e->length >= max) {
      array_list_add(__res__, e);
    } else {
      free_array_list(e->words);
      free_chunk(e);
    }
  }
  // check the remaining chunks
  if (__res__->length == 1) {
    e = (friso_chunk_t)__res__->items[0];
    free_array_list(__res__);
    free_array_list(chunks);
    return e;
  } else {
    __tmp__ = array_list_clear(chunks);
    chunks = __res__;
    __res__ = __tmp__;
  }

  // 2.get the largest average word length chunks.
  // count the maximum average word length.
  max = count_chunk_avl((friso_chunk_t)chunks->items[0]);
  for (t = 1; t < chunks->length; t++) {
    e = (friso_chunk_t)chunks->items[t];
    if (count_chunk_avl(e) > max) {
      max = e->average_word_length;
    }
  }
  // get the chunk items that own the largest average word length.
  for (t = 0; t < chunks->length; t++) {
    e = (friso_chunk_t)chunks->items[t];
    if (e->average_word_length >= max) {
      array_list_add(__res__, e);
    } else {
      free_array_list(e->words);
      free_chunk(e);
    }
  }
  // check the remaining chunks
  if (__res__->length == 1) {
    e = (friso_chunk_t)__res__->items[0];
    free_array_list(__res__);
    free_array_list(chunks);
    return e;
  } else {
    __tmp__ = array_list_clear(chunks);
    chunks = __res__;
    __res__ = __tmp__;
  }

  // 3.get the smallest word length variance chunks.
  // count the smallest word length variance
  max = count_chunk_var((friso_chunk_t)chunks->items[0]);
  for (t = 1; t < chunks->length; t++) {
    e = (friso_chunk_t)chunks->items[t];
    if (count_chunk_var(e) < max) {
      max = e->word_length_variance;
    }
  }
  // get the chunks that own the smallest word length variance.
  for (t = 0; t < chunks->length; t++) {
    e = (friso_chunk_t)chunks->items[t];
    if (e->word_length_variance <= max) {
      array_list_add(__res__, e);
    } else {
      free_array_list(e->words);
      free_chunk(e);
    }
  }
  // check the remaining chunks
  if (__res__->length == 1) {
    e = (friso_chunk_t)__res__->items[0];
    free_array_list(chunks);
    free_array_list(__res__);
    return e;
  } else {
    __tmp__ = array_list_clear(chunks);
    chunks = __res__;
    __res__ = __tmp__;
  }

  // 4.get the largest single word morpheme degrees of freedom.
  // count the maximum single word morpheme degrees of freedom
  max = count_chunk_mdf((friso_chunk_t)chunks->items[0]);
  for (t = 1; t < chunks->length; t++) {
    e = (friso_chunk_t)chunks->items[t];
    if (count_chunk_mdf(e) > max) {
      max = e->single_word_dmf;
    }
  }
  // get the chunks that own the largest single word morpheme degrees of freedom.
  for (t = 0; t < chunks->length; t++) {
    e = (friso_chunk_t)chunks->items[t];
    if (e->single_word_dmf >= max) {
      array_list_add(__res__, e);
    } else {
      free_array_list(e->words);
      free_chunk(e);
    }
  }

  /*
   * is there still more than one chunk left?
   *        well, this rarely happens but it still happens.
   * here we simply return the first chunk as the final result,
   *         and we need to free all the chunks that __res__
   *     points to except the first one.
   * two things have to be done to totally free a chunk:
   * 1. call free_array_list to free the allocations of the chunk's words.
   * 2. call free_chunk to free the allocations of the chunk itself.
   */
  for (t = 1; t < __res__->length; t++) {
    e = (friso_chunk_t)__res__->items[t];
    free_array_list(e->words);
    free_chunk(e);
  }

  e = (friso_chunk_t)__res__->items[0];
  free_array_list(chunks);
  free_array_list(__res__);

  return e;
}
/* }}} */
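
/* A worked example of the four rules above, using UTF-8 byte lengths (every
 * CJK char counts 3 bytes) and assuming 研究, 研究生, 生命, 命 and 起源 are all
 * present in the CJK lexicon. Take the classic mmseg sample 研究生命起源 and
 * two of its candidate chunks:
 *     chunk A: 研究生 / 命 / 起源   -> word lengths 9, 3, 6
 *     chunk B: 研究 / 生命 / 起源   -> word lengths 6, 6, 6
 * rule 1: both chunks have the same total length (18), so both survive.
 * rule 2: both have the same average word length (18 / 3 = 6), so both survive.
 * rule 3: variance of A = ((9-6)^2 + (3-6)^2 + (6-6)^2) / 3 = 6, while the
 *         variance of B = 0, so chunk B is chosen and next_complex_cjk below
 *         returns its first word 研究.
 * rule 4 would only be consulted if the variances were also equal. */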

/* {{{ get the next cjk word from the current position with complex mode.
 *    this is the core of the mmseg chinese word segmentation algorithm:
 *    we use four rules to filter the matched chunks and get the best one
 *        as the final result.
 *
 * @see mmseg_core_invoke( chunks );
 */
FRISO_API lex_entry_t next_complex_cjk(friso_t friso, friso_config_t config, friso_task_t task) {
  register uint_t x, y, z;
  /* back up the task->bytes here */
  uint_t __idx__ = task->bytes;
  lex_entry_t fe, se, te;
  friso_chunk_t e;
  friso_array_t words, chunks;
  friso_array_t smatch, tmatch, fmatch = get_next_match(friso, config, task, task->idx);

  /*
   * here:
   *        if the length of fmatch is 1, it means we don't have to
   * continue with the following work (we would get the same result no matter what).
   */
  if (fmatch->length == 1) {
    fe = ((lex_entry_t)fmatch->items[0]);
    free_array_list(fmatch);

    /*
     * check and clear the stop words.
     * @date 2013-06-13
     */
    if (config->clr_stw && friso_dic_match(friso->dic, __LEX_STOPWORDS__, fe->word)) {
      return NULL;
    }

    return fe;
  }

  chunks = new_array_list();
  task->idx -= __idx__;

  for (x = 0; x < fmatch->length; x++) {
    /*get the word and try the second layer match*/
    fe = (lex_entry_t)array_list_get(fmatch, x);
    __idx__ = task->idx + fe->length;
    readNextWord(friso, task, &__idx__, task->buffer);

    if (task->bytes != 0 && friso_cn_string(friso->charset, task) &&
        friso_dic_match(friso->dic, __LEX_CJK_WORDS__, task->buffer)) {
      // get the next matches
      smatch = get_next_match(friso, config, task, __idx__);
      for (y = 0; y < smatch->length; y++) {
        /*get the word and try the third layer match*/
        se = (lex_entry_t)array_list_get(smatch, y);
        __idx__ = task->idx + fe->length + se->length;
        readNextWord(friso, task, &__idx__, task->buffer);

        if (task->bytes != 0 && friso_cn_string(friso->charset, task) &&
            friso_dic_match(friso->dic, __LEX_CJK_WORDS__, task->buffer)) {
          // get the matches.
          tmatch = get_next_match(friso, config, task, __idx__);
          for (z = 0; z < tmatch->length; z++) {
            te = (lex_entry_t)array_list_get(tmatch, z);
            words = new_array_list_with_opacity(3);
            array_list_add(words, fe);
            array_list_add(words, se);
            array_list_add(words, te);
            array_list_add(chunks, new_chunk(words, fe->length + se->length + te->length));
          }
          // free the third matched array list
          free_array_list(tmatch);
        } else {
          words = new_array_list_with_opacity(2);
          array_list_add(words, fe);
          array_list_add(words, se);
          // add the chunk
          array_list_add(chunks, new_chunk(words, fe->length + se->length));
        }
      }
      // free the second match array list
      free_array_list(smatch);
    } else {
      words = new_array_list_with_opacity(1);
      array_list_add(words, fe);
      array_list_add(chunks, new_chunk(words, fe->length));
    }
  }
  // free the first match array list
  free_array_list(fmatch);

  /*
   * filter the chunks with the four rules of the mmseg algorithm
   *        and get the best chunk as the final result.
   *
   * @see mmseg_core_invoke( chunks );
   * @date 2012-12-13
   */
  if (chunks->length > 1) {
    e = mmseg_core_invoke(chunks);
  } else {
    e = (friso_chunk_t)chunks->items[0];
  }

  fe = (lex_entry_t)e->words->items[0];
  task->idx += fe->length;    // reset the idx of the task.
  free_array_list(e->words);  // free the chunk's words allocation
  free_chunk(e);

  // clear the stop words
  if (config->clr_stw && friso_dic_match(friso->dic, __LEX_STOPWORDS__, fe->word)) {
    return NULL;
  }

  return fe;
}
/* }}} */
//----------------end of mmseg core

//-------------------------------------------------------------------------------------
// mmseg core logic controller, output style controller and macro defines
/* {{{ A macro function to check and free
 *     the lex_entry_t with type of __LEX_OTHER_WORDS__.
 */
#define check_free_otlex_entry(lex)         \
  do {                                      \
    if (lex->type == __LEX_OTHER_WORDS__) { \
      FRISO_FREE(lex->word);                \
      free_lex_entry(lex);                  \
    }                                       \
  } while (0)
/* }}} */

/* {{{ sphinx style output: synonym words append.
 *
 * @param    task
 * @param    lex
 * */
__STATIC_API__ void token_sphinx_output(friso_task_t task, lex_entry_t lex) {
  uint_t i, j, len;
  fstring _word;
  len = lex->length;

  // append the synonym words.
  for (i = 0; i < lex->syn->length; i++) {
    _word = (fstring)lex->syn->items[i];
    j = strlen(_word);
    if ((len + j + 1) >= __HITS_WORD_LENGTH__) break;
    memcpy(task->token->word + len, "|", 1);
    len += 1;
    memcpy(task->token->word + len, _word, j);
    len += j;
  }

  // set the new end of the buffer.
  task->token->word[len] = '\0';
}
/* }}} */
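
/* For illustration: with config->spx_out enabled and a hypothetical token
 * whose word is "研究" and whose synonym list holds "钻研" and "研讨", the
 * function above rewrites task->token->word in place into the pipe-joined
 * form "研究|钻研|研讨" (appending stops once the result would reach
 * __HITS_WORD_LENGTH__). */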

/* {{{ normal style output: synonym words append.
 *
 * @param    task
 * @param    lex
 * @param    front    1 to add the synonym words from the head and
 *                     0 to append them from the tail.
 * */
__STATIC_API__ void token_normal_output(friso_task_t task, lex_entry_t lex, int front) {
  uint_t i;
  fstring _word;
  lex_entry_t e;

  for (i = 0; i < lex->syn->length; i++) {
    _word = (fstring)lex->syn->items[i];
    e = new_lex_entry(_word, NULL, 0, strlen(_word), __LEX_NCSYN_WORDS__);
    e->offset = lex->offset;
    // add to the buffer.
    if (front) {
      link_list_add_first(task->pool, e);
    } else {
      link_list_add(task->pool, e);
    }
  }
}
/* }}} */
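
/* In contrast to the sphinx style above, the normal style does not touch
 * task->token->word: each synonym becomes its own __LEX_NCSYN_WORDS__
 * lex_entry_t pushed onto task->pool (sharing the original token's offset),
 * so the synonyms are returned as separate tokens by the following calls to
 * the token functions below. */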

/* {{{ do the secondary segmentation of the complex english token.
 *
 * @param    friso
 * @param    config
 * @param    task
 * @param    lex
 * @param    retfw    Whether to return the first word.
 * @return    lex_entry_t(NULL or the first sub token of the lex)
 */
__STATIC_API__ lex_entry_t en_second_seg(friso_t friso, friso_config_t config, friso_task_t task,
                                         lex_entry_t lex, int retfw) {
  // printf("sseg: %d\n", (task->ctrlMask & START_SS_MASK));

  int j, p = 0, start = 0;
  fstring str = lex->word;

  lex_entry_t fword = NULL, sword = NULL;

  int _ctype, _TYPE = get_enchar_type(str[0]);
  string_buffer_clear(task->sbuf);
  string_buffer_append_char(task->sbuf, str[0]);

  for (j = 1; j < lex->length; j++) {
    // get the type of the char
    _ctype = get_enchar_type(str[j]);
    if (_ctype == FRISO_EN_WHITESPACE) {
      _TYPE = FRISO_EN_WHITESPACE;
      p++;
      continue;
    }

    if (_ctype == _TYPE) {
      string_buffer_append_char(task->sbuf, str[j]);
    } else {
      start = j - task->sbuf->length - p;

      /* If the number of chars of the current type
       *     is not smaller than config->st_minl, we will
       *     create a new lex_entry_t and append it to the task->pool.
       * */
      if (task->sbuf->length >= config->st_minl &&
          !(config->clr_stw &&
            friso_dic_match(friso->dic, __LEX_STOPWORDS__, task->sbuf->buffer))) {
        /* the allocation of the lex_entry_t and its word
         *     should be released later, so the type of the lex_entry_t
         *     must be __LEX_OTHER_WORDS__.
         * */
        sword = new_lex_entry(rm_strdup(task->sbuf->buffer), NULL, 0, task->sbuf->length,
                              __LEX_OTHER_WORDS__);
        sword->offset = lex->offset + start;
        if (retfw && fword == NULL) {
          fword = sword;
        } else {
          link_list_add(task->pool, sword);
        }
      }

      string_buffer_clear(task->sbuf);
      string_buffer_append_char(task->sbuf, str[j]);
      p = 0;
      _TYPE = _ctype;
    }
  }

  // continue to check the last item.
  if (task->sbuf->length >= config->st_minl &&
      !(config->clr_stw && friso_dic_match(friso->dic, __LEX_STOPWORDS__, task->sbuf->buffer))) {
    start = j - task->sbuf->length;
    sword = new_lex_entry(rm_strdup(task->sbuf->buffer), NULL, 0, task->sbuf->length,
                          __LEX_OTHER_WORDS__);
    sword->offset = j - task->sbuf->length;
    if (retfw && fword == NULL) {
      fword = sword;
    } else {
      link_list_add(task->pool, sword);
    }
  }

  return fword;
}
/*}}}*/
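
/* For illustration: with config->st_minl = 1, passing a lex whose word is
 * "qq2013" through en_second_seg splits it at the letter/digit boundary into
 * the sub tokens "qq" and "2013" (this is the 'qq2013' -> 'qq', '2013' case
 * mentioned in next_mmseg_token below). With retfw = 1 the first sub token is
 * returned to the caller and the remaining ones are pushed onto task->pool;
 * with retfw = 0 all sub tokens go onto the pool. */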

/* {{{ english synonym words check and append macro define.*/
#define append_en_syn(lex, tmp, front)                                            \
  do {                                                                            \
    if ((tmp = friso_dic_get(friso->dic, __LEX_EN_WORDS__, lex->word)) != NULL && \
        (tmp->syn) != NULL) {                                                     \
      if (config->spx_out == 1)                                                   \
        token_sphinx_output(task, tmp);                                           \
      else {                                                                      \
        tmp->offset = lex->offset;                                                \
        token_normal_output(task, tmp, front);                                    \
      }                                                                           \
    }                                                                             \
  } while (0)
/* }}} */

/* {{{ get the next segmentation.
 *     this is also the friso interface function.
 *
 * @param     friso
 * @param    config
 * @param    task
 * @return    friso_token_t
 */
FRISO_API friso_token_t next_mmseg_token(friso_t friso, friso_config_t config, friso_task_t task) {
  uint_t j, len = 0;
  string_buffer_t sb = NULL;
  lex_entry_t lex = NULL, tmp = NULL, sword = NULL;

  /* {{{ task word pool check */
  if (!link_list_empty(task->pool)) {
    /*
     * load the word from the word pool if it is not empty.
     *  this makes getting the next word more convenient and efficient.
     *     often synonyms and newly created words are stored in the pool.
     */
    lex = (lex_entry_t)link_list_remove_first(task->pool);
    memcpy(task->token->word, lex->word, lex->length);
    task->token->type = lex->type;
    task->token->length = lex->length;
    task->token->rlen = lex->rlen;
    task->token->offset = lex->offset;
    task->token->word[lex->length] = '\0';

    /* check and handle the english synonym words append mask.
     *     Also we have to close the mask after finishing the operation.
     *
     * 1. we've checked config->add_syn before opening the
     *         _LEX_APPENSYN_MASK mask.
     * 2. we should add the synonym words of the current
     *         lex_entry_t from the head.
     *
     * @since: 1.6.0
     * */
    if (lex_appensyn_check(lex)) {
      lex_appensyn_close(lex);
      append_en_syn(lex, tmp, 1);
    }

    /*
     * __LEX_NCSYN_WORDS__:
     *  these lex_entry_t were created to store the synonym words,
     *     and their word points to a synonym word of a lex_entry_t in
     *         friso->dic, so:
     *     free the lex_entry_t but not its word here.
     *
     * __LEX_OTHER_WORDS__:
     *  a newly created lexicon entry, like a chinese and english mixed word
     *     created during the invocation of the function next_basic_latin.
     *
     * other types:
     *  they must exist in the dictionary, so just pass them.
     */
    switch (lex->type) {
      case __LEX_OTHER_WORDS__:
        FRISO_FREE(lex->word);
        free_lex_entry(lex);
        break;
      case __LEX_NCSYN_WORDS__:
        free_lex_entry(lex);
        break;
    }

    return task->token;
  }
  /* }}} */

  while (task->idx < task->length) {
    // read the next word from the current position.
    task->bytes = readNextWord(friso, task, &task->idx, task->buffer);
    if (task->bytes == 0) break;

    // clear up the whitespace.
    if (friso_whitespace(friso->charset, task)) continue;

    /* {{{ CJK words recognize block. */
    if (friso_cn_string(friso->charset, task)) {
      /* check the dictionary
       * and return the unrecognized CJK char as a single word.
       * */
      if (!friso_dic_match(friso->dic, __LEX_CJK_WORDS__, task->buffer)) {
        memcpy(task->token->word, task->buffer, task->bytes);
        task->token->type = __LEX_PUNC_WORDS__;
        task->token->length = task->bytes;
        task->token->rlen = task->bytes;
        task->token->offset = task->idx - task->bytes;
        task->token->word[(int)task->bytes] = '\0';
        return task->token;
      }

      // specified mode split.
      // if ( config->mode == __FRISO_COMPLEX_MODE__ )
      //    lex = next_complex_cjk( friso, config, task );
      // else lex = next_simple_cjk( friso, config, task );
      lex = config->next_cjk(friso, config, task);

      if (lex == NULL) continue;  // found a stopword.
      lex->offset = task->idx - lex->rlen;

      /*
       * try to find a chinese and english mixed word, like '卡拉ok';
       *     keep in mind that this is not an english and chinese mixed word
       *         like 'x射线'.
       *
       * @reader:
       * 1. only if the char after the current word is an english char.
       * 2. if the first point is met, friso will call next_basic_latin() to
       *         get the next basic latin. (yeah, you have to handle it).
       * 3. if a CE word is matched, set lex to the newly matched CE word.
       * 4. if no CE word is matched, we will have to append the basic latin
       *         to the pool, and it should come after the append of the synonym words.
       * 5. do not use the task->buffer and task->unicode as the check
       *         condition for the CE word identification.
       * 6. Added the friso_numeric_letter check so words like '高3' work.
       *
       * @date 2013-09-02
       */
      if ((task->idx < task->length) && ((int)task->text[task->idx]) > 0 &&
          (friso_en_letter(friso->charset, task) || friso_numeric_letter(friso->charset, task))) {
        // create a string buffer
        sb = new_string_buffer_with_string(lex->word);

        // find the next basic latin.
        task->buffer[0] = task->text[task->idx++];
        task->buffer[1] = '\0';
        tmp = next_basic_latin(friso, config, task);
        tmp->offset = task->idx - tmp->length;
        string_buffer_append(sb, tmp->word);

        // check the CE dictionary.
        if (friso_dic_match(friso->dic, __LEX_CEM_WORDS__, sb->buffer)) {
          j = lex->offset;  // back up the offset.
          lex = friso_dic_get(friso->dic, __LEX_CEM_WORDS__, sb->buffer);
          lex->offset = j;
          check_free_otlex_entry(tmp);
          free_string_buffer(sb);
          tmp = NULL;
          sb = NULL;
        }
      }

      /*
       * copy the lex_entry to the result token
       *
       * @reader: (bloody lesson, added 2013-08-31):
       *     don't bother to handle the task->token->offset problem;
       *         it has been solved perfectly above.
       */
      len = (int)lex->length;
      memcpy(task->token->word, lex->word, lex->length);
      task->token->type = lex->type;
      task->token->length = lex->length;
      task->token->rlen = lex->rlen;
      task->token->offset = lex->offset;
      task->token->word[len] = '\0';

      // check and append the synonym words
      if (config->add_syn && lex->syn != NULL) {
        if (config->spx_out == 1) {
          token_sphinx_output(task, lex);
        } else {
          token_normal_output(task, lex, 0);
        }
      }

      /* {{{ here: handle the newly found basic latin created when
       * we tried to find a CE word.
       *
       * @reader:
       * when tmp is not NULL, sb will not be NULL either,
       *     unless a CE word was found.
       *
       * @TODO: finished appending the synonym words on 2013-12-19.
       */
      if (tmp != NULL && sb != NULL) {
        // check the secondary split.
        if (config->en_sseg == 1 && task_ssseg_check(task)) {
          en_second_seg(friso, config, task, tmp, 0);
        }

        free_string_buffer(sb);
        link_list_add(task->pool, tmp);

        // check whether to append the synonym words.
        if (config->add_syn == 1) {
          lex_appensyn_open(tmp);
        }
      }
      /* }}} */

      return task->token;
    }
    /* }}} */

    /* {{{ basic english/latin recognize block. */
    else if (friso_halfwidth_en_char(friso->charset, task) ||
             friso_fullwidth_en_char(friso->charset, task)) {
      /*
       * handle the english punctuation.
       *
       * @todo:
       * 1. comment out all the code of the following if block
       *     and uncomment the continue to clear up the punctuation directly.
       *
       * @reader:
       * 2. keep in mind that ALL the english punctuations will be handled here
       *  (when an english punctuation is found during the other processing, we
       *      reset the task->idx back to it and then come back here),
       *     except the keep punctuations (defined in file friso_string.c)
       *     that will make up a word with the english chars around them.
       */
      if (friso_en_punctuation(friso->charset, task)) {
        if (config->clr_stw && friso_dic_match(friso->dic, __LEX_STOPWORDS__, task->buffer)) {
          continue;
        }

        // count the punctuation in.
        task->token->word[0] = task->buffer[0];
        task->token->type = __LEX_PUNC_WORDS__;
        task->token->length = task->bytes;
        task->token->rlen = task->bytes;
        task->token->offset = task->idx - task->bytes;
        task->token->word[1] = '\0';
        return task->token;

        // continue
      }

      // get the next basic latin word.
      lex = next_basic_latin(friso, config, task);
      lex->offset = task->idx - lex->rlen;

      /* @added: 2013-12-22
       * check and do the secondary segmentation work.
       * this will split 'qq2013' into 'qq', '2013'.
       * */
      sword = NULL;
      if (config->en_sseg == 1 && task_ssseg_check(task)) {
        sword = en_second_seg(friso, config, task, lex, 1);
      }

      // check if it is a stopword.
      if (config->clr_stw && friso_dic_match(friso->dic, __LEX_STOPWORDS__, lex->word)) {
        // free the newly created lexicon entry.
        check_free_otlex_entry(lex);
        if (sword == NULL) continue;
        lex = sword;
      } else if (sword != NULL) {
        if (config->add_syn == 1) lex_appensyn_open(lex);
        link_list_add(task->pool, lex);

        /* If the sub token is not NULL:
         * add the lex to the task->pool
         * and return the sub token instead of lex, so
         *     the sub tokens will be output ahead of lex.
         * */
        lex = sword;
      }

      // if the token is longer than __HITS_WORD_LENGTH__, drop it.
      // copy the word to the task token buffer.
      // if ( lex->length >= __HITS_WORD_LENGTH__ ) continue;
      memcpy(task->token->word, lex->word, lex->length);
      task->token->type = lex->type;
      task->token->length = lex->length;
      task->token->rlen = lex->rlen;
      task->token->offset = lex->offset;
      task->token->word[lex->length] = '\0';

      /* If sword is NULL, continue to check and append
       * the synonym words for the current lex_entry_t.
       * */
      if (sword == NULL && config->add_syn == 1) {
        append_en_syn(lex, tmp, 0);
      }

      // free the newly created lex_entry_t
      check_free_otlex_entry(lex);

      return task->token;
    }
    /* }}} */

    /* {{{ Keep the chinese punctuation.
     * @added 2013-08-31 */
    else if (friso_cn_punctuation(friso->charset, task)) {
      if (config->clr_stw && friso_dic_match(friso->dic, __LEX_STOPWORDS__, task->buffer)) {
        continue;
      }

      // count the punctuation in.
      memcpy(task->token->word, task->buffer, task->bytes);
      task->token->type = __LEX_PUNC_WORDS__;
      task->token->length = task->bytes;
      task->token->offset = task->idx - task->bytes;
      task->token->word[task->bytes] = '\0';
      return task->token;
    }
    /* }}} */
    // else if ( friso_letter_number( friso->charset, task ) )
    //{
    //}
    // else if ( friso_other_number( friso->charset, task ) )
    //{
    //}

    /* {{{ keep the unrecognized words?
     * @date 2013-10-14 */
    else if (config->keep_urec) {
      memcpy(task->token->word, task->buffer, task->bytes);
      task->token->type = __LEX_UNKNOW_WORDS__;
      task->token->length = task->bytes;
      task->token->offset = task->idx - task->bytes;
      task->token->word[task->bytes] = '\0';
      return task->token;
    }
    /* }}} */
  }

  return NULL;
}
/* }}} */

//----------------------------------------------------------------------
// detect core logic controller: detect tokenize mode handler functions
/** {{{ get the next split token in detect mode.
 *    detect mode will only return the words in the dictionary,
 *        using a simple forward maximum matching algorithm.
 */
FRISO_API friso_token_t next_detect_token(friso_t friso, friso_config_t config, friso_task_t task) {
  lex_entry_t lex = NULL;
  int i, __convert = 0, tbytes, wbytes;

  /* {{{ task word pool check */
  if (!link_list_empty(task->pool)) {
    /*
     * load the word from the word pool if it is not empty.
     *  this makes getting the next word more convenient and efficient.
     *     often synonyms and newly created words are stored in the pool.
     */
    lex = (lex_entry_t)link_list_remove_first(task->pool);
    memcpy(task->token->word, lex->word, lex->length);
    task->token->type = lex->type;
    task->token->length = lex->length;
    task->token->rlen = lex->rlen;
    task->token->offset = lex->offset;
    task->token->word[lex->length] = '\0';

    /*
     * __LEX_NCSYN_WORDS__:
     *  these lex_entry_t were created to store the synonym words,
     *     and their word points to a synonym word of a lex_entry_t in
     *         friso->dic, so:
     *     free the lex_entry_t but not its word here.
     */
    if (lex->type == __LEX_NCSYN_WORDS__) {
      free_lex_entry(lex);
    }

    return task->token;
  }
  /* }}} */

  while (task->idx < task->length) {
    lex = NULL;

    // read the next word from the current position.
    task->bytes = readNextWord(friso, task, &task->idx, task->buffer);
    if (task->bytes == 0) break;

    // clear up the whitespace.
    if (friso_whitespace(friso->charset, task)) continue;

    // convert full-width to half-width
    // and uppercase to lowercase for english chars
    wbytes = 0;
    tbytes = task->bytes;
    convert_full_to_half(friso, task, __convert);
    convert_upper_to_lower(friso, task, __convert);
    convert_work_apply(friso, task, __convert);

    string_buffer_clear(task->sbuf);
    string_buffer_append(task->sbuf, task->buffer);
    if (friso_dic_match(friso->dic, __LEX_CJK_WORDS__, task->sbuf->buffer)) {
      lex = friso_dic_get(friso->dic, __LEX_CJK_WORDS__, task->sbuf->buffer);
      wbytes = tbytes;
    }

    for (i = 1; i < config->max_len; i++) {
      task->bytes = readNextWord(friso, task, &task->idx, task->buffer);
      if (task->bytes == 0) break;

      // convert full-width to half-width
      // and uppercase to lowercase for english chars
      tbytes += task->bytes;
      convert_full_to_half(friso, task, __convert);
      convert_upper_to_lower(friso, task, __convert);
      convert_work_apply(friso, task, __convert);
      string_buffer_append(task->sbuf, task->buffer);

      if (friso_dic_match(friso->dic, __LEX_CJK_WORDS__, task->sbuf->buffer)) {
        lex = friso_dic_get(friso->dic, __LEX_CJK_WORDS__, task->sbuf->buffer);
        wbytes = tbytes;
      }
    }

    /*
     * matched no word in the dictionary:
     *         reset the task->idx to the correct value.
     */
    if (lex == NULL) {
      task->idx -= (tbytes - 1);
      continue;
    }

    // matched an item, take it to initialize the returning token.
    //    we also need to push back the non-matched part by resetting the task->idx.
    task->idx -= (tbytes - wbytes);

    memcpy(task->token->word, lex->word, lex->length);
    task->token->type = __LEX_CJK_WORDS__;
    task->token->length = lex->length;
    task->token->rlen = wbytes;
    task->token->offset = task->idx - wbytes;
    task->token->word[(int)lex->length] = '\0';

    // check and append the synonym words
    if (config->add_syn && lex->syn != NULL) {
      if (config->spx_out == 1) {
        token_sphinx_output(task, lex);
      } else {
        token_normal_output(task, lex, 0);
      }
    }

    return task->token;
  }

  return NULL;
}
/* }}} */
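
/* A minimal, non-authoritative sketch of switching to detect mode. It reuses
 * the friso/config/task objects from the earlier usage sketch; the input
 * text is a placeholder.
 *
 *   friso_set_mode(config, __FRISO_DETECT_MODE__);  // binds next_detect_token
 *   friso_set_text(task, "some placeholder text");
 *   friso_token_t token;
 *   while ((token = config->next_token(friso, config, task)) != NULL) {
 *     // only words found in the __LEX_CJK_WORDS__ dictionary are returned;
 *     // anything unmatched is silently skipped by next_detect_token.
 *     printf("%s\n", token->word);
 *   }
 */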