1 /* Copyright(C) 2004 Brazil
2 
3   This library is free software; you can redistribute it and/or
4   modify it under the terms of the GNU Lesser General Public
5   License as published by the Free Software Foundation; either
6   version 2.1 of the License, or (at your option) any later version.
7 
8   This library is distributed in the hope that it will be useful,
9   but WITHOUT ANY WARRANTY; without even the implied warranty of
10   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11   Lesser General Public License for more details.
12 
13   You should have received a copy of the GNU Lesser General Public
14   License along with this library; if not, write to the Free Software
15   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
16 */
17 #include "senna_in.h"
18 #include <string.h>
19 #include <ctype.h>
20 #include "lex.h"
21 
22 /* ngram */
23 
24 inline static sen_lex *
sen_ngram_open(sen_sym * sym,sen_nstr * nstr,uint8_t flags)25 sen_ngram_open(sen_sym *sym, sen_nstr *nstr, uint8_t flags)
26 {
27   sen_lex *lex;
28   sen_ctx *ctx = nstr->ctx;
29   if (!(lex = SEN_MALLOC(sizeof(sen_lex)))) { return NULL; }
30   lex->sym = sym;
31 #ifndef NO_MECAB
32   lex->mecab = NULL;
33 #endif /* NO_MECAB */
34   lex->buf = NULL;
35   lex->token = NULL;
36   lex->tlen = 0;
37   lex->pos = -1;
38   lex->skip = 1;
39   lex->tail = 0;
40   lex->flags = flags;
41   lex->status = sen_lex_doing;
42   lex->encoding = sym->encoding;
43   lex->nstr = nstr;
44   lex->orig = (unsigned char *)nstr->norm;
45   lex->next = (unsigned char *)nstr->norm;
46   lex->uni_alpha = (nstr->ctypes && !(lex->sym->flags & SEN_INDEX_SPLIT_ALPHA));
47   lex->uni_digit = (nstr->ctypes && !(lex->sym->flags & SEN_INDEX_SPLIT_DIGIT));
48   lex->uni_symbol = (nstr->ctypes && !(lex->sym->flags & SEN_INDEX_SPLIT_SYMBOL));
49   lex->force_prefix = 0;
50   return lex;
51 }
52 
/*
 * Copy `len` bytes starting at `str` into (lex)->token and NUL-terminate it,
 * growing the token buffer first when its recorded capacity (lex)->tlen is
 * smaller than `len`.
 *
 * NOTE: this macro is only usable inside a function that
 *   - has a variable named `ctx` in scope (SEN_REALLOC is invoked here), and
 *   - returns sen_id: when the reallocation fails the macro marks the lexer
 *     as sen_lex_done and *returns SEN_SYM_NIL from the enclosing function*.
 * `len` is evaluated more than once, so pass a side-effect-free expression.
 */
#define LEX_TOKEN(lex,str,len) do {\
  if ((lex)->tlen < (len)) {\
    char *buf = SEN_REALLOC((lex)->token, (len) + 1);\
    if (!(buf)) { (lex)->status = sen_lex_done; return SEN_SYM_NIL; }\
    (lex)->token = buf;\
    (lex)->tlen = (len);\
  }\
  memcpy((lex)->token, (str), (len));\
  (lex)->token[(len)] = '\0';\
} while (0)
63 
64 inline static sen_id
sen_ngram_next(sen_lex * lex)65 sen_ngram_next(sen_lex *lex)
66 {
67   sen_id tid;
68   sen_sym *sym = lex->sym;
69   sen_ctx *ctx = lex->nstr->ctx;
70   uint_least8_t *cp = NULL;
71   int32_t len = 0, pos;
72   const unsigned char *p, *q, *r;
73   if (lex->status == sen_lex_done) { return SEN_SYM_NIL; }
74   lex->force_prefix = 0;
75   for (p = lex->next, pos = lex->pos + lex->skip; *p; p = r, pos++) {
76     if (lex->nstr->ctypes) { cp = lex->nstr->ctypes + pos; }
77     if (lex->uni_alpha && SEN_NSTR_CTYPE(*cp) == sen_str_alpha) {
78       for (len = 1, r = p;;len++) {
79         size_t cl;
80         if (!(cl = sen_str_charlen((char *)r, lex->encoding))) { break; }
81         r += cl;
82         if (SEN_NSTR_ISBLANK(*cp)) { break; }
83         if (SEN_NSTR_CTYPE(*++cp) != sen_str_alpha) { break; }
84       }
85       {
86         size_t blen = r - p;
87         if (!blen) {
88           lex->status = sen_lex_done;
89           return SEN_SYM_NIL;
90         }
91         LEX_TOKEN(lex, p, blen);
92         tid = (lex->flags & SEN_LEX_ADD) ? sen_sym_get(sym, lex->token) : sen_sym_at(sym, lex->token);
93         lex->skip = len;
94       }
95     } else if (lex->uni_digit && SEN_NSTR_CTYPE(*cp) == sen_str_digit) {
96       for (len = 1, r = p;;len++) {
97         size_t cl;
98         if (!(cl = sen_str_charlen((char *)r, lex->encoding))) { break; }
99         r += cl;
100         if (SEN_NSTR_ISBLANK(*cp)) { break; }
101         if (SEN_NSTR_CTYPE(*++cp) != sen_str_digit) { break; }
102       }
103       {
104         size_t blen = r - p;
105         if (!blen) {
106           lex->status = sen_lex_done;
107           return SEN_SYM_NIL;
108         }
109         LEX_TOKEN(lex, p, blen);
110         tid = (lex->flags & SEN_LEX_ADD) ? sen_sym_get(sym, lex->token) : sen_sym_at(sym, lex->token);
111         lex->skip = len;
112       }
113     } else if (lex->uni_symbol && SEN_NSTR_CTYPE(*cp) == sen_str_symbol) {
114       for (len = 1, r = p;;len++) {
115         size_t cl;
116         if (!(cl = sen_str_charlen((char *)r, lex->encoding))) { break; }
117         r += cl;
118         if (SEN_NSTR_ISBLANK(*cp)) { break; }
119         if (SEN_NSTR_CTYPE(*++cp) != sen_str_symbol) { break; }
120       }
121       {
122         size_t blen = r - p;
123         if (!blen) {
124           lex->status = sen_lex_done;
125           return SEN_SYM_NIL;
126         }
127         LEX_TOKEN(lex, p, blen);
128         tid = (lex->flags & SEN_LEX_ADD) ? sen_sym_get(sym, lex->token) : sen_sym_at(sym, lex->token);
129         lex->skip = len;
130       }
131     } else {
132       size_t cl;
133 #ifdef PRE_DEFINED_UNSPLIT_WORDS
134       {
135         const unsigned char *key = NULL;
136         if ((tid = sen_sym_common_prefix_search(sym, p))) {
137           if (!(key = _sen_sym_key(sym, tid))) {
138             lex->status = sen_lex_not_found;
139             return SEN_SYM_NIL;
140           }
141           len = sen_str_len(key, lex->encoding, NULL);
142         }
143         r = p + sen_str_charlen(p, lex->encoding);
144         if (tid && (len > 1 || r == p)) {
145           if (r != p && pos + len - 1 <= lex->tail) { continue; }
146           p += strlen(key);
147           if (!*p && !(lex->flags & SEN_LEX_UPD)) { lex->status = sen_lex_done; }
148         }
149       }
150 #endif /* PRE_DEFINED_UNSPLIT_WORDS */
151       if (!(cl = sen_str_charlen((char *)p, lex->encoding))) {
152         lex->status = sen_lex_done;
153         return SEN_SYM_NIL;
154       }
155       r = p + cl;
156       {
157         int blankp = 0;
158         for (len = 1, q = r; len < SEN_LEX_NGRAM_UNIT_SIZE; len++) {
159           if (cp) {
160             if (SEN_NSTR_ISBLANK(*cp)) { blankp++; break; }
161             cp++;
162           }
163           if (!(cl = sen_str_charlen((char *)q, lex->encoding)) ||
164               (lex->uni_alpha && SEN_NSTR_CTYPE(*cp) == sen_str_alpha) ||
165               (lex->uni_digit && SEN_NSTR_CTYPE(*cp) == sen_str_digit) ||
166               (lex->uni_symbol && SEN_NSTR_CTYPE(*cp) == sen_str_symbol)) {
167             break;
168           }
169           q += cl;
170         }
171         if (blankp && !(lex->flags & SEN_LEX_UPD)) { continue; }
172       }
173       if ((!cl || !*q) && !(lex->flags & SEN_LEX_UPD)) { lex->status = sen_lex_done; }
174       if (len < SEN_LEX_NGRAM_UNIT_SIZE) { lex->force_prefix = 1; }
175       {
176         size_t blen = q - p;
177         if (!blen) {
178           lex->status = sen_lex_done;
179           return SEN_SYM_NIL;
180         }
181         LEX_TOKEN(lex, p, blen);
182         tid = (lex->flags & SEN_LEX_ADD) ? sen_sym_get(sym, lex->token) : sen_sym_at(sym, lex->token);
183         lex->skip = 1;
184       }
185     }
186     lex->pos = pos;
187     lex->len = len;
188     lex->tail = pos + len - 1;
189     lex->next = r;
190     // printf("tid=%d pos=%d tail=%d (%s) %s\n", tid, lex->pos, lex->tail, _sen_sym_key(sym, tid), r);
191     // printf("tid=%d pos=%d tail=%d (%s)\n", tid, lex->pos, lex->tail, _sen_sym_key(sym, tid));
192     if (!tid) {
193       lex->status = sen_lex_not_found;
194     } else {
195       if (!*r) { lex->status = sen_lex_done; }
196     }
197     return tid;
198   }
199   lex->status = sen_lex_done;
200   return SEN_SYM_NIL;
201 }
202 
203 /* mecab */
204 
205 #ifndef NO_MECAB
206 
207 static mecab_t *sole_mecab;
208 static sen_mutex sole_mecab_lock;
209 
210 static char *sen_lex_default_mecab_argv[] = {"", "-Owakati"};
211 
212 static int sen_lex_mecab_argc = 2;
213 static char **sen_lex_mecab_argv = sen_lex_default_mecab_argv;
214 
215 #define SOLE_MECAB_CONFIRM do {\
216   if (!sole_mecab) {\
217     MUTEX_LOCK(sole_mecab_lock);\
218     if (!sole_mecab) { sole_mecab = mecab_new(sen_lex_mecab_argc, sen_lex_mecab_argv); }\
219     MUTEX_UNLOCK(sole_mecab_lock);\
220   }\
221 } while(0)
222 
223 inline static sen_lex *
sen_mecab_open(sen_sym * sym,sen_nstr * nstr,uint8_t flags)224 sen_mecab_open(sen_sym *sym, sen_nstr *nstr, uint8_t flags)
225 {
226   unsigned int bufsize, maxtrial = 10, len;
227   char *buf, *s, *p;
228   char mecab_err[256];
229   sen_lex *lex;
230   sen_ctx *ctx = nstr->ctx;
231   if (!(lex = SEN_MALLOC(sizeof(sen_lex)))) { return NULL; }
232   lex->sym = sym;
233   // sen_log("(%s)", str);
234   SOLE_MECAB_CONFIRM;
235   if (!sole_mecab) {
236     SEN_LOG(sen_log_alert, "mecab_new failed on sen_mecab_open");
237     return NULL;
238   }
239   lex->mecab = sole_mecab;
240   lex->buf = NULL;
241   lex->token = NULL;
242   lex->tlen = 0;
243   // if (!(lex->mecab = mecab_new3())) {
244   lex->pos = -1;
245   lex->offset = 0;
246   lex->len = 0;
247   lex->flags = flags;
248   lex->status = sen_lex_doing;
249   lex->encoding = sym->encoding;
250   lex->nstr = nstr;
251   len = nstr->norm_blen;
252   mecab_err[sizeof(mecab_err) - 1] = '\0';
253   for (bufsize = len * 2 + 1; maxtrial; bufsize *= 2, maxtrial--) {
254     if(!(buf = SEN_MALLOC(bufsize + 1))) {
255       SEN_LOG(sen_log_alert, "buffer allocation on sen_mecab_open failed !");
256       SEN_FREE(lex);
257       return NULL;
258     }
259     MUTEX_LOCK(sole_mecab_lock);
260     s = mecab_sparse_tostr3(lex->mecab, (char *)nstr->norm, len, buf, bufsize);
261     if (!s) {
262       strncpy(mecab_err, mecab_strerror(lex->mecab), sizeof(mecab_err) - 1);
263     }
264     MUTEX_UNLOCK(sole_mecab_lock);
265     if (s) { break; }
266     SEN_FREE(buf);
267     if (strstr(mecab_err, "output buffer overflow") == NULL) {
268       break;
269     }
270   }
271   if (!s) {
272     SEN_LOG(sen_log_alert, "mecab_sparse_tostr failed len=%d bufsize=%d err=%s", len, bufsize, mecab_err);
273     sen_lex_close(lex);
274     return NULL;
275   }
276   // certain version of mecab returns trailing lf or spaces.
277   for (p = buf + strlen(buf) - 1; buf <= p && (*p == '\n' || isspace(*(unsigned char *)p)); p--) { *p = '\0'; }
278   //sen_log("sparsed='%s'", s);
279   lex->orig = (unsigned char *)nstr->norm;
280   lex->buf = (unsigned char *)buf;
281   lex->next = (unsigned char *)buf;
282   lex->force_prefix = 0;
283   return lex;
284 }
285 
286 inline static sen_id
sen_mecab_next(sen_lex * lex)287 sen_mecab_next(sen_lex *lex)
288 {
289   sen_id tid;
290   sen_sym *sym = lex->sym;
291   sen_ctx *ctx = lex->nstr->ctx;
292   uint32_t size;
293   int32_t len, offset = lex->offset + lex->len;
294   const unsigned char *p;
295   if (lex->status == sen_lex_done) { return SEN_SYM_NIL; }
296   for (p = lex->next, len = 0;;) {
297     size_t cl;
298     if (!(cl = sen_str_charlen((char *)p, lex->encoding)) ||
299         sen_isspace(p, lex->encoding)) {
300       break;
301     }
302     p += cl;
303     len++;
304   }
305   if (!len) {
306     lex->status = sen_lex_done;
307     return SEN_SYM_NIL;
308   }
309   size = (uint32_t)(p - lex->next);
310   LEX_TOKEN(lex, lex->next, size);
311   tid = (lex->flags & SEN_LEX_ADD) ? sen_sym_get(sym, lex->token) : sen_sym_at(sym, lex->token);
312   {
313     int cl;
314     while ((cl = sen_isspace(p, lex->encoding))) { p += cl; }
315     lex->next = p;
316     lex->offset = offset;
317     lex->len = len;
318   }
319   if (tid == SEN_SYM_NIL) {
320     lex->status = sen_lex_not_found;
321   } else {
322     if (!*p) { lex->status = sen_lex_done; }
323   }
324   lex->pos++;
325   return tid;
326 }
327 
328 sen_rc
sen_lex_set_mecab_args(int argc,char ** argv)329 sen_lex_set_mecab_args(int argc, char **argv)
330 {
331   sen_lex_mecab_argc = argc;
332   sen_lex_mecab_argv = argv;
333   if (sole_mecab) {
334     SEN_LOG(sen_log_alert, "mecab already initialized");
335     return sen_invalid_argument;
336   }
337   SOLE_MECAB_CONFIRM;
338   return sen_success;
339 }
340 
341 #endif /* NO_MECAB */
342 
343 /* delimited */
344 
345 inline static sen_lex *
sen_delimited_open(sen_sym * sym,sen_nstr * nstr,uint8_t flags)346 sen_delimited_open(sen_sym *sym, sen_nstr *nstr, uint8_t flags)
347 {
348   int cl;
349   sen_lex *lex;
350   sen_ctx *ctx = nstr->ctx;
351   const char *p;
352   if (!(lex = SEN_MALLOC(sizeof(sen_lex)))) { return NULL; }
353   lex->sym = sym;
354 #ifndef NO_MECAB
355   lex->mecab = NULL;
356 #endif /* NO_MECAB */
357   lex->buf = NULL;
358   lex->token = NULL;
359   lex->tlen = 0;
360   lex->pos = -1;
361   lex->skip = 1;
362   lex->tail = 0;
363   lex->flags = flags;
364   lex->status = sen_lex_doing;
365   lex->encoding = sym->encoding;
366   lex->nstr = nstr;
367   p = nstr->norm;
368   lex->orig = (unsigned char *)p;
369   while ((cl = sen_isspace(p, lex->encoding))) { p += cl; }
370   lex->next = (unsigned char *)p;
371   lex->offset = 0;
372   lex->len = 0;
373   if (!*p) { lex->status = sen_lex_done; }
374   lex->force_prefix = 0;
375   return lex;
376 }
377 
378 inline static sen_id
sen_delimited_next(sen_lex * lex)379 sen_delimited_next(sen_lex *lex)
380 {
381   sen_id tid;
382   sen_sym *sym = lex->sym;
383   sen_ctx *ctx = lex->nstr->ctx;
384   uint32_t size;
385   int32_t len, offset = lex->offset + lex->len;
386   const unsigned char *p;
387   if (lex->status == sen_lex_done) { return SEN_SYM_NIL; }
388   for (p = lex->next, len = 0;;) {
389     size_t cl;
390     if (!(cl = sen_str_charlen((char *)p, lex->encoding)) ||
391         sen_isspace(p, lex->encoding)) {
392       break;
393     }
394     p += cl;
395     len++;
396   }
397   if (!len) {
398     lex->status = sen_lex_done;
399     return SEN_SYM_NIL;
400   }
401   size = (uint32_t)(p - lex->next);
402   LEX_TOKEN(lex, lex->next, size);
403   tid = (lex->flags & SEN_LEX_ADD) ? sen_sym_get(sym, lex->token) : sen_sym_at(sym, lex->token);
404   {
405     int cl;
406     while ((cl = sen_isspace(p, lex->encoding))) { p += cl; }
407     lex->next = p;
408     lex->offset = offset;
409     lex->len = len;
410   }
411   if (tid == SEN_SYM_NIL) {
412     lex->status = sen_lex_not_found;
413   } else {
414     if (!*p) { lex->status = sen_lex_done; }
415   }
416   lex->pos++;
417   return tid;
418 }
419 
420 /* external */
421 
422 sen_rc
sen_lex_init(void)423 sen_lex_init(void)
424 {
425 #ifndef NO_MECAB
426   // char *arg[] = {"", "-Owakati"};
427   // return mecab_load_dictionary(2, arg) ? sen_success : sen_external_error;
428   sole_mecab = NULL;
429   MUTEX_INIT(sole_mecab_lock);
430 #endif /* NO_MECAB */
431   return sen_success;
432 }
433 
434 sen_rc
sen_lex_fin(void)435 sen_lex_fin(void)
436 {
437 #ifndef NO_MECAB
438   if (sole_mecab) {
439     mecab_destroy(sole_mecab);
440     sole_mecab = NULL;
441   }
442   MUTEX_DESTROY(sole_mecab_lock);
443 #endif /* NO_MECAB */
444   return sen_success;
445 }
446 
447 sen_lex *
sen_lex_open(sen_sym * sym,const char * str,size_t str_len,uint8_t flags)448 sen_lex_open(sen_sym *sym, const char *str, size_t str_len, uint8_t flags)
449 {
450   sen_nstr *nstr;
451   int nflag, type;
452   if (!sym) {
453     SEN_LOG(sen_log_warning, "sym is null at sen_lex_open");
454     return NULL;
455   }
456   type = sym->flags & SEN_INDEX_TOKENIZER_MASK;
457   nflag = (type == SEN_INDEX_NGRAM ? SEN_STR_REMOVEBLANK|SEN_STR_WITH_CTYPES : 0);
458   if (sym->flags & SEN_INDEX_NORMALIZE) {
459     if (!(nstr = sen_nstr_open(str, str_len, sym->encoding, nflag))) {
460       SEN_LOG(sen_log_alert, "sen_nstr_open failed at sen_lex_open");
461       return NULL;
462     }
463   } else {
464     if (!(nstr = sen_fakenstr_open(str, str_len, sym->encoding, nflag))) {
465       SEN_LOG(sen_log_alert, "sen_fakenstr_open failed at sen_lex_open");
466       return NULL;
467     }
468   }
469   switch (type) {
470   case SEN_INDEX_MORPH_ANALYSE :
471 #ifdef NO_MECAB
472     return NULL;
473 #else /* NO_MECAB */
474     return sen_mecab_open(sym, nstr, flags);
475 #endif /* NO_MECAB */
476   case SEN_INDEX_NGRAM :
477     return sen_ngram_open(sym, nstr, flags);
478   case SEN_INDEX_DELIMITED :
479     return sen_delimited_open(sym, nstr, flags);
480   default :
481     return NULL;
482   }
483 }
484 
485 sen_rc
sen_lex_next(sen_lex * lex)486 sen_lex_next(sen_lex *lex)
487 {
488   /* if (!lex) { return sen_invalid_argument; } */
489   switch ((lex->sym->flags & SEN_INDEX_TOKENIZER_MASK)) {
490   case SEN_INDEX_MORPH_ANALYSE :
491 #ifdef NO_MECAB
492     return sen_invalid_argument;
493 #else /* NO_MECAB */
494     return sen_mecab_next(lex);
495 #endif /* NO_MECAB */
496   case SEN_INDEX_NGRAM :
497     return sen_ngram_next(lex);
498   case SEN_INDEX_DELIMITED :
499     return sen_delimited_next(lex);
500   default :
501     return sen_invalid_argument;
502   }
503 }
504 
505 sen_rc
sen_lex_close(sen_lex * lex)506 sen_lex_close(sen_lex *lex)
507 {
508   if (lex) {
509     sen_ctx *ctx = lex->nstr->ctx;
510     if (lex->nstr) { sen_nstr_close(lex->nstr); }
511     // if (lex->mecab) { mecab_destroy(lex->mecab); }
512     if (lex->buf) { SEN_FREE(lex->buf); }
513     if (lex->token) { SEN_REALLOC(lex->token, 0); }
514     SEN_FREE(lex);
515     return sen_success;
516   } else {
517     return sen_invalid_argument;
518   }
519 }
520 
521 sen_rc
sen_lex_validate(sen_sym * sym)522 sen_lex_validate(sen_sym *sym)
523 {
524   if (!sym) {
525     SEN_LOG(sen_log_warning, "sym is null on sen_lex_validate");
526     return sen_invalid_argument;
527   }
528 #ifndef NO_MECAB
529 #ifdef USE_MECAB_DICINFO
530   if ((sym->flags & SEN_INDEX_TOKENIZER_MASK) == SEN_INDEX_MORPH_ANALYSE) {
531     sen_encoding enc;
532     const mecab_dictionary_info_t *di;
533 
534     SOLE_MECAB_CONFIRM;
535     if (!sole_mecab) {
536       SEN_LOG(sen_log_alert, "mecab_new failed on sen_lex_validate");
537       return sen_external_error;
538     }
539     di = mecab_dictionary_info(sole_mecab);
540     if (!di || !di->charset) {
541       SEN_LOG(sen_log_alert, "mecab_dictionary_info failed on sen_lex_validate");
542       return sen_external_error;
543     }
544     switch (di->charset[0]) {
545       case 'u':
546         enc = sen_enc_utf8;
547         break;
548       case 'e':
549         enc = sen_enc_euc_jp;
550         break;
551       case 'c': /* cp932 */
552       case 's':
553         enc = sen_enc_sjis;
554         break;
555       default:
556         SEN_LOG(sen_log_alert, "unknown encoding %s on sen_lex_validate", di->charset);
557         return sen_external_error;
558     }
559     if (enc != sym->encoding) {
560       SEN_LOG(sen_log_alert,
561               "dictionary encoding %s is differ from sym encoding %s",
562               di->charset, sen_enctostr(sym->encoding));
563       return sen_abnormal_error;
564     }
565   }
566 #endif /* USE_MECAB_DICINFO */
567 #endif /* NO_MECAB */
568   return sen_success;
569 }
570