1 /* Copyright(C) 2004 Brazil
2
3 This library is free software; you can redistribute it and/or
4 modify it under the terms of the GNU Lesser General Public
5 License as published by the Free Software Foundation; either
6 version 2.1 of the License, or (at your option) any later version.
7
8 This library is distributed in the hope that it will be useful,
9 but WITHOUT ANY WARRANTY; without even the implied warranty of
10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 Lesser General Public License for more details.
12
13 You should have received a copy of the GNU Lesser General Public
14 License along with this library; if not, write to the Free Software
15 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
16 */
17 #include "senna_in.h"
18 #include <string.h>
19 #include <ctype.h>
20 #include "lex.h"
21
22 /* ngram */
23
24 inline static sen_lex *
sen_ngram_open(sen_sym * sym,sen_nstr * nstr,uint8_t flags)25 sen_ngram_open(sen_sym *sym, sen_nstr *nstr, uint8_t flags)
26 {
27 sen_lex *lex;
28 sen_ctx *ctx = nstr->ctx;
29 if (!(lex = SEN_MALLOC(sizeof(sen_lex)))) { return NULL; }
30 lex->sym = sym;
31 #ifndef NO_MECAB
32 lex->mecab = NULL;
33 #endif /* NO_MECAB */
34 lex->buf = NULL;
35 lex->token = NULL;
36 lex->tlen = 0;
37 lex->pos = -1;
38 lex->skip = 1;
39 lex->tail = 0;
40 lex->flags = flags;
41 lex->status = sen_lex_doing;
42 lex->encoding = sym->encoding;
43 lex->nstr = nstr;
44 lex->orig = (unsigned char *)nstr->norm;
45 lex->next = (unsigned char *)nstr->norm;
46 lex->uni_alpha = (nstr->ctypes && !(lex->sym->flags & SEN_INDEX_SPLIT_ALPHA));
47 lex->uni_digit = (nstr->ctypes && !(lex->sym->flags & SEN_INDEX_SPLIT_DIGIT));
48 lex->uni_symbol = (nstr->ctypes && !(lex->sym->flags & SEN_INDEX_SPLIT_SYMBOL));
49 lex->force_prefix = 0;
50 return lex;
51 }
52
/*
 * LEX_TOKEN(lex, str, len): copy `len` bytes starting at `str` into
 * (lex)->token and NUL-terminate the copy.
 *
 * The token buffer is grown with SEN_REALLOC when `len` exceeds (lex)->tlen;
 * tlen records the capacity without the terminator.  If the reallocation
 * fails, the lexer status becomes sen_lex_done and the macro executes
 * `return SEN_SYM_NIL` in the ENCLOSING function, so it may only be used
 * inside functions returning sen_id.
 *
 * The arguments are evaluated more than once: pass side-effect-free
 * expressions only.
 */
#define LEX_TOKEN(lex,str,len) do {\
  if ((lex)->tlen < (len)) {\
    char *lex_token_buf_ = SEN_REALLOC((lex)->token, (len) + 1);\
    if (!lex_token_buf_) { (lex)->status = sen_lex_done; return SEN_SYM_NIL; }\
    (lex)->token = lex_token_buf_;\
    (lex)->tlen = (len);\
  }\
  memcpy((lex)->token, (str), (len));\
  (lex)->token[(len)] = '\0';\
} while (0)
63
64 inline static sen_id
sen_ngram_next(sen_lex * lex)65 sen_ngram_next(sen_lex *lex)
66 {
67 sen_id tid;
68 sen_sym *sym = lex->sym;
69 sen_ctx *ctx = lex->nstr->ctx;
70 uint_least8_t *cp = NULL;
71 int32_t len = 0, pos;
72 const unsigned char *p, *q, *r;
73 if (lex->status == sen_lex_done) { return SEN_SYM_NIL; }
74 lex->force_prefix = 0;
75 for (p = lex->next, pos = lex->pos + lex->skip; *p; p = r, pos++) {
76 if (lex->nstr->ctypes) { cp = lex->nstr->ctypes + pos; }
77 if (lex->uni_alpha && SEN_NSTR_CTYPE(*cp) == sen_str_alpha) {
78 for (len = 1, r = p;;len++) {
79 size_t cl;
80 if (!(cl = sen_str_charlen((char *)r, lex->encoding))) { break; }
81 r += cl;
82 if (SEN_NSTR_ISBLANK(*cp)) { break; }
83 if (SEN_NSTR_CTYPE(*++cp) != sen_str_alpha) { break; }
84 }
85 {
86 size_t blen = r - p;
87 if (!blen) {
88 lex->status = sen_lex_done;
89 return SEN_SYM_NIL;
90 }
91 LEX_TOKEN(lex, p, blen);
92 tid = (lex->flags & SEN_LEX_ADD) ? sen_sym_get(sym, lex->token) : sen_sym_at(sym, lex->token);
93 lex->skip = len;
94 }
95 } else if (lex->uni_digit && SEN_NSTR_CTYPE(*cp) == sen_str_digit) {
96 for (len = 1, r = p;;len++) {
97 size_t cl;
98 if (!(cl = sen_str_charlen((char *)r, lex->encoding))) { break; }
99 r += cl;
100 if (SEN_NSTR_ISBLANK(*cp)) { break; }
101 if (SEN_NSTR_CTYPE(*++cp) != sen_str_digit) { break; }
102 }
103 {
104 size_t blen = r - p;
105 if (!blen) {
106 lex->status = sen_lex_done;
107 return SEN_SYM_NIL;
108 }
109 LEX_TOKEN(lex, p, blen);
110 tid = (lex->flags & SEN_LEX_ADD) ? sen_sym_get(sym, lex->token) : sen_sym_at(sym, lex->token);
111 lex->skip = len;
112 }
113 } else if (lex->uni_symbol && SEN_NSTR_CTYPE(*cp) == sen_str_symbol) {
114 for (len = 1, r = p;;len++) {
115 size_t cl;
116 if (!(cl = sen_str_charlen((char *)r, lex->encoding))) { break; }
117 r += cl;
118 if (SEN_NSTR_ISBLANK(*cp)) { break; }
119 if (SEN_NSTR_CTYPE(*++cp) != sen_str_symbol) { break; }
120 }
121 {
122 size_t blen = r - p;
123 if (!blen) {
124 lex->status = sen_lex_done;
125 return SEN_SYM_NIL;
126 }
127 LEX_TOKEN(lex, p, blen);
128 tid = (lex->flags & SEN_LEX_ADD) ? sen_sym_get(sym, lex->token) : sen_sym_at(sym, lex->token);
129 lex->skip = len;
130 }
131 } else {
132 size_t cl;
133 #ifdef PRE_DEFINED_UNSPLIT_WORDS
134 {
135 const unsigned char *key = NULL;
136 if ((tid = sen_sym_common_prefix_search(sym, p))) {
137 if (!(key = _sen_sym_key(sym, tid))) {
138 lex->status = sen_lex_not_found;
139 return SEN_SYM_NIL;
140 }
141 len = sen_str_len(key, lex->encoding, NULL);
142 }
143 r = p + sen_str_charlen(p, lex->encoding);
144 if (tid && (len > 1 || r == p)) {
145 if (r != p && pos + len - 1 <= lex->tail) { continue; }
146 p += strlen(key);
147 if (!*p && !(lex->flags & SEN_LEX_UPD)) { lex->status = sen_lex_done; }
148 }
149 }
150 #endif /* PRE_DEFINED_UNSPLIT_WORDS */
151 if (!(cl = sen_str_charlen((char *)p, lex->encoding))) {
152 lex->status = sen_lex_done;
153 return SEN_SYM_NIL;
154 }
155 r = p + cl;
156 {
157 int blankp = 0;
158 for (len = 1, q = r; len < SEN_LEX_NGRAM_UNIT_SIZE; len++) {
159 if (cp) {
160 if (SEN_NSTR_ISBLANK(*cp)) { blankp++; break; }
161 cp++;
162 }
163 if (!(cl = sen_str_charlen((char *)q, lex->encoding)) ||
164 (lex->uni_alpha && SEN_NSTR_CTYPE(*cp) == sen_str_alpha) ||
165 (lex->uni_digit && SEN_NSTR_CTYPE(*cp) == sen_str_digit) ||
166 (lex->uni_symbol && SEN_NSTR_CTYPE(*cp) == sen_str_symbol)) {
167 break;
168 }
169 q += cl;
170 }
171 if (blankp && !(lex->flags & SEN_LEX_UPD)) { continue; }
172 }
173 if ((!cl || !*q) && !(lex->flags & SEN_LEX_UPD)) { lex->status = sen_lex_done; }
174 if (len < SEN_LEX_NGRAM_UNIT_SIZE) { lex->force_prefix = 1; }
175 {
176 size_t blen = q - p;
177 if (!blen) {
178 lex->status = sen_lex_done;
179 return SEN_SYM_NIL;
180 }
181 LEX_TOKEN(lex, p, blen);
182 tid = (lex->flags & SEN_LEX_ADD) ? sen_sym_get(sym, lex->token) : sen_sym_at(sym, lex->token);
183 lex->skip = 1;
184 }
185 }
186 lex->pos = pos;
187 lex->len = len;
188 lex->tail = pos + len - 1;
189 lex->next = r;
190 // printf("tid=%d pos=%d tail=%d (%s) %s\n", tid, lex->pos, lex->tail, _sen_sym_key(sym, tid), r);
191 // printf("tid=%d pos=%d tail=%d (%s)\n", tid, lex->pos, lex->tail, _sen_sym_key(sym, tid));
192 if (!tid) {
193 lex->status = sen_lex_not_found;
194 } else {
195 if (!*r) { lex->status = sen_lex_done; }
196 }
197 return tid;
198 }
199 lex->status = sen_lex_done;
200 return SEN_SYM_NIL;
201 }
202
203 /* mecab */
204
205 #ifndef NO_MECAB
206
207 static mecab_t *sole_mecab;
208 static sen_mutex sole_mecab_lock;
209
210 static char *sen_lex_default_mecab_argv[] = {"", "-Owakati"};
211
212 static int sen_lex_mecab_argc = 2;
213 static char **sen_lex_mecab_argv = sen_lex_default_mecab_argv;
214
215 #define SOLE_MECAB_CONFIRM do {\
216 if (!sole_mecab) {\
217 MUTEX_LOCK(sole_mecab_lock);\
218 if (!sole_mecab) { sole_mecab = mecab_new(sen_lex_mecab_argc, sen_lex_mecab_argv); }\
219 MUTEX_UNLOCK(sole_mecab_lock);\
220 }\
221 } while(0)
222
223 inline static sen_lex *
sen_mecab_open(sen_sym * sym,sen_nstr * nstr,uint8_t flags)224 sen_mecab_open(sen_sym *sym, sen_nstr *nstr, uint8_t flags)
225 {
226 unsigned int bufsize, maxtrial = 10, len;
227 char *buf, *s, *p;
228 char mecab_err[256];
229 sen_lex *lex;
230 sen_ctx *ctx = nstr->ctx;
231 if (!(lex = SEN_MALLOC(sizeof(sen_lex)))) { return NULL; }
232 lex->sym = sym;
233 // sen_log("(%s)", str);
234 SOLE_MECAB_CONFIRM;
235 if (!sole_mecab) {
236 SEN_LOG(sen_log_alert, "mecab_new failed on sen_mecab_open");
237 return NULL;
238 }
239 lex->mecab = sole_mecab;
240 lex->buf = NULL;
241 lex->token = NULL;
242 lex->tlen = 0;
243 // if (!(lex->mecab = mecab_new3())) {
244 lex->pos = -1;
245 lex->offset = 0;
246 lex->len = 0;
247 lex->flags = flags;
248 lex->status = sen_lex_doing;
249 lex->encoding = sym->encoding;
250 lex->nstr = nstr;
251 len = nstr->norm_blen;
252 mecab_err[sizeof(mecab_err) - 1] = '\0';
253 for (bufsize = len * 2 + 1; maxtrial; bufsize *= 2, maxtrial--) {
254 if(!(buf = SEN_MALLOC(bufsize + 1))) {
255 SEN_LOG(sen_log_alert, "buffer allocation on sen_mecab_open failed !");
256 SEN_FREE(lex);
257 return NULL;
258 }
259 MUTEX_LOCK(sole_mecab_lock);
260 s = mecab_sparse_tostr3(lex->mecab, (char *)nstr->norm, len, buf, bufsize);
261 if (!s) {
262 strncpy(mecab_err, mecab_strerror(lex->mecab), sizeof(mecab_err) - 1);
263 }
264 MUTEX_UNLOCK(sole_mecab_lock);
265 if (s) { break; }
266 SEN_FREE(buf);
267 if (strstr(mecab_err, "output buffer overflow") == NULL) {
268 break;
269 }
270 }
271 if (!s) {
272 SEN_LOG(sen_log_alert, "mecab_sparse_tostr failed len=%d bufsize=%d err=%s", len, bufsize, mecab_err);
273 sen_lex_close(lex);
274 return NULL;
275 }
276 // certain version of mecab returns trailing lf or spaces.
277 for (p = buf + strlen(buf) - 1; buf <= p && (*p == '\n' || isspace(*(unsigned char *)p)); p--) { *p = '\0'; }
278 //sen_log("sparsed='%s'", s);
279 lex->orig = (unsigned char *)nstr->norm;
280 lex->buf = (unsigned char *)buf;
281 lex->next = (unsigned char *)buf;
282 lex->force_prefix = 0;
283 return lex;
284 }
285
286 inline static sen_id
sen_mecab_next(sen_lex * lex)287 sen_mecab_next(sen_lex *lex)
288 {
289 sen_id tid;
290 sen_sym *sym = lex->sym;
291 sen_ctx *ctx = lex->nstr->ctx;
292 uint32_t size;
293 int32_t len, offset = lex->offset + lex->len;
294 const unsigned char *p;
295 if (lex->status == sen_lex_done) { return SEN_SYM_NIL; }
296 for (p = lex->next, len = 0;;) {
297 size_t cl;
298 if (!(cl = sen_str_charlen((char *)p, lex->encoding)) ||
299 sen_isspace(p, lex->encoding)) {
300 break;
301 }
302 p += cl;
303 len++;
304 }
305 if (!len) {
306 lex->status = sen_lex_done;
307 return SEN_SYM_NIL;
308 }
309 size = (uint32_t)(p - lex->next);
310 LEX_TOKEN(lex, lex->next, size);
311 tid = (lex->flags & SEN_LEX_ADD) ? sen_sym_get(sym, lex->token) : sen_sym_at(sym, lex->token);
312 {
313 int cl;
314 while ((cl = sen_isspace(p, lex->encoding))) { p += cl; }
315 lex->next = p;
316 lex->offset = offset;
317 lex->len = len;
318 }
319 if (tid == SEN_SYM_NIL) {
320 lex->status = sen_lex_not_found;
321 } else {
322 if (!*p) { lex->status = sen_lex_done; }
323 }
324 lex->pos++;
325 return tid;
326 }
327
328 sen_rc
sen_lex_set_mecab_args(int argc,char ** argv)329 sen_lex_set_mecab_args(int argc, char **argv)
330 {
331 sen_lex_mecab_argc = argc;
332 sen_lex_mecab_argv = argv;
333 if (sole_mecab) {
334 SEN_LOG(sen_log_alert, "mecab already initialized");
335 return sen_invalid_argument;
336 }
337 SOLE_MECAB_CONFIRM;
338 return sen_success;
339 }
340
341 #endif /* NO_MECAB */
342
343 /* delimited */
344
345 inline static sen_lex *
sen_delimited_open(sen_sym * sym,sen_nstr * nstr,uint8_t flags)346 sen_delimited_open(sen_sym *sym, sen_nstr *nstr, uint8_t flags)
347 {
348 int cl;
349 sen_lex *lex;
350 sen_ctx *ctx = nstr->ctx;
351 const char *p;
352 if (!(lex = SEN_MALLOC(sizeof(sen_lex)))) { return NULL; }
353 lex->sym = sym;
354 #ifndef NO_MECAB
355 lex->mecab = NULL;
356 #endif /* NO_MECAB */
357 lex->buf = NULL;
358 lex->token = NULL;
359 lex->tlen = 0;
360 lex->pos = -1;
361 lex->skip = 1;
362 lex->tail = 0;
363 lex->flags = flags;
364 lex->status = sen_lex_doing;
365 lex->encoding = sym->encoding;
366 lex->nstr = nstr;
367 p = nstr->norm;
368 lex->orig = (unsigned char *)p;
369 while ((cl = sen_isspace(p, lex->encoding))) { p += cl; }
370 lex->next = (unsigned char *)p;
371 lex->offset = 0;
372 lex->len = 0;
373 if (!*p) { lex->status = sen_lex_done; }
374 lex->force_prefix = 0;
375 return lex;
376 }
377
378 inline static sen_id
sen_delimited_next(sen_lex * lex)379 sen_delimited_next(sen_lex *lex)
380 {
381 sen_id tid;
382 sen_sym *sym = lex->sym;
383 sen_ctx *ctx = lex->nstr->ctx;
384 uint32_t size;
385 int32_t len, offset = lex->offset + lex->len;
386 const unsigned char *p;
387 if (lex->status == sen_lex_done) { return SEN_SYM_NIL; }
388 for (p = lex->next, len = 0;;) {
389 size_t cl;
390 if (!(cl = sen_str_charlen((char *)p, lex->encoding)) ||
391 sen_isspace(p, lex->encoding)) {
392 break;
393 }
394 p += cl;
395 len++;
396 }
397 if (!len) {
398 lex->status = sen_lex_done;
399 return SEN_SYM_NIL;
400 }
401 size = (uint32_t)(p - lex->next);
402 LEX_TOKEN(lex, lex->next, size);
403 tid = (lex->flags & SEN_LEX_ADD) ? sen_sym_get(sym, lex->token) : sen_sym_at(sym, lex->token);
404 {
405 int cl;
406 while ((cl = sen_isspace(p, lex->encoding))) { p += cl; }
407 lex->next = p;
408 lex->offset = offset;
409 lex->len = len;
410 }
411 if (tid == SEN_SYM_NIL) {
412 lex->status = sen_lex_not_found;
413 } else {
414 if (!*p) { lex->status = sen_lex_done; }
415 }
416 lex->pos++;
417 return tid;
418 }
419
420 /* external */
421
422 sen_rc
sen_lex_init(void)423 sen_lex_init(void)
424 {
425 #ifndef NO_MECAB
426 // char *arg[] = {"", "-Owakati"};
427 // return mecab_load_dictionary(2, arg) ? sen_success : sen_external_error;
428 sole_mecab = NULL;
429 MUTEX_INIT(sole_mecab_lock);
430 #endif /* NO_MECAB */
431 return sen_success;
432 }
433
434 sen_rc
sen_lex_fin(void)435 sen_lex_fin(void)
436 {
437 #ifndef NO_MECAB
438 if (sole_mecab) {
439 mecab_destroy(sole_mecab);
440 sole_mecab = NULL;
441 }
442 MUTEX_DESTROY(sole_mecab_lock);
443 #endif /* NO_MECAB */
444 return sen_success;
445 }
446
447 sen_lex *
sen_lex_open(sen_sym * sym,const char * str,size_t str_len,uint8_t flags)448 sen_lex_open(sen_sym *sym, const char *str, size_t str_len, uint8_t flags)
449 {
450 sen_nstr *nstr;
451 int nflag, type;
452 if (!sym) {
453 SEN_LOG(sen_log_warning, "sym is null at sen_lex_open");
454 return NULL;
455 }
456 type = sym->flags & SEN_INDEX_TOKENIZER_MASK;
457 nflag = (type == SEN_INDEX_NGRAM ? SEN_STR_REMOVEBLANK|SEN_STR_WITH_CTYPES : 0);
458 if (sym->flags & SEN_INDEX_NORMALIZE) {
459 if (!(nstr = sen_nstr_open(str, str_len, sym->encoding, nflag))) {
460 SEN_LOG(sen_log_alert, "sen_nstr_open failed at sen_lex_open");
461 return NULL;
462 }
463 } else {
464 if (!(nstr = sen_fakenstr_open(str, str_len, sym->encoding, nflag))) {
465 SEN_LOG(sen_log_alert, "sen_fakenstr_open failed at sen_lex_open");
466 return NULL;
467 }
468 }
469 switch (type) {
470 case SEN_INDEX_MORPH_ANALYSE :
471 #ifdef NO_MECAB
472 return NULL;
473 #else /* NO_MECAB */
474 return sen_mecab_open(sym, nstr, flags);
475 #endif /* NO_MECAB */
476 case SEN_INDEX_NGRAM :
477 return sen_ngram_open(sym, nstr, flags);
478 case SEN_INDEX_DELIMITED :
479 return sen_delimited_open(sym, nstr, flags);
480 default :
481 return NULL;
482 }
483 }
484
485 sen_rc
sen_lex_next(sen_lex * lex)486 sen_lex_next(sen_lex *lex)
487 {
488 /* if (!lex) { return sen_invalid_argument; } */
489 switch ((lex->sym->flags & SEN_INDEX_TOKENIZER_MASK)) {
490 case SEN_INDEX_MORPH_ANALYSE :
491 #ifdef NO_MECAB
492 return sen_invalid_argument;
493 #else /* NO_MECAB */
494 return sen_mecab_next(lex);
495 #endif /* NO_MECAB */
496 case SEN_INDEX_NGRAM :
497 return sen_ngram_next(lex);
498 case SEN_INDEX_DELIMITED :
499 return sen_delimited_next(lex);
500 default :
501 return sen_invalid_argument;
502 }
503 }
504
505 sen_rc
sen_lex_close(sen_lex * lex)506 sen_lex_close(sen_lex *lex)
507 {
508 if (lex) {
509 sen_ctx *ctx = lex->nstr->ctx;
510 if (lex->nstr) { sen_nstr_close(lex->nstr); }
511 // if (lex->mecab) { mecab_destroy(lex->mecab); }
512 if (lex->buf) { SEN_FREE(lex->buf); }
513 if (lex->token) { SEN_REALLOC(lex->token, 0); }
514 SEN_FREE(lex);
515 return sen_success;
516 } else {
517 return sen_invalid_argument;
518 }
519 }
520
521 sen_rc
sen_lex_validate(sen_sym * sym)522 sen_lex_validate(sen_sym *sym)
523 {
524 if (!sym) {
525 SEN_LOG(sen_log_warning, "sym is null on sen_lex_validate");
526 return sen_invalid_argument;
527 }
528 #ifndef NO_MECAB
529 #ifdef USE_MECAB_DICINFO
530 if ((sym->flags & SEN_INDEX_TOKENIZER_MASK) == SEN_INDEX_MORPH_ANALYSE) {
531 sen_encoding enc;
532 const mecab_dictionary_info_t *di;
533
534 SOLE_MECAB_CONFIRM;
535 if (!sole_mecab) {
536 SEN_LOG(sen_log_alert, "mecab_new failed on sen_lex_validate");
537 return sen_external_error;
538 }
539 di = mecab_dictionary_info(sole_mecab);
540 if (!di || !di->charset) {
541 SEN_LOG(sen_log_alert, "mecab_dictionary_info failed on sen_lex_validate");
542 return sen_external_error;
543 }
544 switch (di->charset[0]) {
545 case 'u':
546 enc = sen_enc_utf8;
547 break;
548 case 'e':
549 enc = sen_enc_euc_jp;
550 break;
551 case 'c': /* cp932 */
552 case 's':
553 enc = sen_enc_sjis;
554 break;
555 default:
556 SEN_LOG(sen_log_alert, "unknown encoding %s on sen_lex_validate", di->charset);
557 return sen_external_error;
558 }
559 if (enc != sym->encoding) {
560 SEN_LOG(sen_log_alert,
561 "dictionary encoding %s is differ from sym encoding %s",
562 di->charset, sen_enctostr(sym->encoding));
563 return sen_abnormal_error;
564 }
565 }
566 #endif /* USE_MECAB_DICINFO */
567 #endif /* NO_MECAB */
568 return sen_success;
569 }
570