1 /*
2 * Anthyの辞書ライブラリの中心
3 *
4 * anthy_get_seq_ent_from_xstr()で辞書をひく
5 *
6 * Copyright (C) 2000-2007 TABATA Yusuke
7 * Copyright (C) 2005-2006 YOSHIDA Yuichi
8 *
9 */
10 /*
11 This library is free software; you can redistribute it and/or
12 modify it under the terms of the GNU Lesser General Public
13 License as published by the Free Software Foundation; either
14 version 2 of the License, or (at your option) any later version.
15
16 This library is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 Lesser General Public License for more details.
20
21 You should have received a copy of the GNU Lesser General Public
22 License along with this library; if not, write to the Free Software
23 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24 */
25 #include <stdlib.h>
26 #include <string.h>
27
28 #include <anthy/anthy.h>
29 #include <anthy/dic.h>
30 #include <anthy/conf.h>
31 #include <anthy/record.h>
32 #include <anthy/alloc.h>
33 #include <anthy/logger.h>
34 #include <anthy/xchar.h>
35 #include <anthy/feature_set.h>
36 #include <anthy/textdic.h>
37
38 #include <anthy/diclib.h>
39
40 #include "dic_ent.h"
41 #include "dic_personality.h"
42 #include "dic_main.h"
43
/* Reference count of anthy_init_dic() calls; decremented by anthy_quit_dic(). */
static int dic_init_count;

/* Dictionaries */
/* File dictionary shared by all personalities */
static struct word_dic *master_dic_file;

/* Dictionary of the currently active personality */
struct mem_dic *anthy_current_personal_dic_cache;/* cache */
/* Record of the current personality (assigned in anthy_dic_set_personality()) */
struct record_stat *anthy_current_record;
55
56 struct seq_ent *
anthy_validate_seq_ent(struct seq_ent * seq,xstr * xs,int is_reverse)57 anthy_validate_seq_ent(struct seq_ent *seq, xstr *xs, int is_reverse)
58 {
59 if (!seq) {
60 return NULL;
61 }
62 if (seq->nr_dic_ents == 0 && seq->nr_compound_ents == 0) {
63 /* 無効なエントリを作成したのでcacheから削除 */
64 anthy_mem_dic_release_seq_ent(anthy_current_personal_dic_cache,
65 xs, is_reverse);
66 return NULL;
67 }
68
69 return seq;
70 }
71
72 struct seq_ent *
anthy_cache_get_seq_ent(xstr * xs,int is_reverse)73 anthy_cache_get_seq_ent(xstr *xs, int is_reverse)
74 {
75 struct seq_ent *seq;
76
77 /* キャッシュ中に既にあればそれを返す */
78 seq = anthy_mem_dic_find_seq_ent_by_xstr(anthy_current_personal_dic_cache,
79 xs, is_reverse);
80 if (seq) {
81 return seq;
82 }
83
84 /* キャッシュ中に無いので確保 */
85 return anthy_mem_dic_alloc_seq_ent_by_xstr(anthy_current_personal_dic_cache,
86 xs, is_reverse);
87 }
88
89 int
anthy_dic_check_word_relation(int from,int to)90 anthy_dic_check_word_relation(int from, int to)
91 {
92 return anthy_word_dic_check_word_relation(master_dic_file, from, to);
93 }
94
95 static seq_ent_t
do_get_seq_ent_from_xstr(xstr * xs,int is_reverse)96 do_get_seq_ent_from_xstr(xstr *xs, int is_reverse)
97 {
98 struct seq_ent *seq;
99 /* キャッシュから取り出す */
100 seq = anthy_cache_get_seq_ent(xs, is_reverse);
101 seq = anthy_validate_seq_ent(seq, xs, is_reverse);
102 if (!seq) {
103 /* 数字などの辞書に無い文字列を検索する */
104 return anthy_get_ext_seq_ent_from_xstr(xs, is_reverse);
105 }
106 return seq;
107 }
108
109 static xstr *
convert_vu(xstr * xs)110 convert_vu(xstr *xs)
111 {
112 int i, v = 0;
113 int j;
114
115 /* 「ヴ」の出現を数える */
116 for (i = 0; i < xs->len; i++) {
117 if (xs->str[i] == KK_VU) {
118 v++;
119 }
120 }
121 if (v > 0) {
122 xstr *nx = malloc(sizeof(xstr));
123 nx->len = xs->len + v;
124 nx->str = malloc(sizeof(xchar)*nx->len);
125 j = 0;
126 /* 「ヴ」を「う゛」に変換しつつコピーする */
127 for (i = 0; i < xs->len; i++) {
128 if (xs->str[i] == KK_VU) {
129 nx->str[j] = HK_U;
130 j++;
131 nx->str[j] = HK_DDOT;
132 j++;
133 } else {
134 nx->str[j] = xs->str[i];
135 j++;
136 }
137 }
138 return nx;
139 }
140 return NULL;
141 }
142
143 seq_ent_t
anthy_get_seq_ent_from_xstr(xstr * xs,int is_reverse)144 anthy_get_seq_ent_from_xstr(xstr *xs, int is_reverse)
145 {
146 struct seq_ent *se;
147
148 if (!xs) {
149 return NULL;
150 }
151 if (!is_reverse) {
152 xstr *nx = convert_vu(xs);
153 /* 「ヴ」の混ざった順変換の場合、「う゛」に直して検索する
154 * 上位のレイヤーではユーザの与えた文字列をそのまま保持することが
155 * 期待されるので、変換はここで行なう。
156 */
157 if (nx) {
158 se = do_get_seq_ent_from_xstr(nx, 0);
159 anthy_free_xstr(nx);
160 return se;
161 }
162 }
163 /* 「ヴ」が出現しない、もしくは逆変換の場合 */
164 return do_get_seq_ent_from_xstr(xs, is_reverse);
165 }
166
167 static void
gang_elm_dtor(void * p)168 gang_elm_dtor(void *p)
169 {
170 struct gang_elm *ge = p;
171 free(ge->key);
172 }
173
174 static int
find_gang_elm(allocator ator,struct gang_elm * head,xstr * xs)175 find_gang_elm(allocator ator, struct gang_elm *head, xstr *xs)
176 {
177 char *str = anthy_xstr_to_cstr(xs, ANTHY_UTF8_ENCODING);
178 struct gang_elm *ge;
179 for (ge = head->tmp.next; ge; ge = ge->tmp.next) {
180 if (!strcmp(ge->key, str)) {
181 free(str);
182 return 0;
183 }
184 }
185 ge = anthy_smalloc(ator);
186 ge->xs = *xs;
187 ge->key = str;
188 ge->tmp.next = head->tmp.next;
189 head->tmp.next = ge;
190 return 1;
191 }
192
193 static int
gang_elm_compare_func(const void * p1,const void * p2)194 gang_elm_compare_func(const void *p1, const void *p2)
195 {
196 const struct gang_elm * const *s1 = p1;
197 const struct gang_elm * const *s2 = p2;
198 return strcmp((*s1)->key, (*s2)->key);
199 }
200
/* State handed to gang_scan() while a text dictionary is being scanned. */
struct gang_scan_context {
  /* Input: the lookup elements and how many there are. */
  int nr;
  struct gang_elm **array;
  /* Cursor: index into array of the element currently being compared. */
  int nth;
};
208
209 static int
is_ext_ent(struct seq_ent * seq)210 is_ext_ent(struct seq_ent *seq)
211 {
212 if (!seq->md) {
213 return 1;
214 }
215 return 0;
216 }
217
218 static void
scan_misc_dic(struct gang_elm ** array,int nr,int is_reverse)219 scan_misc_dic(struct gang_elm **array, int nr, int is_reverse)
220 {
221 int i;
222 for (i = 0; i < nr; i++) {
223 xstr *xs = &array[i]->xs;
224 struct seq_ent *seq;
225 seq = anthy_cache_get_seq_ent(xs, is_reverse);
226 /* 個人辞書からの取得(未知語辞書) */
227 if (seq) {
228 anthy_copy_words_from_private_dic(seq, xs, is_reverse);
229 anthy_validate_seq_ent(seq, xs, is_reverse);
230 }
231 }
232 }
233
234 static void
load_word(xstr * xs,const char * n,int is_reverse)235 load_word(xstr *xs, const char *n, int is_reverse)
236 {
237 struct seq_ent *seq = anthy_get_seq_ent_from_xstr(xs, 0);
238 xstr *word_xs;
239 wtype_t wt;
240 struct word_line wl;
241 if (!seq || is_ext_ent(seq)) {
242 seq = anthy_mem_dic_alloc_seq_ent_by_xstr(anthy_current_personal_dic_cache,
243 xs, is_reverse);
244 }
245 if (anthy_parse_word_line(n, &wl)) {
246 return ;
247 }
248 word_xs = anthy_cstr_to_xstr(wl.word, ANTHY_UTF8_ENCODING);
249 if (anthy_type_to_wtype(wl.wt, &wt)) {
250 anthy_mem_dic_push_back_dic_ent(seq, 0, word_xs, wt,
251 NULL, wl.freq, 0);
252 }
253
254 anthy_free_xstr(word_xs);
255 }
256
257 static int
gang_scan(void * p,long offset,const char * key,const char * n)258 gang_scan(void *p, long offset, const char *key, const char *n)
259 {
260 struct gang_scan_context *gsc = p;
261 struct gang_elm *elm;
262 int r;
263 (void)offset;
264 while (1) {
265 if (gsc->nth >= gsc->nr) {
266 return 0;
267 }
268 elm = gsc->array[gsc->nth];
269 r = strcmp(elm->key, key);
270 if (r == 0) {
271 /* find it */
272 load_word(&elm->xs, n, 0);
273 /* go next in dictionary */
274 return 0;
275 } else if (r > 0) {
276 /* go next in dictionary */
277 return 0;
278 } else {
279 /* go next in lookup */
280 gsc->nth ++;
281 }
282 }
283 return 0;
284 }
285
286 static void
scan_dict(const char * td,int nr,struct gang_elm ** array)287 scan_dict(const char *td, int nr, struct gang_elm **array)
288 {
289 struct gang_scan_context gsc;
290 gsc.nr = nr;
291 gsc.array = array;
292 gsc.nth = 0;
293 anthy_textdic_scan(td, 0, &gsc, gang_scan);
294 }
295
/* Argument bundle passed through anthy_ask_scan() to request_scan(). */
struct scan_arg {
  struct gang_elm **array; /* lookup elements */
  int nr;                  /* number of elements in array */
};
300
301 static void
request_scan(const char * tdname,void * arg)302 request_scan(const char *tdname, void *arg)
303 {
304 struct scan_arg *sarg = (struct scan_arg *)arg;
305 scan_dict(tdname, sarg->nr, sarg->array);
306 }
307
308 static void
do_gang_load_dic(xstr * sentence,int is_reverse)309 do_gang_load_dic(xstr *sentence, int is_reverse)
310 {
311 allocator ator = anthy_create_allocator(sizeof(struct gang_elm),
312 gang_elm_dtor);
313 int from, len;
314 xstr xs;
315 int i, nr;
316 struct gang_elm head;
317 struct gang_elm **array, *cur;
318 struct scan_arg sarg;
319 head.tmp.next = NULL;
320 nr = 0;
321 for (from = 0; from < sentence->len ; from ++) {
322 for (len = 1; len < 32 && from + len <= sentence->len; len ++) {
323 xs.str = &sentence->str[from];
324 xs.len = len;
325 nr += find_gang_elm(ator, &head, &xs);
326 }
327 }
328 array = malloc(sizeof(struct gang_elm *) * nr);
329 cur = head.tmp.next;
330 for (i = 0; i < nr; i++) {
331 array[i] = cur;
332 cur = cur->tmp.next;
333 }
334 qsort(array, nr, sizeof(struct gang_elm *), gang_elm_compare_func);
335 /**/
336 anthy_gang_fill_seq_ent(master_dic_file, array, nr, is_reverse);
337 /**/
338 scan_misc_dic(array, nr, is_reverse);
339 /* 個人辞書から読む */
340 sarg.nr = nr;
341 sarg.array = array;
342 anthy_ask_scan(request_scan, (void *)&sarg);
343 /**/
344 free(array);
345 anthy_free_allocator(ator);
346 }
347
348 void
anthy_gang_load_dic(xstr * sentence,int is_reverse)349 anthy_gang_load_dic(xstr *sentence, int is_reverse)
350 {
351 xstr *nx;
352 if (!is_reverse && (nx = convert_vu(sentence))) {
353 do_gang_load_dic(nx, is_reverse);
354 anthy_free_xstr(nx);
355 } else {
356 do_gang_load_dic(sentence, is_reverse);
357 }
358 }
359
/*
 * Above: obtaining a seq_ent
 ************************
 * Below: querying the information held in a seq_ent
 */
365 int
anthy_get_nr_dic_ents(seq_ent_t se,xstr * xs)366 anthy_get_nr_dic_ents(seq_ent_t se, xstr *xs)
367 {
368 if (!se) {
369 return 0;
370 }
371 if (!xs) {
372 return se->nr_dic_ents;
373 }
374 return se->nr_dic_ents + anthy_get_nr_dic_ents_of_ext_ent(se, xs);
375 }
376
377 int
anthy_get_nth_dic_ent_str(seq_ent_t se,xstr * orig,int n,xstr * x)378 anthy_get_nth_dic_ent_str(seq_ent_t se, xstr *orig,
379 int n, xstr *x)
380 {
381 if (!se || (n < 0)) { /* INDEPPAIR学習による交換先が見つからなかった時に不正なメモリアクセスをするバグの修正(通称「いちおく」の件) */
382 x->str = NULL; /* 不正なメモリアクセスやメモリの多重解放をするバグの修正 */
383 x->len = 0;
384 return -1;
385 }
386 if (n >= se->nr_dic_ents) {
387 return anthy_get_nth_dic_ent_str_of_ext_ent(se, orig,
388 n - se->nr_dic_ents, x);
389 }
390 x->len = se->dic_ents[n]->str.len;
391 x->str = anthy_xstr_dup_str(&se->dic_ents[n]->str);
392 return 0;
393 }
394
395 int
anthy_get_nth_dic_ent_is_compound(seq_ent_t se,int nth)396 anthy_get_nth_dic_ent_is_compound(seq_ent_t se, int nth)
397 {
398 if (!se || nth >= se->nr_dic_ents)
399 return 0;
400
401 return se->dic_ents[nth]->is_compound;
402 }
403
404 #define MAGIC_FREQ 100
405 int
anthy_get_nth_dic_ent_freq(seq_ent_t se,int nth)406 anthy_get_nth_dic_ent_freq(seq_ent_t se, int nth)
407 {
408 if (!se)
409 return 0;
410 else if (!se->dic_ents || nth >= se->nr_dic_ents)
411 return MAGIC_FREQ;
412
413 return se->dic_ents[nth]->freq;
414 }
415
416 int
anthy_get_nth_dic_ent_wtype(seq_ent_t se,xstr * xs,int n,wtype_t * w)417 anthy_get_nth_dic_ent_wtype (seq_ent_t se, xstr *xs, int n, wtype_t *w)
418 {
419 if (!se) {
420 *w = anthy_wt_none;
421 return -1;
422 }
423
424 if (n >= se->nr_dic_ents)
425 return anthy_get_nth_dic_ent_wtype_of_ext_ent(xs, w);
426
427 *w = se->dic_ents[n]->type;
428 return 0;
429 }
430
431 int
anthy_get_seq_ent_pos(seq_ent_t se,int pos)432 anthy_get_seq_ent_pos(seq_ent_t se, int pos)
433 {
434 int i, v=0;
435 if (!se) {
436 return 0;
437 }
438 if (se->nr_dic_ents == 0) {
439 return anthy_get_ext_seq_ent_pos(se, pos);
440 }
441 for (i = 0; i < se->nr_dic_ents; i++) {
442 if (anthy_wtype_get_pos(se->dic_ents[i]->type) == pos) {
443 v += se->dic_ents[i]->freq;
444 if (v == 0) {
445 v = 1;
446 }
447 }
448 }
449 return v;
450 }
451
452 /*
453 * wtの品詞を持つ単語の中で最大の頻度を持つものを返す
454 */
455 int
anthy_get_seq_ent_wtype_freq(seq_ent_t seq,wtype_t wt)456 anthy_get_seq_ent_wtype_freq (seq_ent_t seq, wtype_t wt)
457 {
458 int i, f;
459
460 if (!seq) {
461 return 0;
462 }
463 /**/
464 if (seq->nr_dic_ents == 0) {
465 return anthy_get_ext_seq_ent_wtype(seq, wt);
466 }
467
468 f = 0;
469 /* 単語 */
470 for (i = 0; i < seq->nr_dic_ents; i++) {
471 if (seq->dic_ents[i]->order == 0 &&
472 anthy_wtype_equal (wt, seq->dic_ents[i]->type)) {
473 if (f < seq->dic_ents[i]->freq) {
474 f = seq->dic_ents[i]->freq;
475 }
476 }
477 }
478 return f;
479 }
480
481 /*
482 * wtの品詞を持つ複合語の中で最大の頻度を持つものを返す
483 */
484 int
anthy_get_seq_ent_wtype_compound_freq(seq_ent_t se,wtype_t wt)485 anthy_get_seq_ent_wtype_compound_freq(seq_ent_t se, wtype_t wt)
486 {
487 int i,f;
488 if (!se) {
489 return 0;
490 }
491 /**/
492 f = 0;
493 for (i = 0; i < se->nr_dic_ents; i++) {
494 if (!anthy_get_nth_dic_ent_is_compound(se, i)) {
495 continue;
496 }
497 if (anthy_wtype_equal (wt, se->dic_ents[i]->type)) {
498 if (f < se->dic_ents[i]->freq) {
499 f = se->dic_ents[i]->freq;
500 }
501 }
502 }
503 return f;
504 }
505
506 int
anthy_get_seq_ent_indep(seq_ent_t se)507 anthy_get_seq_ent_indep(seq_ent_t se)
508 {
509 int i;
510 if (!se) {
511 return 0;
512 }
513 if (se->nr_dic_ents == 0) {
514 return anthy_get_ext_seq_ent_indep(se);
515 }
516 for (i = 0; i < se->nr_dic_ents; i++) {
517 if (anthy_wtype_get_indep(se->dic_ents[i]->type)) {
518 return 1;
519 }
520 }
521 return 0;
522 }
523
524 int
anthy_has_compound_ents(seq_ent_t se)525 anthy_has_compound_ents(seq_ent_t se)
526 {
527 if (!se) {
528 return 0;
529 }
530 return se->nr_compound_ents;
531 }
532
533 /* compundでない候補を持っているか */
534 int
anthy_has_non_compound_ents(seq_ent_t se)535 anthy_has_non_compound_ents(seq_ent_t se)
536 {
537 if (!se) {
538 return 0;
539 }
540 if (se->nr_dic_ents == 0) {
541 return 1;
542 }
543 return se->nr_dic_ents - se->nr_compound_ents;
544 }
545
546 compound_ent_t
anthy_get_nth_compound_ent(seq_ent_t se,int nth)547 anthy_get_nth_compound_ent(seq_ent_t se, int nth)
548 {
549 if (!se) {
550 return NULL;
551 }
552 if (nth >= 0 && nth < se->nr_dic_ents) {
553 return se->dic_ents[nth];
554 }
555 return NULL;
556 }
557
/* One element of a compound entry, as extracted by get_nth_elm_compound(). */
struct elm_compound {
  int len;  /* value decoded by get_element_len() from the element's length character */
  xstr str; /* element text; points into the compound entry's string, not a copy */
};
562
563 /* 要素に対応する読みの長さを返す */
564 static int
get_element_len(xchar xc)565 get_element_len(xchar xc)
566 {
567 if (xc > '0' && xc <= '9') {
568 return xc - '0';
569 }
570 if (xc >= 'a' && xc <= 'z') {
571 return xc - 'a' + 10;
572 }
573 return 0;
574 }
575
576 static struct elm_compound *
get_nth_elm_compound(compound_ent_t ce,struct elm_compound * elm,int nth)577 get_nth_elm_compound(compound_ent_t ce, struct elm_compound *elm, int nth)
578 {
579 int off = 0;
580 int i, j;
581 for (i = 0; i <= nth; i++) {
582 /* nth番目の要素の先頭へ移動する */
583 while (!(ce->str.str[off] == '_' &&
584 get_element_len(ce->str.str[off+1]) > 0)) {
585 off ++;
586 if (off + 1 >= ce->str.len) {
587 return NULL;
588 }
589 }
590 /* 構造体へ情報を取り込む */
591 elm->len = get_element_len(ce->str.str[off+1]);
592 elm->str.str = &ce->str.str[off+2];
593 elm->str.len = ce->str.len - off - 2;
594 for (j = 0; j < elm->str.len; j++) {
595 if (elm->str.str[j] == '_') {
596 elm->str.len = j;
597 break;
598 }
599 }
600 off ++;
601 }
602 return elm;
603 }
604
605 int
anthy_compound_get_nr_segments(compound_ent_t ce)606 anthy_compound_get_nr_segments(compound_ent_t ce)
607 {
608 struct elm_compound elm;
609 int i;
610 if (!ce) {
611 return 0;
612 }
613 for (i = 0; get_nth_elm_compound(ce, &elm, i); i++);
614 return i;
615 }
616
617 int
anthy_compound_get_nth_segment_len(compound_ent_t ce,int nth)618 anthy_compound_get_nth_segment_len(compound_ent_t ce, int nth)
619 {
620 struct elm_compound elm;
621 if (get_nth_elm_compound(ce, &elm, nth)) {
622 return elm.len;
623 }
624 return 0;
625 }
626
627 int
anthy_compound_get_nth_segment_xstr(compound_ent_t ce,int nth,xstr * xs)628 anthy_compound_get_nth_segment_xstr(compound_ent_t ce, int nth, xstr *xs)
629 {
630 struct elm_compound elm;
631 if (get_nth_elm_compound(ce, &elm, nth)) {
632 if (xs) {
633 *xs = elm.str;
634 return 0;
635 }
636 }
637 return -1;
638 }
639
640 int
anthy_compound_get_wtype(compound_ent_t ce,wtype_t * w)641 anthy_compound_get_wtype(compound_ent_t ce, wtype_t *w)
642 {
643 *w = ce->type;
644 return 0;
645 }
646
647 int
anthy_compound_get_freq(compound_ent_t ce)648 anthy_compound_get_freq(compound_ent_t ce)
649 {
650 return ce->freq;
651 }
652
/*
 * Called from the front end.
 * Takes the private dictionary lock via anthy_priv_dic_lock().
 */
void
anthy_lock_dic(void)
{
  anthy_priv_dic_lock();
  return;
}
659
/*
 * Called from the front end.
 * Releases the private dictionary lock via anthy_priv_dic_unlock().
 */
void
anthy_unlock_dic(void)
{
  anthy_priv_dic_unlock();
  return;
}
666
667
668 dic_session_t
anthy_dic_create_session(void)669 anthy_dic_create_session(void)
670 {
671 return anthy_create_mem_dic();
672 }
673
674 void
anthy_dic_activate_session(dic_session_t d)675 anthy_dic_activate_session(dic_session_t d)
676 {
677 anthy_current_personal_dic_cache = d;
678 }
679
680 void
anthy_dic_release_session(dic_session_t d)681 anthy_dic_release_session(dic_session_t d)
682 {
683 anthy_release_mem_dic(d);
684 }
685
686 void
anthy_dic_set_personality(const char * id)687 anthy_dic_set_personality(const char *id)
688 {
689 anthy_current_record = anthy_create_record(id);
690 anthy_current_personal_dic_cache = anthy_create_mem_dic();
691 anthy_init_private_dic(id);
692 }
693
694
695 /** 辞書サブシステムを初期化
696 */
697 int
anthy_init_dic(void)698 anthy_init_dic(void)
699 {
700 if (dic_init_count) {
701 dic_init_count ++;
702 return 0;
703 }
704 if (anthy_init_diclib() == -1) {
705 return -1;
706 }
707
708 anthy_init_wtypes();
709 anthy_init_mem_dic();
710 anthy_init_record();
711 anthy_init_ext_ent();
712 anthy_init_features();
713
714 anthy_init_word_dic();
715 master_dic_file = anthy_create_word_dic();
716 if (!master_dic_file) {
717 anthy_log(0, "Failed to create file dic.\n");
718 return -1;
719 }
720 dic_init_count ++;
721 return 0;
722 }
723
724 /** 辞書サブシステムをすべて解放
725 */
726 void
anthy_quit_dic(void)727 anthy_quit_dic(void)
728 {
729 dic_init_count --;
730 if (dic_init_count) {
731 return;
732 }
733 if (anthy_current_record) {
734 anthy_release_record(anthy_current_record);
735 }
736 anthy_release_private_dic();
737 anthy_current_record = NULL;
738 anthy_quit_mem_dic();
739 anthy_quit_diclib();
740 }
741
742