1 /*
2 * 文を文節にsplitするsplitter
3 *
4 * 文節の境界を検出する
5 * anthy_init_split_context() 分割用のコンテキストを作って
6 * anthy_mark_border() 分割をして
7 * anthy_release_split_context() コンテキストを解放する
8 *
9 * anthy_commit_border() コミットされた内容に対して学習をする
10 *
11 * Funded by IPA未踏ソフトウェア創造事業 2001 9/22
12 *
13 * Copyright (C) 2004 YOSHIDA Yuichi
14 * Copyright (C) 2000-2004 TABATA Yusuke
15 * Copyright (C) 2000-2001 UGAWA Tomoharu
16 *
17 */
18 /*
19 This library is free software; you can redistribute it and/or
20 modify it under the terms of the GNU Lesser General Public
21 License as published by the Free Software Foundation; either
22 version 2 of the License, or (at your option) any later version.
23
24 This library is distributed in the hope that it will be useful,
25 but WITHOUT ANY WARRANTY; without even the implied warranty of
26 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
27 Lesser General Public License for more details.
28
29 You should have received a copy of the GNU Lesser General Public
30 License along with this library; if not, write to the Free Software
31 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
32 */
33 #include <stdlib.h>
34 #include <string.h>
35
36 #include <anthy/alloc.h>
37 #include <anthy/record.h>
38 #include <anthy/splitter.h>
39 #include <anthy/logger.h>
40 #include "wordborder.h"
41
/* Limit handed to anthy_truncate_section() after a new expand-pair
 * entry has been recorded in proc_expanded_segment(). */
enum { MAX_EXPAND_PAIR_ENTRY_COUNT = 1000 };

/* Bitmask of SPLITTER_DEBUG_* values; written by anthy_init_splitter()
 * and read back through anthy_splitter_debug_flags(). */
static int splitter_debug_flags;
45
46 /** make_word_cacheで作成した文節情報を解放する
47 */
48 static void
release_info_cache(struct splitter_context * sc)49 release_info_cache(struct splitter_context *sc)
50 {
51 struct word_split_info_cache *info = sc->word_split_info;
52
53 anthy_free_allocator(info->MwAllocator);
54 anthy_free_allocator(info->WlAllocator);
55 free(info->cnode);
56 free(info->seq_len);
57 free(info->rev_seq_len);
58 free(info);
59 }
60
61 static void
metaword_dtor(void * p)62 metaword_dtor(void *p)
63 {
64 struct meta_word *mw = (struct meta_word*)p;
65 if (mw->cand_hint.str) {
66 free(mw->cand_hint.str);
67 }
68 }
69
70
71 static void
alloc_char_ent(xstr * xs,struct splitter_context * sc)72 alloc_char_ent(xstr *xs, struct splitter_context *sc)
73 {
74 int i;
75
76 sc->char_count = xs->len;
77 sc->ce = (struct char_ent*)
78 malloc(sizeof(struct char_ent)*(xs->len + 1));
79 for (i = 0; i <= xs->len; i++) {
80 sc->ce[i].c = &xs->str[i];
81 sc->ce[i].seg_border = 0;
82 sc->ce[i].initial_seg_len = 0;
83 sc->ce[i].best_seg_class = SEG_HEAD;
84 sc->ce[i].best_mw = NULL;
85 }
86
87 /* 左右両端は文節の境界である */
88 sc->ce[0].seg_border = 1;
89 sc->ce[xs->len].seg_border = 1;
90 }
91
92 /* ここで確保した内容はrelease_info_cacheで解放される
93 */
94 static void
alloc_info_cache(struct splitter_context * sc)95 alloc_info_cache(struct splitter_context *sc)
96 {
97 int i;
98 struct word_split_info_cache *info;
99
100 /* キャッシュのデータを確保 */
101 sc->word_split_info = malloc(sizeof(struct word_split_info_cache));
102 info = sc->word_split_info;
103 info->MwAllocator = anthy_create_allocator(sizeof(struct meta_word), metaword_dtor);
104 info->WlAllocator = anthy_create_allocator(sizeof(struct word_list), 0);
105 info->cnode =
106 malloc(sizeof(struct char_node) * (sc->char_count + 1));
107
108 info->seq_len = malloc(sizeof(int) * (sc->char_count + 1));
109 info->rev_seq_len = malloc(sizeof(int) * (sc->char_count + 1));
110
111 /* 各文字インデックスに対して初期化を行う */
112 for (i = 0; i <= sc->char_count; i++) {
113 info->seq_len[i] = 0;
114 info->rev_seq_len[i] = 0;
115 info->cnode[i].wl = NULL;
116 info->cnode[i].mw = NULL;
117 info->cnode[i].max_len = 0;
118 }
119 }
120
121 /** 外から呼び出されるwordsplitterのトップレベルの関数 */
122 void
anthy_mark_border(struct splitter_context * sc,int from,int from2,int to)123 anthy_mark_border(struct splitter_context *sc,
124 int from, int from2, int to)
125 {
126 int i;
127 struct word_split_info_cache *info;
128
129 /* sanity check */
130 if ((to - from) <= 0) {
131 return ;
132 }
133
134 /* 境界マーク用とlatticeの検索で用いられるクラス用の領域を確保 */
135 info = sc->word_split_info;
136 info->seg_border = alloca(sizeof(int)*(sc->char_count + 1));
137 info->best_seg_class = alloca(sizeof(enum seg_class)*(sc->char_count + 1));
138 info->best_mw = alloca(sizeof(struct meta_word*)*(sc->char_count + 1));
139 for (i = 0; i < sc->char_count + 1; ++i) {
140 info->seg_border[i] = sc->ce[i].seg_border;
141 info->best_seg_class[i] = sc->ce[i].best_seg_class;
142 info->best_mw[i] = sc->ce[i].best_mw;
143 }
144
145 /* 境界を決定する */
146 anthy_eval_border(sc, from, from2, to);
147
148 for (i = from; i < to; ++i) {
149 sc->ce[i].seg_border = info->seg_border[i];
150 sc->ce[i].best_seg_class = info->best_seg_class[i];
151 sc->ce[i].best_mw = info->best_mw[i];
152 }
153 }
154
155 /* 文節が拡大されたので,それを学習する */
156 static void
proc_expanded_segment(struct splitter_context * sc,int from,int len)157 proc_expanded_segment(struct splitter_context *sc,
158 int from, int len)
159 {
160 int initial_len = sc->ce[from].initial_seg_len;
161 int i, nr;
162 xstr from_xs, to_xs, *xs;
163
164 from_xs.str = sc->ce[from].c;
165 from_xs.len = initial_len;
166 to_xs.str = sc->ce[from].c;
167 to_xs.len = len;
168 if (anthy_select_section("EXPANDPAIR", 1) == -1) {
169 return ;
170 }
171 if (anthy_select_row(&from_xs, 1) == -1) {
172 return ;
173 }
174 nr = anthy_get_nr_values();
175 for (i = 0; i < nr; i ++) {
176 xs = anthy_get_nth_xstr(i);
177 if (!xs || !anthy_xstrcmp(xs, &to_xs)) {
178 /* 既にある */
179 return ;
180 }
181 }
182 anthy_set_nth_xstr(nr, &to_xs);
183 anthy_truncate_section(MAX_EXPAND_PAIR_ENTRY_COUNT);
184 }
185
186 /* 文節のマージと語尾を学習する */
187 void
anthy_commit_border(struct splitter_context * sc,int nr_segments,struct meta_word ** mw,int * seg_len)188 anthy_commit_border(struct splitter_context *sc, int nr_segments,
189 struct meta_word **mw, int *seg_len)
190 {
191 int i, from = 0;
192
193 /* 伸ばした文節 */
194 for (i = 0; i < nr_segments; i++) {
195 /* それぞれの文節に対して */
196
197 int len = seg_len[i];
198 int initial_len = sc->ce[from].initial_seg_len;
199 int real_len = 0;
200 int l2;
201
202 if (!initial_len || from + initial_len == sc->char_count) {
203 /* そこは境界ではない */
204 goto tail;
205 }
206 l2 = sc->ce[from + initial_len].initial_seg_len;
207 if (initial_len + l2 > len) {
208 /* 隣の文節を含むほど拡大されたわけではない */
209 goto tail;
210 }
211 if (mw[i]) {
212 real_len = mw[i]->len;
213 }
214 if (real_len <= initial_len) {
215 goto tail;
216 }
217 /* 右の文節を含む長さに拡張された文節がコミットされた */
218 proc_expanded_segment(sc, from, real_len);
219 tail:
220 from += len;
221 }
222 }
223
224 int
anthy_splitter_debug_flags(void)225 anthy_splitter_debug_flags(void)
226 {
227 return splitter_debug_flags;
228 }
229
230 void
anthy_init_split_context(xstr * xs,struct splitter_context * sc,int is_reverse)231 anthy_init_split_context(xstr *xs, struct splitter_context *sc, int is_reverse)
232 {
233 alloc_char_ent(xs, sc);
234 alloc_info_cache(sc);
235 sc->is_reverse = is_reverse;
236 /* 全ての部分文字列をチェックして、文節の候補を列挙する
237 word_listを構成してからmetawordを構成する */
238 anthy_lock_dic();
239 anthy_make_word_list_all(sc);
240 anthy_unlock_dic();
241 anthy_make_metaword_all(sc);
242
243 }
244
245 void
anthy_release_split_context(struct splitter_context * sc)246 anthy_release_split_context(struct splitter_context *sc)
247 {
248 if (sc->word_split_info) {
249 release_info_cache(sc);
250 sc->word_split_info = 0;
251 }
252 if (sc->ce) {
253 free(sc->ce);
254 sc->ce = 0;
255 }
256 }
257
258 /** splitter全体の初期化を行う */
259 int
anthy_init_splitter(void)260 anthy_init_splitter(void)
261 {
262 /* デバッグプリントの設定 */
263 char *en = getenv("ANTHY_ENABLE_DEBUG_PRINT");
264 char *dis = getenv("ANTHY_DISABLE_DEBUG_PRINT");
265 splitter_debug_flags = SPLITTER_DEBUG_NONE;
266 if (!dis && en && strlen(en)) {
267 char *fs = getenv("ANTHY_SPLITTER_PRINT");
268 if (fs) {
269 if (strchr(fs, 'w')) {
270 splitter_debug_flags |= SPLITTER_DEBUG_WL;
271 }
272 if (strchr(fs, 'm')) {
273 splitter_debug_flags |= SPLITTER_DEBUG_MW;
274 }
275 if (strchr(fs, 'l')) {
276 splitter_debug_flags |= SPLITTER_DEBUG_LN;
277 }
278 if (strchr(fs, 'i')) {
279 splitter_debug_flags |= SPLITTER_DEBUG_ID;
280 }
281 if (strchr(fs, 'c')) {
282 splitter_debug_flags |= SPLITTER_DEBUG_CAND;
283 }
284 }
285 }
286 /* 付属語グラフの初期化 */
287 if (anthy_init_depword_tab()) {
288 anthy_log(0, "Failed to init dependent word table.\n");
289 return -1;
290 }
291 return 0;
292 }
293
/** Shut the splitter down by releasing the dependent-word table. */
void
anthy_quit_splitter(void)
{
  /* the dependent-word table is the only state torn down here */
  anthy_quit_depword_tab();
}
299