1 /*
2  * 文を文節にsplitするsplitter
3  *
4  * 文節の境界を検出する
5  *  anthy_init_split_context() 分割用のコンテキストを作って
6  *  anthy_mark_border() 分割をして
7  *  anthy_release_split_context() コンテキストを解放する
8  *
9  *  anthy_commit_border() コミットされた内容に対して学習をする
10  *
11  * Funded by IPA未踏ソフトウェア創造事業 2001 9/22
12  *
13  * Copyright (C) 2004 YOSHIDA Yuichi
14  * Copyright (C) 2000-2004 TABATA Yusuke
15  * Copyright (C) 2000-2001 UGAWA Tomoharu
16  *
17  */
18 /*
19   This library is free software; you can redistribute it and/or
20   modify it under the terms of the GNU Lesser General Public
21   License as published by the Free Software Foundation; either
22   version 2 of the License, or (at your option) any later version.
23 
24   This library is distributed in the hope that it will be useful,
25   but WITHOUT ANY WARRANTY; without even the implied warranty of
26   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
27   Lesser General Public License for more details.
28 
29   You should have received a copy of the GNU Lesser General Public
30   License along with this library; if not, write to the Free Software
31   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA
32  */
33 #include <stdlib.h>
34 #include <string.h>
35 
36 #include <anthy/alloc.h>
37 #include <anthy/record.h>
38 #include <anthy/splitter.h>
39 #include <anthy/logger.h>
40 #include "wordborder.h"
41 
/* Limit passed to anthy_truncate_section() after a new entry is added to
 * the "EXPANDPAIR" section (see proc_expanded_segment()). */
#define MAX_EXPAND_PAIR_ENTRY_COUNT 1000

/* Bitmask of SPLITTER_DEBUG_* values; assigned in anthy_init_splitter() and
 * read back through anthy_splitter_debug_flags(). */
static int splitter_debug_flags;
45 
46 /** make_word_cacheで作成した文節情報を解放する
47  */
48 static void
release_info_cache(struct splitter_context * sc)49 release_info_cache(struct splitter_context *sc)
50 {
51   struct word_split_info_cache *info = sc->word_split_info;
52 
53   anthy_free_allocator(info->MwAllocator);
54   anthy_free_allocator(info->WlAllocator);
55   free(info->cnode);
56   free(info->seq_len);
57   free(info->rev_seq_len);
58   free(info);
59 }
60 
61 static void
metaword_dtor(void * p)62 metaword_dtor(void *p)
63 {
64   struct meta_word *mw = (struct meta_word*)p;
65   if (mw->cand_hint.str) {
66     free(mw->cand_hint.str);
67   }
68 }
69 
70 
71 static void
alloc_char_ent(xstr * xs,struct splitter_context * sc)72 alloc_char_ent(xstr *xs, struct splitter_context *sc)
73 {
74   int i;
75 
76   sc->char_count = xs->len;
77   sc->ce = (struct char_ent*)
78     malloc(sizeof(struct char_ent)*(xs->len + 1));
79   for (i = 0; i <= xs->len; i++) {
80     sc->ce[i].c = &xs->str[i];
81     sc->ce[i].seg_border = 0;
82     sc->ce[i].initial_seg_len = 0;
83     sc->ce[i].best_seg_class = SEG_HEAD;
84     sc->ce[i].best_mw = NULL;
85   }
86 
87   /* 左右両端は文節の境界である */
88   sc->ce[0].seg_border = 1;
89   sc->ce[xs->len].seg_border = 1;
90 }
91 
92 /*  ここで確保した内容はrelease_info_cacheで解放される
93  */
94 static void
alloc_info_cache(struct splitter_context * sc)95 alloc_info_cache(struct splitter_context *sc)
96 {
97   int i;
98   struct word_split_info_cache *info;
99 
100   /* キャッシュのデータを確保 */
101   sc->word_split_info = malloc(sizeof(struct word_split_info_cache));
102   info = sc->word_split_info;
103   info->MwAllocator = anthy_create_allocator(sizeof(struct meta_word), metaword_dtor);
104   info->WlAllocator = anthy_create_allocator(sizeof(struct word_list), 0);
105   info->cnode =
106     malloc(sizeof(struct char_node) * (sc->char_count + 1));
107 
108   info->seq_len = malloc(sizeof(int) * (sc->char_count + 1));
109   info->rev_seq_len = malloc(sizeof(int) * (sc->char_count + 1));
110 
111   /* 各文字インデックスに対して初期化を行う */
112   for (i = 0; i <= sc->char_count; i++) {
113     info->seq_len[i] = 0;
114     info->rev_seq_len[i] = 0;
115     info->cnode[i].wl = NULL;
116     info->cnode[i].mw = NULL;
117     info->cnode[i].max_len = 0;
118   }
119 }
120 
121 /** 外から呼び出されるwordsplitterのトップレベルの関数 */
122 void
anthy_mark_border(struct splitter_context * sc,int from,int from2,int to)123 anthy_mark_border(struct splitter_context *sc,
124 		  int from, int from2, int to)
125 {
126   int i;
127   struct word_split_info_cache *info;
128 
129   /* sanity check */
130   if ((to - from) <= 0) {
131     return ;
132   }
133 
134   /* 境界マーク用とlatticeの検索で用いられるクラス用の領域を確保 */
135   info = sc->word_split_info;
136   info->seg_border = alloca(sizeof(int)*(sc->char_count + 1));
137   info->best_seg_class = alloca(sizeof(enum seg_class)*(sc->char_count + 1));
138   info->best_mw = alloca(sizeof(struct meta_word*)*(sc->char_count + 1));
139   for (i = 0; i < sc->char_count + 1; ++i) {
140     info->seg_border[i] = sc->ce[i].seg_border;
141     info->best_seg_class[i] = sc->ce[i].best_seg_class;
142     info->best_mw[i] = sc->ce[i].best_mw;
143   }
144 
145   /* 境界を決定する */
146   anthy_eval_border(sc, from, from2, to);
147 
148   for (i = from; i < to; ++i) {
149     sc->ce[i].seg_border = info->seg_border[i];
150     sc->ce[i].best_seg_class = info->best_seg_class[i];
151     sc->ce[i].best_mw = info->best_mw[i];
152   }
153 }
154 
155 /* 文節が拡大されたので,それを学習する */
156 static void
proc_expanded_segment(struct splitter_context * sc,int from,int len)157 proc_expanded_segment(struct splitter_context *sc,
158 		      int from, int len)
159 {
160   int initial_len = sc->ce[from].initial_seg_len;
161   int i, nr;
162   xstr from_xs, to_xs, *xs;
163 
164   from_xs.str = sc->ce[from].c;
165   from_xs.len = initial_len;
166   to_xs.str = sc->ce[from].c;
167   to_xs.len = len;
168   if (anthy_select_section("EXPANDPAIR", 1) == -1) {
169     return ;
170   }
171   if (anthy_select_row(&from_xs, 1) == -1) {
172     return ;
173   }
174   nr = anthy_get_nr_values();
175   for (i = 0; i < nr; i ++) {
176     xs = anthy_get_nth_xstr(i);
177     if (!xs || !anthy_xstrcmp(xs, &to_xs)) {
178       /* 既にある */
179       return ;
180     }
181   }
182   anthy_set_nth_xstr(nr, &to_xs);
183   anthy_truncate_section(MAX_EXPAND_PAIR_ENTRY_COUNT);
184 }
185 
186 /* 文節のマージと語尾を学習する */
187 void
anthy_commit_border(struct splitter_context * sc,int nr_segments,struct meta_word ** mw,int * seg_len)188 anthy_commit_border(struct splitter_context *sc, int nr_segments,
189 		    struct meta_word **mw, int *seg_len)
190 {
191   int i, from = 0;
192 
193   /* 伸ばした文節 */
194   for (i = 0; i < nr_segments; i++) {
195     /* それぞれの文節に対して */
196 
197     int len = seg_len[i];
198     int initial_len = sc->ce[from].initial_seg_len;
199     int real_len = 0;
200     int l2;
201 
202     if (!initial_len || from + initial_len == sc->char_count) {
203       /* そこは境界ではない */
204       goto tail;
205     }
206     l2 = sc->ce[from + initial_len].initial_seg_len;
207     if (initial_len + l2 > len) {
208       /* 隣の文節を含むほど拡大されたわけではない */
209       goto tail;
210     }
211     if (mw[i]) {
212       real_len = mw[i]->len;
213     }
214     if (real_len <= initial_len) {
215       goto tail;
216     }
217     /* 右の文節を含む長さに拡張された文節がコミットされた */
218     proc_expanded_segment(sc, from, real_len);
219   tail:
220     from += len;
221   }
222 }
223 
224 int
anthy_splitter_debug_flags(void)225 anthy_splitter_debug_flags(void)
226 {
227   return splitter_debug_flags;
228 }
229 
230 void
anthy_init_split_context(xstr * xs,struct splitter_context * sc,int is_reverse)231 anthy_init_split_context(xstr *xs, struct splitter_context *sc, int is_reverse)
232 {
233   alloc_char_ent(xs, sc);
234   alloc_info_cache(sc);
235   sc->is_reverse = is_reverse;
236   /* 全ての部分文字列をチェックして、文節の候補を列挙する
237      word_listを構成してからmetawordを構成する */
238   anthy_lock_dic();
239   anthy_make_word_list_all(sc);
240   anthy_unlock_dic();
241   anthy_make_metaword_all(sc);
242 
243 }
244 
245 void
anthy_release_split_context(struct splitter_context * sc)246 anthy_release_split_context(struct splitter_context *sc)
247 {
248   if (sc->word_split_info) {
249     release_info_cache(sc);
250     sc->word_split_info = 0;
251   }
252   if (sc->ce) {
253     free(sc->ce);
254     sc->ce = 0;
255   }
256 }
257 
258 /** splitter全体の初期化を行う */
259 int
anthy_init_splitter(void)260 anthy_init_splitter(void)
261 {
262   /* デバッグプリントの設定 */
263   char *en = getenv("ANTHY_ENABLE_DEBUG_PRINT");
264   char *dis = getenv("ANTHY_DISABLE_DEBUG_PRINT");
265   splitter_debug_flags = SPLITTER_DEBUG_NONE;
266   if (!dis && en && strlen(en)) {
267     char *fs = getenv("ANTHY_SPLITTER_PRINT");
268     if (fs) {
269       if (strchr(fs, 'w')) {
270 	splitter_debug_flags |= SPLITTER_DEBUG_WL;
271       }
272       if (strchr(fs, 'm')) {
273 	splitter_debug_flags |= SPLITTER_DEBUG_MW;
274       }
275       if (strchr(fs, 'l')) {
276 	splitter_debug_flags |= SPLITTER_DEBUG_LN;
277       }
278       if (strchr(fs, 'i')) {
279 	splitter_debug_flags |= SPLITTER_DEBUG_ID;
280       }
281       if (strchr(fs, 'c')) {
282 	splitter_debug_flags |= SPLITTER_DEBUG_CAND;
283       }
284     }
285   }
286   /* 付属語グラフの初期化 */
287   if (anthy_init_depword_tab()) {
288     anthy_log(0, "Failed to init dependent word table.\n");
289     return -1;
290   }
291   return 0;
292 }
293 
/** Shut the splitter down.
 *
 * Counterpart of anthy_init_splitter(); its only action is to call
 * anthy_quit_depword_tab().
 */
void
anthy_quit_splitter(void)
{
  anthy_quit_depword_tab();
}
299