1 /*
2 * 確定(コミット)後の処理をする。
3 * 各種の学習処理を呼び出す
4 *
5 * anthy_proc_commit() が外部から呼ばれる
6 */
7 #include <stdlib.h>
8 #include <time.h>
9
10 #include <anthy/ordering.h>
11 #include <anthy/record.h>
12 #include <anthy/splitter.h>
13 #include <anthy/segment.h>
14 #include "sorter.h"
15
/* Argument passed to anthy_truncate_section() for the "OCHAIRE" section
 * at the end of learn_ochaire(). */
#define MAX_OCHAIRE_ENTRY_COUNT 100
/* Source strings of this length or longer are neither stored
 * (commit_ochaire) nor looked up (release_negative_ochaire) as OCHAIRE
 * keys. */
#define MAX_OCHAIRE_LEN 32
/* Argument passed to anthy_truncate_section() for the "PREDICTION"
 * section in learn_prediction(). */
#define MAX_PREDICTION_ENTRY 100

/* Argument passed to anthy_truncate_section() for the "UNKNOWN_WORD"
 * section in clear_resized_segment(). */
#define MAX_UNKNOWN_WORD 100
21
22 /* 交換された候補を探す */
23 static void
learn_swapped_candidates(struct segment_list * sl)24 learn_swapped_candidates(struct segment_list *sl)
25 {
26 int i;
27 struct seg_ent *seg;
28 for (i = 0; i < sl->nr_segments; i++) {
29 seg = anthy_get_nth_segment(sl, i);
30 if (seg->committed != 0) {
31 /* 最初の候補(0番目)でない候補(seg->committed番目)がコミットされた */
32 anthy_swap_cand_ent(seg->cands[0],
33 seg->cands[seg->committed]);
34 }
35 }
36 anthy_cand_swap_ageup();
37 }
38
39 /* 長さが変わった文節の変更後に対して */
40 static void
learn_resized_segment(struct splitter_context * sc,struct segment_list * sl)41 learn_resized_segment(struct splitter_context *sc,
42 struct segment_list *sl)
43
44 {
45 int i;
46 struct meta_word **mw
47 = alloca(sizeof(struct meta_word*) * sl->nr_segments);
48 int *len_array
49 = alloca(sizeof(int) * sl->nr_segments);
50
51 /* 各文節の長さの配列とmeta_wordの配列を用意する */
52 for (i = 0; i < sl->nr_segments; i++) {
53 struct seg_ent *se = anthy_get_nth_segment(sl, i);
54 mw[i] = se->cands[se->committed]->mw;
55 len_array[i] = se->str.len;
56 }
57
58 anthy_commit_border(sc, sl->nr_segments, mw, len_array);
59 }
60
61 /* 長さが変わった文節の変更前に対して */
62 static void
clear_resized_segment(struct splitter_context * sc,struct segment_list * sl)63 clear_resized_segment(struct splitter_context *sc,
64 struct segment_list *sl)
65 {
66 int *mark, i, from;
67 struct seg_ent *seg;
68 mark = alloca(sizeof(int)*sc->char_count);
69 for (i = 0; i < sc->char_count; i++) {
70 mark[i] = 0;
71 }
72 /* 実際に確定された文節の長さをマークする */
73 from = 0;
74 for (i = 0; i < sl->nr_segments; i++) {
75 seg = anthy_get_nth_segment(sl, i);
76 mark[from] = seg->len;
77 from = from + seg->len;
78 }
79 for (i = 0; i < sc->char_count; i++) {
80 int len = sc->ce[i].initial_seg_len;
81 /* 最初の長さと確定された長さが異なれば、
82 使われなかった未知語の可能性がある */
83 if (len && len != mark[i]) {
84 xstr xs;
85 xs.str = sc->ce[i].c;
86 xs.len = len;
87 anthy_forget_unused_unknown_word(&xs);
88 }
89 }
90 if (!anthy_select_section("UNKNOWN_WORD", 0)) {
91 anthy_truncate_section(MAX_UNKNOWN_WORD);
92 }
93 }
94
95 /* recordにお茶入れ学習の結果を書き込む */
96 static void
commit_ochaire(struct seg_ent * seg,int count,xstr * xs)97 commit_ochaire(struct seg_ent *seg, int count, xstr* xs)
98 {
99 int i;
100 if (xs->len >= MAX_OCHAIRE_LEN) {
101 return ;
102 }
103 if (anthy_select_row(xs, 1)) {
104 return ;
105 }
106 anthy_set_nth_value(0, count);
107 for (i = 0; i < count; i++, seg = seg->next) {
108 anthy_set_nth_value(i * 2 + 1, seg->len);
109 anthy_set_nth_xstr(i * 2 + 2, &seg->cands[seg->committed]->str);
110 }
111 }
112
113 /* recordの領域を節約するために、お茶入れ学習のネガティブな
114 エントリを消す */
115 static void
release_negative_ochaire(struct splitter_context * sc,struct segment_list * sl)116 release_negative_ochaire(struct splitter_context *sc,
117 struct segment_list *sl)
118 {
119 int start, len;
120 xstr xs;
121 (void)sl;
122 /* 変換前のひらがな文字列 */
123 xs.len = sc->char_count;
124 xs.str = sc->ce[0].c;
125
126 /* xsの部分文字列に対して */
127 for (start = 0; start < xs.len; start ++) {
128 for (len = 1; len <= xs.len - start && len < MAX_OCHAIRE_LEN; len ++) {
129 xstr part;
130 part.str = &xs.str[start];
131 part.len = len;
132 if (anthy_select_row(&part, 0) == 0) {
133 anthy_release_row();
134 }
135 }
136 }
137 }
138
139 /* お茶入れ学習を行う */
140 static void
learn_ochaire(struct splitter_context * sc,struct segment_list * sl)141 learn_ochaire(struct splitter_context *sc,
142 struct segment_list *sl)
143 {
144 int i;
145 int count;
146
147 if (anthy_select_section("OCHAIRE", 1)) {
148 return ;
149 }
150
151 /* お茶入れ学習のネガティブなエントリを消す */
152 release_negative_ochaire(sc, sl);
153
154 /* お茶入れ学習をする */
155 for (count = 2; count <= sl->nr_segments && count < 5; count++) {
156 /* 2文節以上の長さの文節列に対して */
157
158 for (i = 0; i <= sl->nr_segments - count; i++) {
159 struct seg_ent *head = anthy_get_nth_segment(sl, i);
160 struct seg_ent *s;
161 xstr xs;
162 int j;
163 xs = head->str;
164 if (xs.len < 2 && count < 3) {
165 /* 細切れの文節を学習することを避ける、
166 * いい加減なheuristics */
167 continue;
168 }
169 /* 文節列を構成する文字列を作る */
170 for (j = 1, s = head->next; j < count; j++, s = s->next) {
171 xs.len += s->str.len;
172 }
173 /**/
174 commit_ochaire(head, count, &xs);
175 }
176 }
177 if (anthy_select_section("OCHAIRE", 1)) {
178 return ;
179 }
180 anthy_truncate_section(MAX_OCHAIRE_ENTRY_COUNT);
181 }
182
183 static int
learn_prediction_str(xstr * idx,xstr * xs)184 learn_prediction_str(xstr *idx, xstr *xs)
185 {
186 int nr_predictions;
187 int i;
188 time_t t = time(NULL);
189 if (anthy_select_row(idx, 1)) {
190 return 0;
191 }
192 nr_predictions = anthy_get_nr_values();
193
194 /* 既に履歴にある場合はタイムスタンプだけ更新 */
195 for (i = 0; i < nr_predictions; i += 2) {
196 xstr *log = anthy_get_nth_xstr(i + 1);
197 if (!log) {
198 continue;
199 }
200 if (anthy_xstrcmp(log, xs) == 0) {
201 anthy_set_nth_value(i, t);
202 break;
203 }
204 }
205
206 /* ない場合は末尾に追加 */
207 if (i == nr_predictions) {
208 anthy_set_nth_value(nr_predictions, t);
209 anthy_set_nth_xstr(nr_predictions + 1, xs);
210 anthy_mark_row_used();
211 return 1;
212 }
213 anthy_mark_row_used();
214 return 0;
215 }
216
217 static void
learn_prediction(struct segment_list * sl)218 learn_prediction(struct segment_list *sl)
219 {
220 int i;
221 int added = 0;
222 if (anthy_select_section("PREDICTION", 1)) {
223 return ;
224 }
225 for (i = 0; i < sl->nr_segments; i++) {
226 struct seg_ent *seg = anthy_get_nth_segment(sl, i);
227 xstr *xs = &seg->cands[seg->committed]->str;
228
229 if (seg->committed < 0) {
230 continue;
231 }
232 if (learn_prediction_str(&seg->str, xs)) {
233 added = 1;
234 }
235 }
236 if (added) {
237 anthy_truncate_section(MAX_PREDICTION_ENTRY);
238 }
239 }
240
241 static void
learn_unknown(struct segment_list * sl)242 learn_unknown(struct segment_list *sl)
243 {
244 int i;
245 for (i = 0; i < sl->nr_segments; i++) {
246 struct seg_ent *seg = anthy_get_nth_segment(sl, i);
247 struct cand_ent *ce = seg->cands[seg->committed];
248 if (ce->nr_words == 0) {
249 anthy_add_unknown_word(&seg->str, &ce->str);
250 }
251 }
252 }
253
254 void
anthy_do_commit_prediction(xstr * src,xstr * xs)255 anthy_do_commit_prediction(xstr *src, xstr *xs)
256 {
257 if (anthy_select_section("PREDICTION", 1)) {
258 return ;
259 }
260 learn_prediction_str(src, xs);
261 }
262
/*
 * Entry point called after a conversion is committed; runs every
 * learning step in a fixed order.
 *
 * sl: the committed segment list.
 * sc: splitter context of the conversion.
 */
void
anthy_proc_commit(struct segment_list *sl, struct splitter_context *sc)
{
  /* Candidates chosen over the first candidate. */
  learn_swapped_candidates(sl);

  /* Segment boundaries: first the committed ones, then the abandoned ones. */
  learn_resized_segment(sc, sl);
  clear_resized_segment(sc, sl);

  /* Runs of consecutive segments (OCHAIRE). */
  learn_ochaire(sc, sl);

  /* Prediction history. */
  learn_prediction(sl);

  /* Candidates made of no known words. */
  learn_unknown(sl);

  /* Candidate history; implemented outside this file. */
  anthy_learn_cand_history(sl);
}
276