1 /*
2 * 文節の関係を処理する
3 * Copyright (C) 2006 Higashiyama Masahiko (thanks google summer of code program)
4 * Copyright (C) 2002-2007 TABATA Yusuke
5 *
6 * anthy_reorder_candidates_by_relation()
7 *
8 */
9 /*
10 This library is free software; you can redistribute it and/or
11 modify it under the terms of the GNU Lesser General Public
12 License as published by the Free Software Foundation; either
13 version 2 of the License, or (at your option) any later version.
14
15 This library is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 Lesser General Public License for more details.
19
20 You should have received a copy of the GNU Lesser General Public
21 License along with this library; if not, write to the Free Software
22 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 */
24
25 #include <arpa/inet.h>
26 #include <stdlib.h>
27
28 #include <anthy/segclass.h>
29 #include <anthy/segment.h>
30 #include <anthy/ordering.h>
31 #include <anthy/dic.h>
32 #include <anthy/diclib.h>
33 #include <anthy/feature_set.h>
34 #include <anthy/corpus.h>
35 #include "sorter.h"
36
37 #define MAX_COLLISION 4
38 #define SEARCH_LIMIT 100
39 #define MAX_NEIGHBOR 10
40
41
42 /* 全文検索用のコーパス */
43 static struct corpus_ {
44 /* header */
45 void *corpus_bucket;
46 void *corpus_array;
47 /**/
48 int *bucket;
49 int *array;
50 /**/
51 int bucket_size;
52 int array_size;
53 } corpus_info;
54
55 /* 検索用のiterator */
56 struct iterator {
57 /* 検索のキーと現在の場所 */
58 int key;
59 int idx;
60 /* 検索回数の上限 */
61 int limit;
62 };
63
64 struct neighbor {
65 int nr;
66 int id[MAX_NEIGHBOR];
67 };
68
69 /** 文節@segの中に@from_word_idの単語と共起関係にある
70 * 候補があるかどうかを探し、あればスコアを上げる。
71 */
72 static void
reorder_candidate(int from_word_id,struct seg_ent * seg)73 reorder_candidate(int from_word_id, struct seg_ent *seg)
74 {
75 int i, pos;
76 struct cand_ent *ce;
77 if (NULL == seg->cands) { /* 辞書もしくは学習データが壊れていた時の対策 */
78 return;
79 }
80 ce = seg->cands[0];
81 if (ce->core_elm_index == -1) {
82 return ;
83 }
84 /* 0番目の候補の品詞 */
85 pos = anthy_wtype_get_pos(ce->elm[ce->core_elm_index].wt);
86
87 for (i = 0; i < seg->nr_cands; i++) {
88 int word_id;
89 ce = seg->cands[i];
90 if (ce->core_elm_index == -1) {
91 continue;
92 }
93 word_id = ce->elm[ce->core_elm_index].id;
94 if (anthy_dic_check_word_relation(from_word_id, word_id) &&
95 anthy_wtype_get_pos(ce->elm[ce->core_elm_index].wt) == pos) {
96 /* 用例にマッチしたので、候補のスコアを更新 */
97 ce->flag |= CEF_USEDICT;
98 ce->score *= 10;
99 }
100 }
101 }
102
103 static int
get_indep_word_id(struct seg_ent * seg,int nth)104 get_indep_word_id(struct seg_ent *seg, int nth)
105 {
106 struct cand_ent *ce;
107 if (NULL == seg->cands) { /* 辞書もしくは学習データが壊れていた時の対策 */
108 return -1;
109 }
110 if (seg->cands[nth]->core_elm_index == -1) {
111 /* 一番目の候補がseq_entから作られた候補ではない */
112 return -1;
113 }
114 ce = seg->cands[nth];
115 /* 自立語のidを取り出す */
116 return ce->elm[ce->core_elm_index].id;
117 }
118
119 /* 用例辞書を使って並び替えをする */
120 static void
reorder_by_use_dict(struct segment_list * sl,int nth)121 reorder_by_use_dict(struct segment_list *sl, int nth)
122 {
123 int i;
124 struct seg_ent *cur_seg;
125 int word_id;
126
127 cur_seg = anthy_get_nth_segment(sl, nth);
128 word_id = get_indep_word_id(cur_seg, 0);
129 if (word_id == -1) {
130 /**/
131 return ;
132 }
133 /* 近所の文節を順に見ていく */
134 for (i = nth - 2; i < nth + 2 && i < sl->nr_segments; i++) {
135 struct seg_ent *target_seg;
136 if (i < 0 || i == nth) {
137 continue ;
138 }
139 /* i番目の文節と前後のj番目の文節に対して */
140 target_seg = anthy_get_nth_segment(sl, i);
141 reorder_candidate(word_id, target_seg);
142 }
143 }
144
145 static int
find_border_of_this_word(int idx)146 find_border_of_this_word(int idx)
147 {
148 int val;
149 if (idx < 0) {
150 return 0;
151 }
152 val = ntohl(corpus_info.array[idx * 2]);
153 while (!(val & ELM_WORD_BORDER) &&
154 idx > -1) {
155 idx --;
156 }
157 return idx;
158 }
159
160 static int
find_left_word_border(int idx)161 find_left_word_border(int idx)
162 {
163 int val;
164 if (idx == -1) {
165 return -1;
166 }
167 val = ntohl(corpus_info.array[idx * 2]);
168 if (val & ELM_BOS) {
169 return -1;
170 }
171 idx --;
172 return find_border_of_this_word(idx);
173 }
174
175 static int
find_right_word_border(int idx)176 find_right_word_border(int idx)
177 {
178 if (idx == -1) {
179 return -1;
180 }
181 while (idx < corpus_info.array_size - 2) {
182 int val;
183 idx ++;
184 val = ntohl(corpus_info.array[idx * 2]);
185 if (val & ELM_BOS) {
186 return -1;
187 }
188 if (val & ELM_WORD_BORDER) {
189 return idx;
190 }
191 }
192 return -1;
193 }
194
195 static void
push_id(struct neighbor * ctx,int id)196 push_id(struct neighbor *ctx,
197 int id)
198 {
199 if (ctx->nr < MAX_NEIGHBOR - 1) {
200 ctx->id[ctx->nr] = id;
201 ctx->nr++;
202 }
203 }
204
205 static void
collect_word_context(struct neighbor * ctx,int idx)206 collect_word_context(struct neighbor *ctx, int idx)
207 {
208 int id = ntohl(corpus_info.array[idx * 2]) & CORPUS_KEY_MASK;
209 /*printf(" id=%d\n", id);*/
210 push_id(ctx, id);
211 }
212
213 /* 例文中で周辺の情報を取得する */
214 static void
collect_corpus_context(struct neighbor * ctx,struct iterator * it)215 collect_corpus_context(struct neighbor *ctx,
216 struct iterator *it)
217 {
218 int i;
219 int this_idx, idx;
220
221 this_idx = find_border_of_this_word(it->idx);
222
223 /*printf(" key=%d\n", it->key);*/
224 /* 左へスキャン */
225 idx = this_idx;
226 for (i = 0; i < 2; i++) {
227 idx = find_left_word_border(idx);
228 if (idx == -1) {
229 break;
230 }
231 collect_word_context(ctx, idx);
232 }
233 /* 右へスキャン */
234 idx = this_idx;
235 for (i = 0; i < 2; i++) {
236 idx = find_right_word_border(idx);
237 if (idx == -1) {
238 break;
239 }
240 collect_word_context(ctx, idx);
241 }
242 }
243
244 /* 変換対象の文字列の周辺の情報を取得する */
245 static void
collect_user_context(struct neighbor * ctx,struct segment_list * sl,int nth)246 collect_user_context(struct neighbor *ctx,
247 struct segment_list *sl, int nth)
248 {
249 int i;
250 ctx->nr = 0;
251 for (i = nth - 2; i <= nth + 2 && i < sl->nr_segments; i++) {
252 int id;
253 if ((i < 0) || (i == nth)) {
254 continue;
255 }
256 id = get_indep_word_id(anthy_get_nth_segment(sl, i), 0);
257 if (id > -1) {
258 id &= CORPUS_KEY_MASK;
259 /*printf("user_ctx=%d\n", id);*/
260 push_id(ctx, id);
261 }
262 }
263 }
264
265 /* 隣接文節の情報を比較する */
266 static int
do_compare_context(struct neighbor * n1,struct neighbor * n2)267 do_compare_context(struct neighbor *n1,
268 struct neighbor *n2)
269 {
270 int i, j;
271 int m = 0;
272 for (i = 0; i < n1->nr; i++) {
273 for (j = 0; j < n2->nr; j++) {
274 if (n1->id[i] == n2->id[j]) {
275 m++;
276 }
277 }
278 }
279 return m;
280 }
281
282 /* 隣接文節の情報を取得して比較する */
283 static int
compare_context(struct neighbor * user,struct iterator * it)284 compare_context(struct neighbor *user,
285 struct iterator *it)
286 {
287 struct neighbor sample;
288 int nr;
289 /**/
290 sample.nr = 0;
291 /* 例文中の周辺情報を集める */
292 collect_corpus_context(&sample, it);
293 if (sample.nr == 0) {
294 return 0;
295 }
296 /* 比較する */
297 nr = do_compare_context(user, &sample);
298 if (nr >= sample.nr / 2) {
299 return nr;
300 }
301 return 0;
302 }
303
304 /* keyの最初の出現場所を見つける
305 * 見つからなかったら-1を返す
306 */
307 static int
find_first_pos(int key)308 find_first_pos(int key)
309 {
310 int i;
311 for (i = 0; i < MAX_COLLISION; i++) {
312 int bkt = (key + i) % corpus_info.bucket_size;
313 if ((int)ntohl(corpus_info.bucket[bkt * 2]) == key) {
314 return ntohl(corpus_info.bucket[bkt * 2 + 1]);
315 }
316 }
317 return -1;
318 }
319
320 /* keyの最初の出現場所でiteratorを初期化する
321 * 見つからなかったら-1を返す
322 */
323 static int
find_first_from_corpus(int key,struct iterator * it,int limit)324 find_first_from_corpus(int key, struct iterator *it, int limit)
325 {
326 key &= CORPUS_KEY_MASK;
327 it->idx = find_first_pos(key);
328 it->key = key;
329 it->limit = limit;
330 return it->idx;
331 }
332
333 /* keyの次の出現場所のiteratorを設定する
334 */
335 static int
find_next_from_corpus(struct iterator * it)336 find_next_from_corpus(struct iterator *it)
337 {
338 int idx = it->idx;
339 it->limit--;
340 if (it->limit < 1) {
341 it->idx = -1;
342 return -1;
343 }
344 it->idx = ntohl(corpus_info.array[it->idx * 2 + 1]);
345 if (it->idx < 0 || it->idx >= corpus_info.array_size ||
346 it->idx < idx) {
347 it->idx = -1;
348 }
349 return it->idx;
350 }
351
352 static void
check_candidate_context(struct seg_ent * cur_seg,int i,struct neighbor * user)353 check_candidate_context(struct seg_ent *cur_seg,
354 int i,
355 struct neighbor *user)
356 {
357 struct iterator it;
358 int nr = 0;
359 int word_id;
360 word_id = get_indep_word_id(cur_seg, i);
361 if (word_id == -1) {
362 return ;
363 }
364 /* 各出現場所をスキャンする */
365 find_first_from_corpus(word_id, &it, SEARCH_LIMIT);
366 /*printf("word_id=%d %d\n", word_id, it.idx);*/
367 while (it.idx > -1) {
368 nr += compare_context(user, &it);
369 /**/
370 find_next_from_corpus(&it);
371 }
372 /**/
373 if (nr > 0) {
374 cur_seg->cands[i]->flag |= CEF_CONTEXT;
375 }
376 }
377
378 /* 全文検索で候補を並び替える */
379 static void
reorder_by_corpus(struct segment_list * sl,int nth)380 reorder_by_corpus(struct segment_list *sl, int nth)
381 {
382 struct seg_ent *cur_seg;
383 struct neighbor user;
384 int i;
385 /* 文節の周辺情報を集める */
386 collect_user_context(&user, sl, nth);
387 if (user.nr == 0) {
388 return ;
389 }
390 cur_seg = anthy_get_nth_segment(sl, nth);
391 if (NULL == cur_seg->cands) { /* 辞書もしくは学習データが壊れていた時の対策 */
392 return;
393 }
394 /* 各候補について */
395 for (i = 0; i < cur_seg->nr_cands; i++) {
396 check_candidate_context(cur_seg, i, &user);
397 }
398 /* トップの候補に用例があれば、他の候補は見ない */
399 if (cur_seg->cands[0]->flag & CEF_CONTEXT) {
400 cur_seg->cands[0]->flag &= ~CEF_CONTEXT;
401 return ;
402 }
403 /* 用例によるスコア加算 */
404 for (i = 1; i < cur_seg->nr_cands; i++) {
405 if (cur_seg->cands[i]->flag & CEF_CONTEXT) {
406 cur_seg->cands[i]->score *= 2;
407 }
408 }
409 }
410
411 /*
412 * 用例を用いて候補を並び替える
413 * @nth番目以降の文節を対象とする
414 */
415 void
anthy_reorder_candidates_by_relation(struct segment_list * sl,int nth)416 anthy_reorder_candidates_by_relation(struct segment_list *sl, int nth)
417 {
418 int i;
419 for (i = nth; i < sl->nr_segments; i++) {
420 reorder_by_use_dict(sl, i);
421 if (corpus_info.array)
422 reorder_by_corpus(sl, i);
423 }
424 }
425
426 void
anthy_relation_init(void)427 anthy_relation_init(void)
428 {
429 corpus_info.corpus_array = anthy_file_dic_get_section("corpus_array");
430 corpus_info.corpus_bucket = anthy_file_dic_get_section("corpus_bucket");
431 if (!corpus_info.corpus_array ||
432 !corpus_info.corpus_bucket) {
433 corpus_info.array = NULL;
434 corpus_info.bucket = NULL;
435 corpus_info.array_size = 0;
436 corpus_info.bucket_size = 0;
437 return ;
438 }
439 corpus_info.array_size = ntohl(((int *)corpus_info.corpus_array)[1]);
440 corpus_info.bucket_size = ntohl(((int *)corpus_info.corpus_bucket)[1]);
441 corpus_info.array = &(((int *)corpus_info.corpus_array)[16]);
442 corpus_info.bucket = &(((int *)corpus_info.corpus_bucket)[16]);
443 /*
444 {
445 int i;
446 for (i = 0; i < corpus_info.array_size; i++) {
447 int v = ntohl(corpus_info.array[i * 2]);
448 printf("%d: %d %d\n", i, v, v & CORPUS_KEY_MASK);
449 }
450 }
451 */
452 }
453