1 /*
2 * 文節もしくは単語を一つ以上セットにしてmetawordとして扱う。
3 * ここでは各種のmetawordを生成する
4 *
5 * init_metaword_tab() metaword処理のための情報を構成する
6 * anthy_make_metaword_all() context中のmetawordを構成する
7 * anthy_print_metaword() 指定されたmetawordを表示する
8 *
9 * Funded by IPA未踏ソフトウェア創造事業 2001 10/29
10 * Copyright (C) 2000-2006 TABATA Yusuke
11 * Copyright (C) 2004-2006 YOSHIDA Yuichi
12 * Copyright (C) 2000-2003 UGAWA Tomoharu
13 */
14 #include <stdlib.h>
15 #include <stdio.h>
16 #include <math.h>
17
18 #include <anthy/record.h>
19 #include <anthy/splitter.h>
20 #include <anthy/xchar.h>
21 #include <anthy/xstr.h>
22 #include <anthy/segment.h>
23 #include <anthy/segclass.h>
24 #include "wordborder.h"
25
26 /* 各種meta_wordをどのように処理するか */
27 struct metaword_type_tab_ anthy_metaword_type_tab[] = {
28 {MW_DUMMY,"dummy",MW_STATUS_NONE,MW_CHECK_SINGLE},
29 {MW_SINGLE,"single",MW_STATUS_NONE,MW_CHECK_SINGLE},
30 {MW_WRAP,"wrap",MW_STATUS_WRAPPED,MW_CHECK_WRAP},
31 {MW_COMPOUND_HEAD,"compound_head",MW_STATUS_NONE,MW_CHECK_COMPOUND},
32 {MW_COMPOUND,"compound",MW_STATUS_NONE,MW_CHECK_NONE},
33 {MW_COMPOUND_LEAF,"compound_leaf",MW_STATUS_COMPOUND,MW_CHECK_NONE},
34 {MW_COMPOUND_PART,"compound_part",MW_STATUS_COMPOUND_PART,MW_CHECK_SINGLE},
35 {MW_V_RENYOU_A,"v_renyou_a",MW_STATUS_COMBINED,MW_CHECK_BORDER},
36 {MW_V_RENYOU_NOUN,"v_renyou_noun",MW_STATUS_COMBINED,MW_CHECK_BORDER},
37 {MW_NUMBER,"number",MW_STATUS_COMBINED,MW_CHECK_NUMBER},
38 {MW_OCHAIRE,"ochaire",MW_STATUS_OCHAIRE,MW_CHECK_OCHAIRE},
39 /**/
40 {MW_END,"end",MW_STATUS_NONE,MW_CHECK_NONE}
41 };
42
/* Declared ahead of its definition because try_combine_number() calls it. */
static void combine_metaword(struct splitter_context *sc, struct meta_word *mw);
45
46 /* コンテキスト中にmetawordを追加する */
47 static void
anthy_commit_meta_word(struct splitter_context * sc,struct meta_word * mw)48 anthy_commit_meta_word(struct splitter_context *sc, struct meta_word *mw)
49 {
50 struct word_split_info_cache *info = sc->word_split_info;
51 /* 同じ開始点を持つノードのリスト */
52 mw->next = info->cnode[mw->from].mw;
53 info->cnode[mw->from].mw = mw;
54 /**/
55 if (anthy_splitter_debug_flags() & SPLITTER_DEBUG_MW) {
56 anthy_print_metaword(sc, mw);
57 }
58 }
59
60 static void
print_metaword_features(int features)61 print_metaword_features(int features)
62 {
63 if (features & MW_FEATURE_SV) {
64 printf(":sv");
65 }
66 if (features & MW_FEATURE_WEAK_CONN) {
67 printf(":weak");
68 }
69 if (features & MW_FEATURE_SUFFIX) {
70 printf(":suffix");
71 }
72 if (features & MW_FEATURE_NUM) {
73 printf(":num");
74 }
75 if (features & MW_FEATURE_CORE1) {
76 printf(":c1");
77 }
78 if (features & MW_FEATURE_HIGH_FREQ) {
79 printf(":hf");
80 }
81 }
82
83 static void
anthy_do_print_metaword(struct splitter_context * sc,struct meta_word * mw,int indent)84 anthy_do_print_metaword(struct splitter_context *sc,
85 struct meta_word *mw,
86 int indent)
87 {
88 int i;
89 for (i = 0; i < indent; i++) {
90 printf(" ");
91 }
92 printf("*meta word type=%s(%d-%d):score=%d:seg_class=%s",
93 anthy_metaword_type_tab[mw->type].name,
94 mw->from, mw->len, mw->score,
95 anthy_seg_class_sym(mw->seg_class));
96 print_metaword_features(mw->mw_features);
97 printf(":can_use=%d*\n", mw->can_use);
98 if (mw->wl) {
99 anthy_print_word_list(sc, mw->wl);
100 }
101 if (mw->cand_hint.str) {
102 printf("(");
103 anthy_putxstr(&mw->cand_hint);
104 printf(")\n");
105 }
106 if (mw->mw1) {
107 anthy_do_print_metaword(sc, mw->mw1, indent + 1);
108 }
109 if (mw->mw2) {
110 anthy_do_print_metaword(sc, mw->mw2, indent + 1);
111 }
112 }
113
/*
 * Print the given meta word (and its children) to stdout, starting with
 * no indentation.
 */
void
anthy_print_metaword(struct splitter_context *sc,
                     struct meta_word *mw)
{
  const int top_level_indent = 0;

  anthy_do_print_metaword(sc, mw, top_level_indent);
}
120
121 static struct meta_word *
alloc_metaword(struct splitter_context * sc)122 alloc_metaword(struct splitter_context *sc)
123 {
124 struct meta_word *mw;
125 mw = anthy_smalloc(sc->word_split_info->MwAllocator);
126 mw->type = MW_SINGLE;
127 mw->score = 0;
128 mw->struct_score = 0;
129 mw->dep_word_hash = 0;
130 mw->core_wt = anthy_wt_none;
131 mw->mw_features = 0;
132 mw->dep_class = DEP_NONE;
133 mw->wl = NULL;
134 mw->mw1 = NULL;
135 mw->mw2 = NULL;
136 mw->cand_hint.str = NULL;
137 mw->cand_hint.len = 0;
138 mw->seg_class = SEG_HEAD;
139 mw->can_use = ok;
140 return mw;
141 }
142
143
144 /*
145 * wlの接頭辞部分と接尾辞部分を文字列として取り出す
146 */
147 static void
get_surrounding_text(struct splitter_context * sc,struct word_list * wl,xstr * xs_pre,xstr * xs_post)148 get_surrounding_text(struct splitter_context* sc,
149 struct word_list* wl,
150 xstr* xs_pre, xstr* xs_post)
151 {
152 int post_len = wl->part[PART_DEPWORD].len + wl->part[PART_POSTFIX].len;
153 int pre_len = wl->part[PART_PREFIX].len;
154
155 xs_pre->str = sc->ce[wl->from].c;
156 xs_pre->len = pre_len;
157 xs_post->str = sc->ce[wl->from + wl->len - post_len].c;
158 xs_post->len = post_len;
159 }
160
161 static int
count_vu(xstr * xs)162 count_vu(xstr *xs) {
163 int i, r = 0;
164 for (i = 0; i < xs->len; i++) {
165 if (xs->str[i] == KK_VU) {
166 r++;
167 }
168 }
169 return r;
170 }
171
172 /*
173 * 複合語であるwlからn番めの部分を取り出してmwにする
174 */
175 static struct meta_word*
make_compound_nth_metaword(struct splitter_context * sc,compound_ent_t ce,int nth,struct word_list * wl,enum metaword_type type)176 make_compound_nth_metaword(struct splitter_context* sc,
177 compound_ent_t ce, int nth,
178 struct word_list* wl,
179 enum metaword_type type)
180 {
181 int i;
182 int len = 0;
183 int from = wl->from;
184 int seg_num = anthy_compound_get_nr_segments(ce);
185 struct meta_word* mw;
186 xstr xs_pre, xs_core, xs_post;
187
188 get_surrounding_text(sc, wl, &xs_pre, &xs_post);
189
190 for (i = 0; i <= nth; ++i) {
191 xstr part;
192 from += len;
193 len = anthy_compound_get_nth_segment_len(ce, i);
194 part.str = sc->ce[from].c;
195 part.len = len;
196 len -= count_vu(&part);
197 if (i == 0) {
198 len += xs_pre.len;
199 }
200 if (i == seg_num - 1) {
201 len += xs_post.len;
202 }
203 }
204
205 mw = alloc_metaword(sc);
206 mw->from = from;
207 mw->len = len;
208 mw->type = type;
209 mw->score = 1000;
210 mw->seg_class = wl->seg_class;
211
212 anthy_compound_get_nth_segment_xstr(ce, nth, &xs_core);
213 if (nth == 0) {
214 anthy_xstrcat(&mw->cand_hint, &xs_pre);
215 }
216 anthy_xstrcat(&mw->cand_hint, &xs_core);
217 if (nth == seg_num - 1) {
218 anthy_xstrcat(&mw->cand_hint, &xs_post);
219 }
220 return mw;
221 }
222
223
224 /*
225 * metawordを実際に結合する
226 */
227 static struct meta_word *
anthy_do_cons_metaword(struct splitter_context * sc,enum metaword_type type,struct meta_word * mw,struct meta_word * mw2)228 anthy_do_cons_metaword(struct splitter_context *sc,
229 enum metaword_type type,
230 struct meta_word *mw, struct meta_word *mw2)
231 {
232 struct meta_word *n;
233
234 n = alloc_metaword(sc);
235 n->from = mw->from;
236 n->len = mw->len + (mw2 ? mw2->len : 0);
237
238 if (mw2) {
239 n->score = sqrt(mw->score) * sqrt(mw2->score);
240 } else {
241 n->score = mw->score;
242 }
243 n->type = type;
244 n->mw1 = mw;
245 n->mw2 = mw2;
246 if (mw2) {
247 n->seg_class = mw2->seg_class;
248 n->nr_parts = mw->nr_parts + mw2->nr_parts;
249 n->dep_word_hash = mw2->dep_word_hash;
250 } else {
251 n->seg_class = mw->seg_class;
252 n->nr_parts = mw->nr_parts;
253 n->dep_word_hash = mw->dep_word_hash;
254 }
255 anthy_commit_meta_word(sc, n);
256 return n;
257 }
258
259 /*
260 * 複合語用のmeta_wordを作成する。
261 */
262 static void
make_compound_metaword(struct splitter_context * sc,struct word_list * wl)263 make_compound_metaword(struct splitter_context* sc, struct word_list* wl)
264 {
265 int i, j;
266 seq_ent_t se = wl->part[PART_CORE].seq;
267 int ent_num = anthy_get_nr_dic_ents(se, NULL);
268
269 for (i = 0; i < ent_num; ++i) {
270 compound_ent_t ce;
271 int seg_num;
272 struct meta_word *mw = NULL;
273 struct meta_word *mw2 = NULL;
274 if (!anthy_get_nth_dic_ent_is_compound(se, i)) {
275 continue;
276 }
277 ce = anthy_get_nth_compound_ent(se, i);
278 seg_num = anthy_compound_get_nr_segments(ce);
279
280 for (j = seg_num - 1; j >= 0; --j) {
281 enum metaword_type type;
282 mw = make_compound_nth_metaword(sc, ce, j, wl, MW_COMPOUND_LEAF);
283 anthy_commit_meta_word(sc, mw);
284
285 type = j == 0 ? MW_COMPOUND_HEAD : MW_COMPOUND;
286 mw2 = anthy_do_cons_metaword(sc, type, mw, mw2);
287 }
288 }
289 }
290
291 /*
292 * 複合語の中の個々の文節を結合したmeta_wordを作成する。
293 */
294 static void
make_compound_part_metaword(struct splitter_context * sc,struct word_list * wl)295 make_compound_part_metaword(struct splitter_context* sc, struct word_list* wl)
296 {
297 int i, j, k;
298 seq_ent_t se = wl->part[PART_CORE].seq;
299 int ent_num = anthy_get_nr_dic_ents(se, NULL);
300
301 for (i = 0; i < ent_num; ++i) {
302 compound_ent_t ce;
303 int seg_num;
304 struct meta_word *mw = NULL;
305 struct meta_word *mw2 = NULL;
306
307 if (!anthy_get_nth_dic_ent_is_compound(se, i)) {
308 continue;
309 }
310
311 ce = anthy_get_nth_compound_ent(se, i);
312 seg_num = anthy_compound_get_nr_segments(ce);
313
314 /* 後ろから */
315 for (j = seg_num - 1; j >= 0; --j) {
316 mw = make_compound_nth_metaword(sc, ce, j, wl, MW_COMPOUND_PART);
317 for (k = j - 1; k >= 0; --k) {
318 mw2 = make_compound_nth_metaword(sc, ce, k, wl, MW_COMPOUND_PART);
319 mw2->len += mw->len;
320 mw2->score += mw->score;
321 anthy_xstrcat(&mw2->cand_hint, &mw->cand_hint);
322
323 anthy_commit_meta_word(sc, mw2);
324 mw = mw2;
325 }
326 }
327 }
328 }
329
330 /*
331 * 単文節単語
332 */
333 static void
make_simple_metaword(struct splitter_context * sc,struct word_list * wl)334 make_simple_metaword(struct splitter_context *sc, struct word_list* wl)
335 {
336 struct meta_word *mw = alloc_metaword(sc);
337 mw->wl = wl;
338 mw->from = wl->from;
339 mw->len = wl->len;
340 mw->score = 1000;
341 mw->type = MW_SINGLE;
342 mw->dep_class = wl->part[PART_DEPWORD].dc;
343 mw->seg_class = wl->seg_class;
344 if (wl->part[PART_CORE].len) {
345 mw->core_wt = wl->part[PART_CORE].wt;
346 }
347 mw->nr_parts = NR_PARTS;
348 mw->dep_word_hash = wl->dep_word_hash;
349 mw->mw_features = wl->mw_features;
350 anthy_commit_meta_word(sc, mw);
351 }
352
353 /*
354 * wordlist一個からなる、metawordを作成
355 */
356 static void
make_metaword_from_word_list(struct splitter_context * sc)357 make_metaword_from_word_list(struct splitter_context *sc)
358 {
359 int i;
360 for (i = 0; i < sc->char_count; i++) {
361 struct word_list *wl;
362 for (wl = sc->word_split_info->cnode[i].wl;
363 wl; wl = wl->next) {
364 if (wl->is_compound) {
365 make_compound_part_metaword(sc, wl);
366 make_compound_metaword(sc, wl);
367 } else {
368 make_simple_metaword(sc, wl);
369 }
370 }
371 }
372 }
373
374 /*
375 * metawordをリスト風に結合する
376 */
377 static struct meta_word *
list_metaword(struct splitter_context * sc,enum metaword_type type,struct meta_word * mw,struct meta_word * mw2)378 list_metaword(struct splitter_context *sc,
379 enum metaword_type type,
380 struct meta_word *mw, struct meta_word *mw2)
381 {
382 struct meta_word *wrapped_mw = anthy_do_cons_metaword(sc, type, mw2, NULL);
383 struct meta_word *n = anthy_do_cons_metaword(sc, type, mw, wrapped_mw);
384
385 n->mw_features = mw->mw_features | mw2->mw_features;
386
387 return n;
388 }
389
390 /*
391 * 動詞連用形 + 形容詞化接尾語 「〜しやすい」など
392 */
393 static void
try_combine_v_renyou_a(struct splitter_context * sc,struct meta_word * mw,struct meta_word * mw2)394 try_combine_v_renyou_a(struct splitter_context *sc,
395 struct meta_word *mw, struct meta_word *mw2)
396 {
397 wtype_t w2;
398 if (!mw->wl || !mw2->wl) return;
399
400 w2 = mw2->wl->part[PART_CORE].wt;
401
402 if (mw->wl->head_pos == POS_V &&
403 mw->wl->tail_ct == CT_RENYOU &&
404 anthy_wtype_get_pos(w2) == POS_D2KY) {
405 /* 形容詞ではあるので次のチェック */
406 if (anthy_get_seq_ent_wtype_freq(mw2->wl->part[PART_CORE].seq,
407 anthy_wtype_a_tail_of_v_renyou)) {
408 list_metaword(sc, MW_V_RENYOU_A, mw, mw2);
409 }
410 }
411 }
412
413 /*
414 * 動詞連用形 + 名詞化接尾語(#D2T35) 「入れ たて(のお茶)」など
415 */
416 static void
try_combine_v_renyou_noun(struct splitter_context * sc,struct meta_word * mw,struct meta_word * mw2)417 try_combine_v_renyou_noun(struct splitter_context *sc,
418 struct meta_word *mw, struct meta_word *mw2)
419 {
420 wtype_t w2;
421 if (!mw->wl || !mw2->wl) return;
422
423 w2 = mw2->wl->part[PART_CORE].wt;
424 if (mw->wl->head_pos == POS_V &&
425 mw->wl->tail_ct == CT_RENYOU &&
426 anthy_wtype_get_pos(w2) == POS_NOUN &&
427 anthy_wtype_get_scos(w2) == SCOS_T40) {
428 list_metaword(sc, MW_V_RENYOU_NOUN, mw, mw2);
429 }
430 }
431
432 /*
433 * 数字を結合する
434 */
435 static void
try_combine_number(struct splitter_context * sc,struct meta_word * mw1,struct meta_word * mw2)436 try_combine_number(struct splitter_context *sc,
437 struct meta_word *mw1, struct meta_word *mw2)
438 {
439 struct word_list *wl1 = mw1->wl;
440 struct word_list *wl2 = mw2->wl;
441 struct meta_word *combined_mw;
442 int recursive = wl2 ? 0 : 1; /* combinedなmwを結合する場合1 */
443
444 /* 左mwは数詞 */
445
446 if (anthy_wtype_get_pos(wl1->part[PART_CORE].wt) != POS_NUMBER) return;
447 if (recursive) {
448 /* 右mwは数字を結合したmw */
449 if (mw2->type != MW_NUMBER) return;
450 wl2 = mw2->mw1->wl;
451 } else {
452 /* 右mwは数詞 */
453 if (anthy_wtype_get_pos(wl2->part[PART_CORE].wt) != POS_NUMBER) return;
454 }
455 /* 左mwの後ろに文字が付いていなければ */
456 if (wl1->part[PART_POSTFIX].len == 0 &&
457 wl1->part[PART_DEPWORD].len == 0) {
458 int scos1 = anthy_wtype_get_scos(wl1->part[PART_CORE].wt);
459 int scos2 = anthy_wtype_get_scos(wl2->part[PART_CORE].wt);
460
461 /* #NNは対象外 */
462 if (scos2 == SCOS_NONE) return;
463 /*
464 左mwの種類によって、後ろにつくことができる右mwの種類が変わる
465 例えば一〜九の後ろには万〜九万、億〜九億しかつくことができないが、
466 十〜九十の後ろには、あわせて一〜九などもつくことができる
467 */
468 switch (scos1) {
469 case SCOS_N1:
470 if (scos2 == SCOS_N1) return; /* 後ろに一〜九がついてはいけない */
471 case SCOS_N10:
472 if (scos2 == SCOS_N10) return; /* 後ろに十〜九十がついてはいけない */
473 case SCOS_N100:
474 if (scos2 == SCOS_N100) return; /* 後ろに百〜九百がついてはいけない */
475 case SCOS_N1000:
476 if (scos2 == SCOS_N1000) return; /* 後ろに千〜九千がついてはいけない */
477 case SCOS_N10000:
478 /* 万〜九万、億〜九億…などは、
479 いつでも後ろにつくことができる */
480 break;
481 default:
482 return;
483 }
484
485 if (recursive) {
486 combined_mw = anthy_do_cons_metaword(sc, MW_NUMBER, mw1, mw2);
487 } else {
488 /* 初めて結合する場合は後ろにnullをつけてlistにする */
489 combined_mw = list_metaword(sc, MW_NUMBER, mw1, mw2);
490 }
491 combine_metaword(sc, combined_mw);
492 }
493 }
494
495 /* 右隣のmetawordと結合できるかチェック */
496 static void
try_combine_metaword(struct splitter_context * sc,struct meta_word * mw1,struct meta_word * mw2)497 try_combine_metaword(struct splitter_context *sc,
498 struct meta_word *mw1, struct meta_word *mw2)
499 {
500 if (!mw1->wl) return;
501
502 /* metawordの結合を行うためには、後続の
503 metawordに接頭辞がないことが必要 */
504 if (mw2->wl && mw2->wl->part[PART_PREFIX].len > 0) {
505 return;
506 }
507
508 try_combine_v_renyou_a(sc, mw1, mw2);
509 try_combine_v_renyou_noun(sc, mw1, mw2);
510 try_combine_number(sc, mw1, mw2);
511 }
512
513 static void
combine_metaword(struct splitter_context * sc,struct meta_word * mw)514 combine_metaword(struct splitter_context *sc, struct meta_word *mw)
515 {
516 struct word_split_info_cache *info = sc->word_split_info;
517 int i;
518
519 if (mw->mw_features & MW_FEATURE_DEP_ONLY) {
520 /* 付属語だけの文節とは結合しない */
521 return;
522 }
523
524 for (i = mw->from - 1; i >= 0; i--) {
525 struct meta_word *mw_left;
526 for (mw_left = info->cnode[i].mw; mw_left; mw_left = mw_left->next) {
527 if (mw_left->from + mw_left->len == mw->from) {
528 /* 結合できるかチェック */
529 try_combine_metaword(sc, mw_left, mw);
530 }
531 }
532 }
533 }
534
535 static void
combine_metaword_all(struct splitter_context * sc)536 combine_metaword_all(struct splitter_context *sc)
537 {
538 int i;
539
540 struct word_split_info_cache *info = sc->word_split_info;
541 /* metawordの左端によるループ */
542 for (i = sc->char_count - 1; i >= 0; i--){
543 struct meta_word *mw;
544 /* 各metawordのループ */
545 for (mw = info->cnode[i].mw;
546 mw; mw = mw->next) {
547 combine_metaword(sc, mw);
548 }
549 }
550 }
551
552 static void
make_dummy_metaword(struct splitter_context * sc,int from,int len,int orig_len)553 make_dummy_metaword(struct splitter_context *sc, int from,
554 int len, int orig_len)
555 {
556 int score = 0;
557 struct meta_word *mw, *n;
558
559 for (mw = sc->word_split_info->cnode[from].mw; mw; mw = mw->next) {
560 if (mw->len != orig_len) continue;
561 if (mw->score > score) {
562 score = mw->score;
563 }
564 }
565
566 n = alloc_metaword(sc);
567 n->type = MW_DUMMY;
568 n->from = from;
569 n->len = len;
570 n->score = 3 * score * len / orig_len;
571 if (mw) {
572 mw->nr_parts = 0;
573 }
574 anthy_commit_meta_word(sc, n);
575 }
576
577 /*
578 * 文節を伸ばしたらそれを覚えておく
579 */
580 static void
make_expanded_metaword_all(struct splitter_context * sc)581 make_expanded_metaword_all(struct splitter_context *sc)
582 {
583 int i, j;
584 if (anthy_select_section("EXPANDPAIR", 0) == -1) {
585 return ;
586 }
587 for (i = 0; i < sc->char_count; i++) {
588 for (j = 1; j < sc->char_count - i; j++) {
589 /* 全ての部分文字列に対して */
590 xstr xs;
591 xs.len = j;
592 xs.str = sc->ce[i].c;
593 if (anthy_select_row(&xs, 0) == 0) {
594 /* この部分文字列は過去に拡大の対象となった */
595 int k;
596 int nr = anthy_get_nr_values();
597 for (k = 0; k < nr; k++) {
598 xstr *exs;
599 exs = anthy_get_nth_xstr(k);
600 if (exs && exs->len <= sc->char_count - i) {
601 xstr txs;
602 txs.str = sc->ce[i].c;
603 txs.len = exs->len;
604 if (!anthy_xstrcmp(&txs, exs)) {
605 make_dummy_metaword(sc, i, txs.len, j);
606 }
607 }
608 }
609 }
610 }
611 }
612 }
613
614 /* お茶入れ学習のmetawordを作る */
615 static void
make_ochaire_metaword(struct splitter_context * sc,int from,int len)616 make_ochaire_metaword(struct splitter_context *sc,
617 int from, int len)
618 {
619 struct meta_word *mw;
620 int count;
621 int s;
622 int j;
623 int seg_len;
624 int mw_len = 0;
625 xstr* xs;
626
627 (void)len;
628
629 /* 文節数を取得 */
630 count = anthy_get_nth_value(0);
631 /* 一番右の文節をのぞいた文字数の合計を計算 */
632 for (s = 0, j = 0; j < count - 1; j++) {
633 s += anthy_get_nth_value(j * 2 + 1);
634 }
635 /* 一番右の文節のmetawordを構成 */
636 xs = anthy_get_nth_xstr((count - 1) * 2 + 2);
637 if (!xs) {
638 return ;
639 }
640 seg_len = anthy_get_nth_value((count - 1) * 2 + 1);
641 mw = alloc_metaword(sc);
642 mw->type = MW_OCHAIRE;
643 mw->from = from + s;
644 mw->len = seg_len;
645 mw->score = OCHAIRE_SCORE;
646 mw->cand_hint.str = malloc(sizeof(xchar)*xs->len);
647 anthy_xstrcpy(&mw->cand_hint, xs);
648 anthy_commit_meta_word(sc, mw);
649 mw_len += seg_len;
650 /* それ以外の文節でmetawordを構成 */
651 for (j-- ; j >= 0; j--) {
652 struct meta_word *n;
653 seg_len = anthy_get_nth_value(j * 2 + 1);
654 s -= seg_len;
655 xs = anthy_get_nth_xstr(j * 2 + 2);
656 if (!xs) {
657 return ;
658 }
659 n = alloc_metaword(sc);
660 n->type = MW_OCHAIRE;
661 /* 右のmetawordをつなぐ */
662 n->mw1 = mw;
663 n->from = from + s;
664 n->len = seg_len;
665 n->score = OCHAIRE_SCORE;
666 n->cand_hint.str = malloc(sizeof(xchar)*xs->len);
667 anthy_xstrcpy(&n->cand_hint, xs);
668 anthy_commit_meta_word(sc, n);
669 mw = n;
670 mw_len += seg_len;
671 }
672 }
673
674 /*
675 * 複数の文節の組を履歴から検索する
676 */
677 static void
make_ochaire_metaword_all(struct splitter_context * sc)678 make_ochaire_metaword_all(struct splitter_context *sc)
679 {
680 int i;
681 if (anthy_select_section("OCHAIRE", 0) == -1) {
682 return ;
683 }
684
685 for (i = 0; i < sc->char_count; i++) {
686 xstr xs;
687 xs.len = sc->char_count - i;
688 xs.str = sc->ce[i].c;
689 if (anthy_select_longest_row(&xs) == 0) {
690 xstr* key;
691 int len;
692 anthy_mark_row_used();
693 key = anthy_get_index_xstr();
694 len = key->len;
695
696 make_ochaire_metaword(sc, i, len);
697 /* 今回見つかった meta_word の次の文字から始める */
698 i += len - 1;
699 break;
700 }
701 }
702 }
703
704 static void
add_dummy_metaword(struct splitter_context * sc,int from)705 add_dummy_metaword(struct splitter_context *sc,
706 int from)
707 {
708 struct meta_word *n;
709 n = alloc_metaword(sc);
710 n->from = from;
711 n->len = 1;
712 n->type = MW_SINGLE;
713 n->score = 1;
714 n->seg_class = SEG_BUNSETSU;
715 anthy_commit_meta_word(sc, n);
716 }
717
718 /* 指定したmetawordをwrapしてj文字長いmeta_wordを作る */
719 static void
expand_meta_word(struct splitter_context * sc,struct meta_word * mw,int from,int len,int destroy_seg_class,int j)720 expand_meta_word(struct splitter_context *sc,
721 struct meta_word *mw, int from, int len,
722 int destroy_seg_class, int j)
723 {
724 struct meta_word *n;
725 n = alloc_metaword(sc);
726 n->from = from;
727 n->len = len + j;
728 if (mw) {
729 n->type = MW_WRAP;
730 n->mw1 = mw;
731 n->score = mw->score;
732 n->nr_parts = mw->nr_parts;
733 if (destroy_seg_class) {
734 n->seg_class = SEG_BUNSETSU;
735 n->score /= 10;
736 } else {
737 n->seg_class = mw->seg_class;
738 }
739 } else {
740 n->type = MW_SINGLE;
741 n->score = 1;
742 n->seg_class = SEG_BUNSETSU;
743 }
744 anthy_commit_meta_word(sc, n);
745 }
746
747 /*
748 * metawordの後ろの雑多な文字をくっつけたmetawordを構成する
749 */
750 static void
make_metaword_with_depchar(struct splitter_context * sc,struct meta_word * mw)751 make_metaword_with_depchar(struct splitter_context *sc,
752 struct meta_word *mw)
753 {
754 int j;
755 int destroy_seg_class = 0;
756 int from = mw ? mw->from : 0;
757 int len = mw ? mw->len : 0;
758
759 /* metawordの直後の文字の種類を調べる */
760 int type;
761 if (sc->char_count <= from + len) {
762 return ;
763 }
764 type = anthy_get_xchar_type(*sc->ce[from + len].c);
765 if (!(type & XCT_SYMBOL) &&
766 !(type & XCT_PART)) {
767 return;
768 }
769 if (type & XCT_PUNCTUATION) {
770 /* 句読点ならば別の文節にする */
771 return ;
772 }
773
774 /* 同じ種類の文字でなければくっつけるのをうちきり */
775 for (j = 0; from + len + j < sc->char_count; j++) {
776 int p = from + len + j;
777 if ((anthy_get_xchar_type(*sc->ce[p].c) != type)) {
778 break;
779 }
780 if (!(p + 1 < sc->char_count) ||
781 *sc->ce[p].c != *sc->ce[p + 1].c) {
782 destroy_seg_class = 1;
783 }
784 }
785
786 /* 上のループを抜けた時、jには独立できない文字の数が入っている */
787
788 /* 独立できない文字があるので、それを付けたmetawordを作る */
789 if (j > 0) {
790 expand_meta_word(sc, mw, from, len, destroy_seg_class, j);
791 }
792 }
793
794 static void
make_metaword_with_depchar_all(struct splitter_context * sc)795 make_metaword_with_depchar_all(struct splitter_context *sc)
796 {
797 int i;
798 struct word_split_info_cache *info = sc->word_split_info;
799
800 /* 全metawordに対して */
801 for (i = 0; i < sc->char_count; i++) {
802 struct meta_word *mw;
803 for (mw = info->cnode[i].mw;
804 mw; mw = mw->next) {
805 make_metaword_with_depchar(sc, mw);
806 }
807 if (!info->cnode[i].mw) {
808 /**/
809 add_dummy_metaword(sc, i);
810 }
811 }
812 /* 文の左端から始まるもの */
813 make_metaword_with_depchar(sc, NULL);
814 }
815
816 static int
is_single(xstr * xs)817 is_single(xstr* xs)
818 {
819 int i;
820 int xct;
821 for (i = xs->len - 1; i >= 1; --i) {
822 xct = anthy_get_xchar_type(xs->str[i]);
823 if (!(xct & XCT_PART)) {
824 return 0;
825 }
826 }
827 return 1;
828 }
829
830 static void
bias_to_single_char_metaword(struct splitter_context * sc)831 bias_to_single_char_metaword(struct splitter_context *sc)
832 {
833 int i;
834
835 for (i = sc->char_count - 1; i >= 0; --i) {
836 struct meta_word *mw;
837 xstr xs;
838 int xct;
839
840 struct char_node *cnode = &sc->word_split_info->cnode[i];
841
842 /* カッコの場合は一文字で文節を構成できる */
843 xct = anthy_get_xchar_type(*sc->ce[i].c);
844 if (xct & (XCT_OPEN|XCT_CLOSE)) {
845 continue;
846 }
847
848 xs.str = sc->ce[i].c;
849 for (mw = cnode->mw; mw; mw = mw->next) {
850 /* 付属語のみの文節は減点しない */
851 if (mw->mw_features & MW_FEATURE_DEP_ONLY) {
852 continue;
853 }
854 /* 一文字(+直前につながる文字の繰り返し)のスコアを下げる */
855 xs.len = mw->len;
856 if (is_single(&xs)) {
857 mw->score /= 10;
858 }
859 }
860 }
861 }
862
863 void
anthy_mark_border_by_metaword(struct splitter_context * sc,struct meta_word * mw)864 anthy_mark_border_by_metaword(struct splitter_context* sc,
865 struct meta_word* mw)
866 {
867 struct word_split_info_cache* info = sc->word_split_info;
868 if (!mw) return;
869
870 switch (mw->type) {
871 case MW_DUMMY:
872 /* BREAK THROUGH */
873 case MW_SINGLE:
874 /* BREAK THROUGH */
875 case MW_COMPOUND_PART:
876 info->seg_border[mw->from] = 1;
877 break;
878 case MW_COMPOUND_LEAF:
879 info->seg_border[mw->from] = 1;
880 info->best_mw[mw->from] = mw;
881 mw->can_use = ok;
882 break;
883 case MW_COMPOUND_HEAD:
884 /* BREAK THROUGH */
885 case MW_COMPOUND:
886 /* BREAK THROUGH */
887 case MW_NUMBER:
888 info->best_mw[mw->mw1->from] = mw->mw1;
889 anthy_mark_border_by_metaword(sc, mw->mw1);
890 anthy_mark_border_by_metaword(sc, mw->mw2);
891 break;
892 case MW_V_RENYOU_A:
893 /* BREAK THROUGH */
894 case MW_V_RENYOU_NOUN:
895 info->seg_border[mw->from] = 1;
896 break;
897 case MW_WRAP:
898 anthy_mark_border_by_metaword(sc, mw->mw1);
899 break;
900 case MW_OCHAIRE:
901 info->seg_border[mw->from] = 1;
902 anthy_mark_border_by_metaword(sc, mw->mw1);
903 break;
904 default:
905 break;
906 }
907 }
908
909 void
anthy_make_metaword_all(struct splitter_context * sc)910 anthy_make_metaword_all(struct splitter_context *sc)
911 {
912 /* まず、word_list一個のmetawordを作る */
913 make_metaword_from_word_list(sc);
914
915 /* metawordを結合する */
916 combine_metaword_all(sc);
917
918 /* 拡大された文節を処理する */
919 make_expanded_metaword_all(sc);
920
921 /* 濁点や長音などの記号、その他の記号を処理 */
922 make_metaword_with_depchar_all(sc);
923
924 /* おちゃをいれる */
925 make_ochaire_metaword_all(sc);
926
927 /* 一文字の文節は減点 */
928 bias_to_single_char_metaword(sc);
929 }
930
931 /*
932 * 指定された領域をカバーするmetawordを数える
933 */
934 int
anthy_get_nr_metaword(struct splitter_context * sc,int from,int len)935 anthy_get_nr_metaword(struct splitter_context *sc,
936 int from, int len)
937 {
938 struct meta_word *mw;
939 int n;
940
941 for (n = 0, mw = sc->word_split_info->cnode[from].mw;
942 mw; mw = mw->next) {
943 if (mw->len == len && mw->can_use == ok) {
944 n++;
945 }
946 }
947 return n;
948 }
949
950 struct meta_word *
anthy_get_nth_metaword(struct splitter_context * sc,int from,int len,int nth)951 anthy_get_nth_metaword(struct splitter_context *sc,
952 int from, int len, int nth)
953 {
954 struct meta_word *mw;
955 int n;
956 for (n = 0, mw = sc->word_split_info->cnode[from].mw;
957 mw; mw = mw->next) {
958 if (mw->len == len && mw->can_use == ok) {
959 if (n == nth) {
960 return mw;
961 }
962 n++;
963 }
964 }
965 return NULL;
966 }
967