1 /*
2  * 文節の遷移行列を作成する
3  *
4  * このコマンドは二つの機能を持っている。(-cオプションで制御)
5  * (1) proccorpusの結果からテキスト形式で経験的確率の表を作る
6  * (2) テキスト形式の表からバイナリ形式に変換する
7  *
8  * morphological-analyzerの出力には下記のマークが付けてある
9  * ~ 候補の誤り
10  * ! 文節長の誤り
11  * ^ 複合文節の2つめ以降の要素
12  *
13  * generate transition matrix
14  *
15  * Copyright (C) 2006 HANAOKA Toshiyuki
16  * Copyright (C) 2006-2007 TABATA Yusuke
17  *
18  */
19 /*
20   This library is free software; you can redistribute it and/or
21   modify it under the terms of the GNU Lesser General Public
22   License as published by the Free Software Foundation; either
23   version 2 of the License, or (at your option) any later version.
24 
25   This library is distributed in the hope that it will be useful,
26   but WITHOUT ANY WARRANTY; without even the implied warranty of
27   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
28   Lesser General Public License for more details.
29 
30   You should have received a copy of the GNU Lesser General Public
31   License along with this library; if not, write to the Free Software
32   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA
33  */
34 #include <stdio.h>
35 #include <string.h>
36 #include <stdlib.h>
37 #include <math.h>
38 
39 #include <anthy/anthy.h>
40 #include <anthy/xstr.h>
41 #include <anthy/feature_set.h>
42 #include <anthy/diclib.h>
43 #include "input_set.h"
44 #include <anthy/corpus.h>
45 
46 #define FEATURE_SET_SIZE NR_EM_FEATURES
47 
48 #define ARRAY_SIZE 16
49 
50 struct array {
51   int len;
52   int f[ARRAY_SIZE];
53 };
54 
55 #define MAX_SEGMENT 64
56 
57 struct segment_info {
58   int orig_hash;
59   int hash;
60 };
61 
62 struct sentence_info {
63   int nr_segments;
64   struct segment_info segs[MAX_SEGMENT];
65 };
66 
67 /* 確率のテーブル */
68 struct input_info {
69   /* 候補全体の素性 */
70   struct input_set *cand_is;
71   /* 文節の素性 */
72   struct input_set *seg_is;
73   /* 自立語の全文検索用情報 */
74   struct corpus *indep_corpus;
75 
76   /**/
77   struct array missed_cand_features;
78 
79   /**/
80   int nth_input_file;
81 
82   /* 入力された例文の量に関する情報 */
83   int nr_sentences;
84   int nr_connections;
85 };
86 
87 static struct input_info *
init_input_info(void)88 init_input_info(void)
89 {
90   struct input_info *m;
91   m = malloc(sizeof(struct input_info));
92   m->seg_is = input_set_create();
93   m->cand_is = input_set_create();
94   m->indep_corpus = corpus_new();
95   m->missed_cand_features.len = 0;
96   m->nth_input_file = 0;
97   m->nr_sentences = 0;
98   m->nr_connections = 0;
99   return m;
100 }
101 
102 /* features=1,2,3,,の形式をparseする */
103 static void
parse_features(struct array * features,char * s)104 parse_features(struct array *features, char *s)
105 {
106   char *tok, *str = s;
107   tok = strtok(str, ",");
108   features->len = 0;
109   do {
110     features->f[features->len] = atoi(tok);
111     features->len++;
112     tok = strtok(NULL, ",");
113   } while(tok);
114 }
115 
116 static void
add_seg_struct_info(struct input_info * m,struct array * features,int weight)117 add_seg_struct_info(struct input_info *m,
118 		    struct array *features,
119 		    int weight)
120 {
121   input_set_set_features(m->cand_is, features->f, features->len, weight);
122 }
123 
124 static void
set_hash(struct sentence_info * sinfo,int error_class,char tag,int hash)125 set_hash(struct sentence_info *sinfo, int error_class,
126 	 char tag, int hash)
127 {
128   if (tag == '~') {
129     sinfo->segs[sinfo->nr_segments].orig_hash = hash;
130   } else {
131     sinfo->segs[sinfo->nr_segments].hash = hash;
132   }
133   if (!error_class) {
134     sinfo->nr_segments++;
135   }
136 }
137 
138 static int
compare_array(struct array * a1,struct array * a2)139 compare_array(struct array *a1, struct array *a2)
140 {
141   int i;
142   if (a1->len != a2->len) {
143     return 1;
144   }
145   for (i = 0; i < a1->len; i++) {
146     if (a1->f[i] != a2->f[i]) {
147       return 1;
148     }
149   }
150   return 0;
151 }
152 
153 /* 自立語の行をparseする */
154 static void
parse_indep(struct input_info * m,struct sentence_info * sinfo,char * line,char * buf,int error_class)155 parse_indep(struct input_info *m, struct sentence_info *sinfo,
156 	    char *line, char *buf, int error_class)
157 {
158   struct array features;
159   char *s;
160   int weight = 1;
161   /**/
162   s = strstr(buf, "features=");
163   if (s) {
164     s += 9;
165     parse_features(&features, s);
166     m->nr_connections ++;
167   }
168   s = strstr(buf, "hash=");
169   if (s) {
170     s += 5;
171     set_hash(sinfo, error_class, line[0], atoi(s));
172   }
173 
174   /* 加算する */
175   if (error_class) {
176     if (line[0] == '~') {
177       /* 誤った候補の構造を保存 */
178       m->missed_cand_features = features;
179     }
180     if (line[0] == '!') {
181       /* 文節長の誤り */
182       input_set_set_features(m->seg_is, features.f, features.len, -weight);
183     }
184   } else {
185     /* 接続行列 */
186     input_set_set_features(m->seg_is, features.f, features.len, weight);
187     /* 候補の構造 */
188     if (m->missed_cand_features.len != 0 &&
189 	compare_array(&features, &m->missed_cand_features)) {
190       /* 正解と異なる構造なら分母に加算 */
191       add_seg_struct_info(m, &m->missed_cand_features, -weight);
192     }
193     m->missed_cand_features.len = 0;
194     add_seg_struct_info(m, &features, weight);
195   }
196 }
197 
198 static void
init_sentence_info(struct sentence_info * sinfo)199 init_sentence_info(struct sentence_info *sinfo)
200 {
201   int i;
202   sinfo->nr_segments = 0;
203   for (i = 0; i < MAX_SEGMENT; i++) {
204     sinfo->segs[i].orig_hash = 0;
205     sinfo->segs[i].hash = 0;
206   }
207 }
208 
209 /* 一つの文を読んだときに全文検索用のデータを作る
210  */
211 static void
complete_sentence_info(struct input_info * m,struct sentence_info * sinfo)212 complete_sentence_info(struct input_info *m, struct sentence_info *sinfo)
213 {
214   int i;
215   if (m->nth_input_file > 0) {
216     /* 二つめ以降の入力ファイルは使わない */
217     return ;
218   }
219   for (i = 0; i < sinfo->nr_segments; i++) {
220     int flags = ELM_NONE;
221     int nr = 1;
222     int buf[2];
223     if (i == 0) {
224       flags |= ELM_BOS;
225     }
226     /**/
227     buf[0] = sinfo->segs[i].hash;
228     if (sinfo->segs[i].orig_hash) {
229       /*
230       buf[1] = sinfo->segs[i].orig_hash;
231       nr ++;
232       */
233     }
234     corpus_push_back(m->indep_corpus, buf, nr, flags);
235   }
236 }
237 
238 static void
do_read_file(struct input_info * m,FILE * fp)239 do_read_file(struct input_info *m, FILE *fp)
240 {
241   char line[1024];
242   struct sentence_info sinfo;
243 
244   init_sentence_info(&sinfo);
245 
246   while (fgets(line, 1024, fp)) {
247     char *buf = line;
248     int error_class = 0;
249     if (!strncmp(buf, "eos", 3)) {
250       m->nr_sentences ++;
251       complete_sentence_info(m, &sinfo);
252       init_sentence_info(&sinfo);
253     }
254     if (line[0] == '~' || line[0] == '!' ||
255 	line[0] == '^') {
256       buf ++;
257       error_class = 1;
258     }
259     if (!strncmp(buf, "indep_word", 10) ||
260 	!strncmp(buf, "eos", 3)) {
261       parse_indep(m, &sinfo, line, buf, error_class);
262     }
263   }
264 }
265 
266 static void
read_file(struct input_info * m,char * fn)267 read_file(struct input_info *m, char *fn)
268 {
269   FILE *ifp;
270   ifp = fopen(fn, "r");
271   if (!ifp) {
272     fprintf(stderr, "failed to open (%s)\n", fn);
273     exit (1);
274   }
275   do_read_file(m, ifp);
276   fclose(ifp);
277 }
278 
279 static void
write_nl(FILE * fp,int i)280 write_nl(FILE *fp, int i)
281 {
282   i = anthy_dic_htonl(i);
283   fwrite(&i, sizeof(int), 1, fp);
284 }
285 
286 static void
dump_line(FILE * ofp,struct input_line * il)287 dump_line(FILE *ofp, struct input_line *il)
288 {
289   int i;
290   for (i = 0; i < FEATURE_SET_SIZE || i < il->nr_features; i++) {
291     if (i) {
292       fprintf(ofp, ", ");
293     }
294     if (i < il->nr_features) {
295       fprintf(ofp, "%d", il->features[i]);
296     } else {
297       fprintf(ofp, "0");
298     }
299   }
300   fprintf(ofp,",%d,%d\n", (int)il->negative_weight, (int)il->weight);
301 }
302 
303 static int
compare_line(const void * p1,const void * p2)304 compare_line(const void *p1, const void *p2)
305 {
306   const struct input_line *const *il1 = p1;
307   const struct input_line *const *il2 = p2;
308   int i;
309   for (i = 0; i < (*il1)->nr_features &&
310 	 i < (*il2)->nr_features; i++) {
311     if ((*il1)->features[i] !=
312 	(*il2)->features[i]) {
313       return (*il1)->features[i] - (*il2)->features[i];
314     }
315   }
316   return (*il1)->nr_features - (*il2)->nr_features;
317 }
318 
319 static void
dump_features(FILE * ofp,struct input_set * is)320 dump_features(FILE *ofp, struct input_set *is)
321 {
322   struct input_line *il, **lines;
323   int i, nr = 0;
324   int weight = 0;
325 
326   /* count lines */
327   for (il = input_set_get_input_line(is); il; il = il->next_line) {
328     nr ++;
329     weight += (int)il->weight;
330   }
331   /* copy lines */
332   lines = malloc(sizeof(struct input_line *) * nr);
333   for (il = input_set_get_input_line(is), i = 0; i < nr;
334        i++, il = il->next_line) {
335     lines[i] = il;
336   }
337   /* sort */
338   qsort(lines, nr, sizeof(struct input_line *), compare_line);
339   /* output */
340   fprintf(ofp, "%d %d total_line_weight,count\n", weight, nr);
341   /**/
342   for (i = 0; i < nr; i++) {
343     dump_line(ofp, lines[i]);
344   }
345 }
346 
347 static void
dump_input_info(FILE * ofp,struct input_info * m)348 dump_input_info(FILE *ofp, struct input_info *m)
349 {
350   fprintf(ofp, "section anthy.trans_info ");
351   dump_features(ofp, m->seg_is);
352   fprintf(ofp, "section anthy.cand_info ");
353   dump_features(ofp, m->cand_is);
354   fprintf(ofp, "section anthy.corpus_bucket ");
355   corpus_write_bucket(ofp, m->indep_corpus);
356   fprintf(ofp, "section anthy.corpus_array ");
357   corpus_write_array(ofp, m->indep_corpus);
358   /**/
359   fprintf(ofp, "section anthy.feature_info ");
360   input_set_output_feature_freq(ofp, m->seg_is);
361 }
362 
363 static void
convert_line(FILE * ofp,char * buf)364 convert_line(FILE *ofp, char *buf)
365 {
366   char *tok;
367   tok = strtok(buf, ",");
368   do {
369     int n = atoi(tok);
370     write_nl(ofp, n);
371     tok = strtok(NULL, ",");
372   } while (tok);
373 }
374 
375 static void
convert_file(FILE * ifp)376 convert_file(FILE *ifp)
377 {
378   char buf[1024];
379   FILE *ofp = NULL;
380   while (fgets(buf, 1024, ifp)) {
381     /**/
382     if (buf[0] == '#') {
383       continue;
384     }
385     if (!strncmp("section", buf, 7)) {
386       int w, n, i;
387       char fn[1024];
388       if (ofp) {
389 	fclose(ofp);
390 	ofp = NULL;
391       }
392       sscanf(buf, "section %s %d %d", fn, &w, &n);
393       ofp = fopen(fn, "w");
394       if (!ofp) {
395 	fprintf(stderr, "failed to open (%s)\n", fn);
396 	exit (1);
397       }
398       write_nl(ofp, w);
399       write_nl(ofp, n);
400       for (i = 0; i < NR_EM_FEATURES; i++) {
401 	write_nl(ofp, 0);
402       }
403     } else {
404       convert_line(ofp, buf);
405     }
406   }
407   if (ofp) {
408     fclose(ofp);
409   }
410 }
411 
412 static void
convert_data(int nr_fn,char ** fns)413 convert_data(int nr_fn, char **fns)
414 {
415   FILE *ifp;
416   int i;
417   /**/
418   for (i = 0; i < nr_fn; i++) {
419     ifp = fopen(fns[i], "r");
420     if (!ifp) {
421       fprintf(stderr, "failed to open (%s)\n", fns[i]);
422       exit (1);
423     }
424     convert_file(ifp);
425     fclose(ifp);
426   }
427 }
428 
429 /**/
430 #define STRING_HASH_SIZE 256
431 struct string_node {
432   int key;
433   char *str;
434   struct string_node *next_hash;
435 };
436 struct string_pool {
437   int nr;
438   struct string_node hash[STRING_HASH_SIZE];
439   struct string_node **array;
440 };
441 struct resize_info {
442   char *indep;
443   int valid;
444 };
445 struct extract_stat {
446   int nr;
447   struct resize_info info[MAX_SEGMENT];
448 };
449 
450 static void
string_pool_init(struct string_pool * sp)451 string_pool_init(struct string_pool *sp)
452 {
453   int i;
454   for (i = 0; i < STRING_HASH_SIZE; i++) {
455     sp->hash[i].next_hash = NULL;
456   }
457   sp->nr = 0;
458 }
459 
460 static int
compare_string_node(const void * p1,const void * p2)461 compare_string_node(const void *p1, const void *p2)
462 {
463   const struct string_node *const *n1 = p1;
464   const struct string_node *const *n2 = p2;
465   return (*n1)->key -(*n2)->key;
466 }
467 
468 static void
string_pool_sort(struct string_pool * sp)469 string_pool_sort(struct string_pool *sp)
470 {
471   int idx, h;
472   sp->array = malloc(sizeof(struct string_node *) * sp->nr);
473   for (idx = 0, h = 0; h < STRING_HASH_SIZE; h++) {
474     struct string_node *node;
475     for (node = sp->hash[h].next_hash; node; node = node->next_hash) {
476       sp->array[idx] = node;
477       idx ++;
478     }
479   }
480   /**/
481   qsort(sp->array, sp->nr, sizeof(struct string_node *), compare_string_node);
482 }
483 
484 static void
string_pool_dump(FILE * ofp,struct string_pool * sp)485 string_pool_dump(FILE *ofp, struct string_pool *sp)
486 {
487   int i;
488   fprintf(ofp, "section anthy.weak_words 0 %d\n", sp->nr);
489   for (i = 0; i < sp->nr; i++) {
490     fprintf(ofp, "%d\n", sp->array[i]->key);
491   }
492 }
493 
494 static unsigned int
string_hash(const unsigned char * str)495 string_hash(const unsigned char *str)
496 {
497   unsigned int h = 0;
498   while (*str) {
499     h += *str;
500     h *= 13;
501     str ++;
502   }
503   return h % STRING_HASH_SIZE;
504 }
505 
506 static struct string_node *
find_string_node(struct string_pool * sp,const char * str)507 find_string_node(struct string_pool *sp, const char *str)
508 {
509   int h = (int)string_hash((const unsigned char *)str);
510   struct string_node *node;
511   for (node = sp->hash[h].next_hash; node; node = node->next_hash) {
512     if (!strcmp(str, node->str)) {
513       return node;
514     }
515   }
516   /* allocate new */
517   node = malloc(sizeof(*node));
518   node->str = strdup(str);
519   node->key = 0;
520   node->next_hash = sp->hash[h].next_hash;
521   sp->hash[h].next_hash = node;
522   sp->nr ++;
523   return node;
524 }
525 
526 static void
flush_extract_stat(struct extract_stat * es,struct string_pool * sp)527 flush_extract_stat(struct extract_stat *es, struct string_pool *sp)
528 {
529   int i;
530   for (i = 0; i < es->nr; i++) {
531     if (es->info[i].valid) {
532       struct string_node *node;
533       node = find_string_node(sp, es->info[i].indep);
534       if (node->key == 0) {
535 	xstr *xs = anthy_cstr_to_xstr(node->str, ANTHY_UTF8_ENCODING);
536 	node->key = anthy_xstr_hash(xs);
537 	anthy_free_xstr(xs);
538       }
539       /* printf("(%s)%d\n", es->info[i].indep, node->key); */
540     }
541     free(es->info[i].indep);
542     es->info[i].indep = NULL;
543   }
544   es->nr = 0;
545 }
546 
547 static char *
get_indep_part(char * buf)548 get_indep_part(char *buf)
549 {
550   int len;
551   char *c = strchr(buf, '#');
552   if (!c) {
553     return NULL;
554   }
555   c = strchr(c, ' ');
556   if (!c) {
557     return NULL;
558   }
559   c++;
560   c = strchr(c, ' ');
561   if (!c) {
562     return NULL;
563   }
564   c++;
565   len = strlen(c);
566   c[len-1] = 0;
567   return c;
568 }
569 
570 static void
fixup_missed_word(struct extract_stat * es,char * buf)571 fixup_missed_word(struct extract_stat *es, char *buf)
572 {
573   int i;
574   char *c = get_indep_part(buf);
575   if (!c) {
576     return ;
577   }
578   for (i = 0; i < es->nr; i++) {
579     if (!strcmp(es->info[i].indep, c)) {
580       es->info[i].valid = 0;
581     }
582   }
583 }
584 
585 static void
fill_missed_word(struct extract_stat * es,char * buf)586 fill_missed_word(struct extract_stat *es, char *buf)
587 {
588   char *c = get_indep_part(buf);
589   if (!c) {
590     return ;
591   }
592   es->info[es->nr].indep = strdup(c);
593   es->info[es->nr].valid = 1;
594   es->nr++;
595 }
596 
597 static void
extract_word_from_file(FILE * ifp,struct string_pool * sp)598 extract_word_from_file(FILE *ifp, struct string_pool *sp)
599 {
600   int i;
601   char buf[1024];
602   struct extract_stat es;
603   /**/
604   es.nr = 0;
605   for (i = 0; i < MAX_SEGMENT; i++) {
606     es.info[i].indep = NULL;
607   }
608   /**/
609   while (fgets(buf, 1024, ifp)) {
610     if (buf[0] == '#') {
611       continue;
612     }
613     if (buf[0] == '\n' ||
614 	buf[0] == ' ') {
615       flush_extract_stat(&es, sp);
616       continue;
617     }
618     /**/
619     if (!strncmp("!indep_word ", buf, 12)) {
620       fill_missed_word(&es, buf);
621     }
622     if (!strncmp("indep_word", buf, 10)) {
623       fixup_missed_word(&es, buf);
624     }
625   }
626   flush_extract_stat(&es, sp);
627 }
628 
629 static void
extract_word(int nr_fn,char ** fns,FILE * ofp)630 extract_word(int nr_fn, char **fns, FILE *ofp)
631 {
632   struct string_pool sp;
633   FILE *ifp;
634   int i;
635   /**/
636   string_pool_init(&sp);
637   /**/
638   for (i = 0; i < nr_fn; i++) {
639     ifp = fopen(fns[i], "r");
640     if (!ifp) {
641       fprintf(stderr, "failed to open (%s)\n", fns[i]);
642       exit (1);
643     }
644     extract_word_from_file(ifp, &sp);
645     fclose(ifp);
646   }
647   /**/
648   string_pool_sort(&sp);
649   string_pool_dump(ofp, &sp);
650 }
651 
652 /* 変換結果から確率のテーブルを作る */
653 static void
proc_corpus(int nr_fn,char ** fns,FILE * ofp)654 proc_corpus(int nr_fn, char **fns, FILE *ofp)
655 {
656   int i;
657   struct input_info *m;
658   /**/
659   m = init_input_info();
660   /**/
661   for (i = 0; i < nr_fn; i++) {
662     m->nth_input_file = i;
663     read_file(m, fns[i]);
664   }
665 
666   corpus_build(m->indep_corpus);
667   /**/
668   dump_input_info(ofp, m);
669   /**/
670   fprintf(stderr, " %d sentences\n", m->nr_sentences);
671   fprintf(stderr, " %d connections\n", m->nr_connections);
672   fprintf(stderr, " %d segments\n", m->nr_connections - m->nr_sentences);
673 }
674 
675 int
main(int argc,char ** argv)676 main(int argc, char **argv)
677 {
678   FILE *ofp;
679   int i;
680   int nr_input = 0;
681   char **input_files;
682   int convert = 0;
683   int extract = 0;
684 
685   ofp = NULL;
686   input_files = malloc(sizeof(char *) * argc);
687 
688   for (i = 1; i < argc; i++) {
689     char *arg = argv[i];
690     if (!strcmp(arg, "-o")) {
691       ofp = fopen(argv[i+1], "w");
692       if (!ofp) {
693 	fprintf(stderr, "failed to open (%s)\n", argv[i+1]);
694 	exit (1);
695       }
696       i ++;
697     } else if (!strcmp(arg, "-c") ||
698 	       !strcmp(arg, "--convert")) {
699       convert = 1;
700     } else if (!strcmp(arg, "-e") ||
701 	       !strcmp(arg, "--extract")) {
702       extract = 1;
703     } else {
704       input_files[nr_input] = arg;
705       nr_input ++;
706     }
707   }
708   if (extract) {
709     printf(" -- extracting missed words\n");
710     if (!ofp) {
711       ofp = stdout;
712     }
713     extract_word(nr_input, input_files, ofp);
714     return 0;
715   }
716   if (ofp) {
717     printf(" -- generating dictionary in text form\n");
718     proc_corpus(nr_input, input_files, ofp);
719     fclose(ofp);
720   }
721   if (convert) {
722     printf(" -- converting dictionary from text to binary form\n");
723     convert_data(nr_input, input_files);
724   }
725 
726   return 0;
727 }
728