1 /*
2 * libpinyin
3 * Library to deal with pinyin.
4 *
5 * Copyright (C) 2006-2007, 2011 Peng Wu
6 *
7 * This program is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program. If not, see <http://www.gnu.org/licenses/>.
19 */
20
21 #ifdef HAVE_CONFIG_H
22 #include "config.h"
23 #endif
24
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include <locale.h>
29 #include <glib.h>
30 #include "pinyin_internal.h"
31 #include "utils_helper.h"
32
33 static gboolean train_pi_gram = TRUE;
34 static const gchar * bigram_filename = DELETED_BIGRAM;
35
36 static GOptionEntry entries[] =
37 {
38 {"skip-pi-gram-training", 0, G_OPTION_FLAG_REVERSE, G_OPTION_ARG_NONE, &train_pi_gram, "skip pi-gram training", NULL},
39 {"deleted-bigram-file", 0, 0, G_OPTION_ARG_FILENAME, &bigram_filename, "deleted bi-gram file", NULL},
40 {NULL}
41 };
42
43
main(int argc,char * argv[])44 int main(int argc, char * argv[]){
45 setlocale(LC_ALL, "");
46
47 GError * error = NULL;
48 GOptionContext * context;
49
50 context = g_option_context_new("- generate deleted n-gram");
51 g_option_context_add_main_entries(context, entries, NULL);
52 if (!g_option_context_parse(context, &argc, &argv, &error)) {
53 g_print("option parsing failed:%s\n", error->message);
54 exit(EINVAL);
55 }
56
57 SystemTableInfo2 system_table_info;
58
59 bool retval = system_table_info.load(SYSTEM_TABLE_INFO);
60 if (!retval) {
61 fprintf(stderr, "load table.conf failed.\n");
62 exit(ENOENT);
63 }
64
65 FacadePhraseIndex phrase_index;
66
67 const pinyin_table_info_t * phrase_files =
68 system_table_info.get_default_tables();
69
70 if (!load_phrase_index(phrase_files, &phrase_index))
71 exit(ENODATA);
72
73 Bigram bigram;
74 bigram.attach(bigram_filename, ATTACH_CREATE|ATTACH_READWRITE);
75
76 char* linebuf = NULL; size_t size = 0;
77 phrase_token_t last_token, cur_token = last_token = 0;
78 while( getline(&linebuf, &size, stdin) ){
79 if ( feof(stdin) )
80 break;
81
82 if ( '\n' == linebuf[strlen(linebuf) - 1] ) {
83 linebuf[strlen(linebuf) - 1] = '\0';
84 }
85
86 TAGLIB_PARSE_SEGMENTED_LINE(&phrase_index, token, linebuf);
87
88 last_token = cur_token;
89 cur_token = token;
90
91 /* skip null_token in second word. */
92 if ( null_token == cur_token )
93 continue;
94
95 /* skip pi-gram training. */
96 if ( null_token == last_token ){
97 if ( !train_pi_gram )
98 continue;
99 last_token = sentence_start;
100 }
101
102 /* train bi-gram */
103 SingleGram * single_gram = NULL;
104 bigram.load(last_token, single_gram);
105
106 if ( NULL == single_gram ){
107 single_gram = new SingleGram;
108 }
109 guint32 freq, total_freq;
110 //increase freq
111 if (single_gram->get_freq(cur_token, freq))
112 assert(single_gram->set_freq(cur_token, freq + 1));
113 else
114 assert(single_gram->insert_freq(cur_token, 1));
115 //increase total freq
116 single_gram->get_total_freq(total_freq);
117 single_gram->set_total_freq(total_freq + 1);
118
119 bigram.store(last_token, single_gram);
120 delete single_gram;
121 }
122
123 free(linebuf);
124 return 0;
125 }
126