1 /*
2 * libpinyin
3 * Library to deal with pinyin.
4 *
5 * Copyright (C) 2017 Peng Wu <alexepico@gmail.com>
6 *
7 * This program is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program. If not, see <http://www.gnu.org/licenses/>.
19 */
20
21
22 #ifdef HAVE_CONFIG_H
23 #include "config.h"
24 #endif
25
26 #include <stdio.h>
27 #include <glib.h>
28 #include "pinyin_internal.h"
29
30
/**
 * print_help:
 *
 * Write the command-line usage summary to standard output.
 */
void print_help(){
    /* Adjacent literals are concatenated by the compiler into one message. */
    static const char usage[] =
        "Usage: gen_pinyin_table -t <PHRASE_INDEX> \n"
        "-o <OUTPUTFILE> <FILE1> <FILE2> .. <FILEn>\n"
        "<OUTPUTFILE> the result output file\n"
        "<FILEi> input pinyin files\n"
        "<PHRASE_INDEX> phrase index identifier\n";

    /* The text contains no conversion specifiers, so it is emitted verbatim. */
    fputs(usage, stdout);
}
38
39
40 static gint phrase_index = 0;
41 static const gchar * outputfile = "temp.out";
42
43 static GOptionEntry entries[] =
44 {
45 {"phraseindex", 't', 0, G_OPTION_ARG_INT, &phrase_index, "phrase index", NULL},
46 {"outputfile", 'o', 0, G_OPTION_ARG_FILENAME, &outputfile, "output filename", NULL},
47 {NULL}
48 };
49
50
51 using namespace pinyin;
52
/* Map from phrase_item (key) to a GArray of chewing_and_freq_item (value).
 * Created in main() with phrase_item_compare as the ordering function and
 * filled by feed_line(). */
GTree * g_chewing_tree;
/* One GArray of phrase_and_array_item per phrase length, indexed by that
 * length.  Slot 0 is set to NULL in main(); slots 1..MAX_PHRASE_LENGTH are
 * allocated there and filled by store_one_item(). */
GArray * g_item_array[MAX_PHRASE_LENGTH + 1];
57
/* A phrase as a UCS-4 string; used as the key of g_chewing_tree. */
struct phrase_item{
    /* number of characters in the phrase (from g_utf8_strlen in feed_line) */
    size_t length;
    /* UCS-4 text produced by g_utf8_to_ucs4 in feed_line */
    gunichar * uniphrase;
};
62
/* One pronunciation of a phrase together with its frequency. */
struct chewing_and_freq_item{
    /* keys filled in by the parser in feed_line, one per phrase character */
    ChewingKeyVector keys;
    /* filled in by the same parse call; same length as keys */
    ChewingKeyRestVector key_rests;
    /* frequency; feed_line adds to it when the same pronunciation repeats */
    guint32 freq;
};
68
/* A flattened (key, value) pair of g_chewing_tree; this is the element type
 * of the arrays held in g_item_array. */
struct phrase_and_array_item{
    phrase_item phrase; /* the key of g_chewing_tree */
    /* Array of chewing_and_freq_item */
    GArray * chewing_and_freq_array; /* the value of g_chewing_tree */
};
74
75
/* Read one input file line by line and pass each well-formed line to
 * feed_line(). */
void feed_file(const char * filename);

/* Record one (phrase, pinyin, freq) entry in g_chewing_tree. */
void feed_line(const char * phrase, const char * pinyin, const guint32 freq);

/* g_tree_foreach() callback: copy one tree node into g_item_array. */
gboolean store_one_item(gpointer key, gpointer value, gpointer data);

/* g_array_sort_with_data() callback ordering phrase_and_array_item entries;
 * userdata points to an int holding the phrase length. */
int phrase_array_compare(gconstpointer lhs, gconstpointer rhs,
                         gpointer userdata);

/* Write the contents of g_item_array to the file named outputfile. */
void gen_phrase_file(const char * outputfile, int phrase_index);
86
87
phrase_item_compare(gconstpointer a,gconstpointer b)88 gint phrase_item_compare(gconstpointer a, gconstpointer b){
89 phrase_item * itema = (phrase_item *) a;
90 phrase_item * itemb = (phrase_item *) b;
91 if ( itema->length != itemb->length )
92 return itema->length - itemb->length;
93 else
94 return memcmp(itema->uniphrase, itemb->uniphrase,
95 sizeof(gunichar) * itema->length);
96 }
97
98
main(int argc,char * argv[])99 int main(int argc, char * argv[]){
100 int i;
101
102 g_chewing_tree = g_tree_new(phrase_item_compare);
103
104 GError * error = NULL;
105 GOptionContext * context;
106
107 context = g_option_context_new("- generate pinyin table");
108 g_option_context_add_main_entries(context, entries, NULL);
109 if (!g_option_context_parse(context, &argc, &argv, &error)) {
110 g_print("option parsing failed:%s\n", error->message);
111 exit(EINVAL);
112 }
113
114 for (i = 1; i < argc; ++i) {
115 feed_file(argv[i]);
116 }
117
118 printf("nnodes: %d\n", g_tree_nnodes(g_chewing_tree));
119
120 /* store in item array */
121 g_item_array[0] = NULL;
122 for (i = 1; i < MAX_PHRASE_LENGTH + 1; ++i){
123 g_item_array[i] = g_array_new
124 (FALSE, TRUE, sizeof(phrase_and_array_item));
125 }
126 g_tree_foreach(g_chewing_tree, store_one_item, NULL);
127
128 /* sort item array */
129 for ( int i = 1; i < MAX_PHRASE_LENGTH + 1; ++i){
130 g_array_sort_with_data(g_item_array[i], phrase_array_compare , &i);
131 }
132
133 gen_phrase_file(outputfile, phrase_index);
134
135 return 0;
136 }
137
feed_file(const char * filename)138 void feed_file ( const char * filename){
139 FILE * infile = fopen(filename, "r");
140 if ( NULL == infile ){
141 fprintf(stderr, "Can't open file %s.\n", filename);
142 exit(ENOENT);
143 }
144
145 char * linebuf = NULL; size_t size = 0; ssize_t read;
146 while( (read = getline(&linebuf, &size, infile)) != -1 ){
147 if ( '\n' == linebuf[strlen(linebuf) - 1] ) {
148 linebuf[strlen(linebuf) - 1] = '\0';
149 }
150
151 /* assume tsi.src only use the single space to separate tokens. */
152 gchar ** strs = g_strsplit_set(linebuf, " ", 3);
153
154 const char * phrase = strs[0];
155 guint32 freq = atoi(strs[1]);
156 const char * pinyin = strs[2];
157
158 if (3 != g_strv_length(strs)) {
159 fprintf(stderr, "wrong line format:%s\n", linebuf);
160 continue;
161 }
162
163 if (feof(infile))
164 break;
165
166 feed_line(phrase, pinyin, freq);
167 }
168
169 free(linebuf);
170 fclose(infile);
171 }
172
feed_line(const char * phrase,const char * pinyin,const guint32 freq)173 void feed_line(const char * phrase, const char * pinyin, const guint32 freq) {
174 phrase_item * item = new phrase_item;
175 item->length = g_utf8_strlen(phrase, -1);
176
177 /* FIXME: modify ">" to ">=" according to pinyin_large_table.cpp
178 * where is the code which I don't want to touch. :-)
179 */
180
181 if (item->length >= MAX_PHRASE_LENGTH) {
182 fprintf(stderr, "Too long phrase:%s\t%s\t%d\n", phrase, pinyin, freq);
183 delete item;
184 return;
185 }
186
187 item->uniphrase = g_utf8_to_ucs4(phrase, -1, NULL, NULL, NULL);
188
189 ZhuyinDirectParser2 parser;
190 ChewingKeyVector keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey));
191 ChewingKeyRestVector key_rests = g_array_new
192 (FALSE, FALSE, sizeof(ChewingKeyRest));
193
194 pinyin_option_t options = USE_TONE | FORCE_TONE;
195 parser.parse(options, keys, key_rests, pinyin, strlen(pinyin));
196 assert(keys->len == key_rests->len);
197
198 if (keys->len != item->length) {
199 fprintf(stderr, "Invalid pinyin:%s\t%s\t%d\n", phrase, pinyin, freq);
200 delete item;
201 return;
202 }
203
204 GArray * array = (GArray *)g_tree_lookup(g_chewing_tree, item);
205
206 chewing_and_freq_item value_item;
207 value_item.keys = keys; value_item.key_rests = key_rests;
208 value_item.freq = freq;
209
210 assert(item->length == value_item.keys->len);
211 if (NULL == array) {
212 array = g_array_new(FALSE, FALSE, sizeof(chewing_and_freq_item));
213 g_array_append_val(array, value_item);
214 g_tree_insert(g_chewing_tree, item, array);
215 return;
216 }
217
218 bool found = false;
219 for (size_t i = 0; i < array->len; ++i) {
220 chewing_and_freq_item * cur_item =
221 &g_array_index(array, chewing_and_freq_item, i);
222 int result = pinyin_exact_compare2
223 ((ChewingKey *) value_item.keys->data,
224 (ChewingKey *) cur_item->keys->data,
225 value_item.keys->len);
226
227 if (0 == result) {
228 fprintf(stderr, "Duplicate item: phrase:%s\tpinyin:%s\tfreq:%u\n",
229 phrase, pinyin, freq);
230 cur_item->freq += freq;
231 found = true;
232 }
233 }
234
235 if (!found) {
236 g_array_append_val(array, value_item);
237 g_tree_insert(g_chewing_tree, item, array);
238 } else {
239 /* clean up */
240 g_array_free(keys, TRUE);
241 g_array_free(key_rests, TRUE);
242 }
243
244 delete item;
245 }
246
247
store_one_item(gpointer key,gpointer value,gpointer data)248 gboolean store_one_item(gpointer key, gpointer value, gpointer data) {
249 phrase_and_array_item item;
250 item.phrase = *((phrase_item *) key);
251 item.chewing_and_freq_array = (GArray *) value;
252 int len = item.phrase.length;
253 g_array_append_val(g_item_array[len], item);
254 return FALSE;
255 }
256
257
phrase_array_compare(gconstpointer lhs,gconstpointer rhs,gpointer userdata)258 int phrase_array_compare(gconstpointer lhs, gconstpointer rhs,
259 gpointer userdata) {
260 int phrase_length = *((int *) userdata);
261 phrase_and_array_item * item_lhs = (phrase_and_array_item *) lhs;
262 phrase_and_array_item * item_rhs = (phrase_and_array_item *) rhs;
263
264 ChewingKeyVector keys_lhs = g_array_index
265 (item_lhs->chewing_and_freq_array, chewing_and_freq_item, 0).keys;
266 ChewingKeyVector keys_rhs = g_array_index
267 (item_rhs->chewing_and_freq_array, chewing_and_freq_item, 0).keys;
268 return pinyin_exact_compare2((ChewingKey *)keys_lhs->data,
269 (ChewingKey *)keys_rhs->data, phrase_length);
270 }
271
272
gen_phrase_file(const char * outputfile,int phrase_index)273 void gen_phrase_file(const char * outputfile, int phrase_index){
274 FILE * outfile = fopen(outputfile, "w");
275 if (NULL == outfile ) {
276 fprintf(stderr, "Can't write file %s.\n", outputfile);
277 exit(ENOENT);
278 }
279
280 phrase_token_t token = 1;
281
282 /* phrase length index */
283 for (size_t i = 1; i < MAX_PHRASE_LENGTH + 1; ++i) {
284 GArray * item_array = g_item_array[i];
285
286 /* item array index */
287 for (size_t m = 0; m < item_array->len; ++m) {
288 phrase_and_array_item * item = &g_array_index
289 (item_array, phrase_and_array_item, m);
290 phrase_item phrase = item->phrase;
291 GArray * chewing_and_freqs = item->chewing_and_freq_array;
292
293 gchar * phrase_str = g_ucs4_to_utf8
294 (phrase.uniphrase, phrase.length, NULL, NULL, NULL);
295
296 /* iterate each pinyin */
297 for (size_t n = 0; n < chewing_and_freqs->len; ++n) {
298 chewing_and_freq_item * chewing_and_freq =
299 &g_array_index
300 (chewing_and_freqs, chewing_and_freq_item, n);
301
302 ChewingKeyVector keys = chewing_and_freq->keys;
303 ChewingKeyRestVector key_rests = chewing_and_freq->key_rests;
304
305 GArray * pinyins = g_array_new(TRUE, FALSE, sizeof(gchar *));
306 gchar * pinyin = NULL;
307
308 size_t k;
309 for (k = 0; k < keys->len; ++k) {
310 ChewingKey key = g_array_index(keys, ChewingKey, k);
311 ChewingKeyRest key_rest = g_array_index
312 (key_rests, ChewingKeyRest, k);
313
314 assert (CHEWING_ZERO_TONE != key.m_tone);
315 pinyin = key.get_zhuyin_string();
316 g_array_append_val(pinyins, pinyin);
317 }
318 gchar * pinyin_str = g_strjoinv("'", (gchar **)pinyins->data);
319
320 for (k = 0; k < pinyins->len; ++k) {
321 g_free(g_array_index(pinyins, gchar *, k));
322 }
323 g_array_free(pinyins, TRUE);
324
325 guint32 freq = chewing_and_freq->freq;
326
327 /* avoid zero freq */
328 if (freq < 3) freq = 3;
329
330 fprintf(outfile, "%s\t%s\t%d\t%d\n",
331 pinyin_str, phrase_str,
332 PHRASE_INDEX_MAKE_TOKEN(phrase_index, token), freq);
333
334 g_free(pinyin_str);
335 }
336 g_free(phrase_str);
337 token++;
338 }
339 }
340
341 fclose(outfile);
342 }
343