1 /*
2  *  libpinyin
3  *  Library to deal with pinyin.
4  *
5  *  Copyright (C) 2017 Peng Wu <alexepico@gmail.com>
6  *
7  *  This program is free software: you can redistribute it and/or modify
8  *  it under the terms of the GNU General Public License as published by
9  *  the Free Software Foundation, either version 3 of the License, or
10  *  (at your option) any later version.
11  *
12  *  This program is distributed in the hope that it will be useful,
13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  *  GNU General Public License for more details.
16  *
17  *  You should have received a copy of the GNU General Public License
18  *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
19  */
20 
21 
22 #ifdef HAVE_CONFIG_H
23 #include "config.h"
24 #endif
25 
26 #include <stdio.h>
27 #include <glib.h>
28 #include "pinyin_internal.h"
29 
30 
/*
 * print_help:
 *
 * Write the command line usage summary to standard output.
 */
void print_help(){
    static const char usage[] =
        "Usage: gen_pinyin_table -t <PHRASE_INDEX> \n"
        "-o <OUTPUTFILE> <FILE1> <FILE2> .. <FILEn>\n"
        "<OUTPUTFILE> the result output file\n"
        "<FILEi> input pinyin files\n"
        "<PHRASE_INDEX> phrase index identifier\n";

    /* the text holds no conversion specifiers, so it is emitted verbatim */
    fputs(usage, stdout);
}
38 
39 
40 static gint phrase_index = 0;
41 static const gchar * outputfile = "temp.out";
42 
43 static GOptionEntry entries[] =
44 {
45     {"phraseindex", 't', 0, G_OPTION_ARG_INT, &phrase_index, "phrase index", NULL},
46     {"outputfile", 'o', 0, G_OPTION_ARG_FILENAME, &outputfile, "output filename", NULL},
47     {NULL}
48 };
49 
50 
51 using namespace pinyin;
52 
53 /* map from phrase_item to GArray of chewing_and_freq_item */
54 GTree  * g_chewing_tree;
55 /* Array of GArray of phrase_and_array_item */
56 GArray * g_item_array[MAX_PHRASE_LENGTH + 1];
57 
58 struct phrase_item{
59     size_t length;
60     gunichar * uniphrase;
61 };
62 
63 struct chewing_and_freq_item{
64     ChewingKeyVector keys;
65     ChewingKeyRestVector key_rests;
66     guint32 freq;
67 };
68 
69 struct phrase_and_array_item{
70     phrase_item phrase;                    /* the key of g_chewing_tree */
71     /* Array of chewing_and_freq_item */
72     GArray * chewing_and_freq_array;       /* the value of g_chewing_tree */
73 };
74 
75 
76 void feed_file(const char * filename);
77 
78 void feed_line(const char * phrase, const char * pinyin, const guint32 freq);
79 
80 gboolean store_one_item(gpointer key, gpointer value, gpointer data);
81 
82 int phrase_array_compare(gconstpointer lhs, gconstpointer rhs,
83                          gpointer userdata);
84 
85 void gen_phrase_file(const char * outputfile, int phrase_index);
86 
87 
phrase_item_compare(gconstpointer a,gconstpointer b)88 gint phrase_item_compare(gconstpointer a, gconstpointer b){
89     phrase_item * itema = (phrase_item *) a;
90     phrase_item * itemb = (phrase_item *) b;
91     if ( itema->length != itemb->length )
92 	return itema->length - itemb->length;
93     else
94 	return memcmp(itema->uniphrase, itemb->uniphrase,
95 		      sizeof(gunichar) * itema->length);
96 }
97 
98 
main(int argc,char * argv[])99 int main(int argc, char * argv[]){
100     int i;
101 
102     g_chewing_tree = g_tree_new(phrase_item_compare);
103 
104     GError * error = NULL;
105     GOptionContext * context;
106 
107     context = g_option_context_new("- generate pinyin table");
108     g_option_context_add_main_entries(context, entries, NULL);
109     if (!g_option_context_parse(context, &argc, &argv, &error)) {
110         g_print("option parsing failed:%s\n", error->message);
111         exit(EINVAL);
112     }
113 
114     for (i = 1; i < argc; ++i) {
115         feed_file(argv[i]);
116     }
117 
118     printf("nnodes: %d\n", g_tree_nnodes(g_chewing_tree));
119 
120     /* store in item array */
121     g_item_array[0] = NULL;
122     for (i = 1; i < MAX_PHRASE_LENGTH + 1; ++i){
123 	g_item_array[i] = g_array_new
124             (FALSE, TRUE, sizeof(phrase_and_array_item));
125     }
126     g_tree_foreach(g_chewing_tree, store_one_item, NULL);
127 
128     /* sort item array */
129     for ( int i = 1; i < MAX_PHRASE_LENGTH + 1; ++i){
130 	g_array_sort_with_data(g_item_array[i], phrase_array_compare , &i);
131     }
132 
133     gen_phrase_file(outputfile, phrase_index);
134 
135     return 0;
136 }
137 
feed_file(const char * filename)138 void feed_file ( const char * filename){
139     FILE * infile = fopen(filename, "r");
140     if ( NULL == infile ){
141         fprintf(stderr, "Can't open file %s.\n", filename);
142         exit(ENOENT);
143     }
144 
145     char * linebuf = NULL; size_t size = 0; ssize_t read;
146     while( (read = getline(&linebuf, &size, infile)) != -1 ){
147         if ( '\n' ==  linebuf[strlen(linebuf) - 1] ) {
148             linebuf[strlen(linebuf) - 1] = '\0';
149         }
150 
151         /* assume tsi.src only use the single space to separate tokens. */
152         gchar ** strs = g_strsplit_set(linebuf, " ", 3);
153 
154         const char * phrase = strs[0];
155         guint32 freq = atoi(strs[1]);
156         const char * pinyin = strs[2];
157 
158         if (3 != g_strv_length(strs)) {
159             fprintf(stderr, "wrong line format:%s\n", linebuf);
160             continue;
161         }
162 
163 	if (feof(infile))
164             break;
165 
166 	feed_line(phrase, pinyin, freq);
167     }
168 
169     free(linebuf);
170     fclose(infile);
171 }
172 
feed_line(const char * phrase,const char * pinyin,const guint32 freq)173 void feed_line(const char * phrase, const char * pinyin, const guint32 freq) {
174     phrase_item * item = new phrase_item;
175     item->length = g_utf8_strlen(phrase, -1);
176 
177     /* FIXME: modify ">" to ">=" according to pinyin_large_table.cpp
178      *	where is the code which I don't want to touch. :-)
179      */
180 
181     if (item->length >= MAX_PHRASE_LENGTH) {
182         fprintf(stderr, "Too long phrase:%s\t%s\t%d\n", phrase, pinyin, freq);
183         delete item;
184         return;
185     }
186 
187     item->uniphrase = g_utf8_to_ucs4(phrase, -1, NULL, NULL, NULL);
188 
189     ZhuyinDirectParser2 parser;
190     ChewingKeyVector keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey));
191     ChewingKeyRestVector key_rests = g_array_new
192         (FALSE, FALSE, sizeof(ChewingKeyRest));
193 
194     pinyin_option_t options = USE_TONE | FORCE_TONE;
195     parser.parse(options, keys, key_rests, pinyin, strlen(pinyin));
196     assert(keys->len == key_rests->len);
197 
198     if (keys->len != item->length) {
199         fprintf(stderr, "Invalid pinyin:%s\t%s\t%d\n", phrase, pinyin, freq);
200         delete item;
201         return;
202     }
203 
204     GArray * array = (GArray *)g_tree_lookup(g_chewing_tree, item);
205 
206     chewing_and_freq_item value_item;
207     value_item.keys = keys; value_item.key_rests = key_rests;
208     value_item.freq = freq;
209 
210     assert(item->length == value_item.keys->len);
211     if (NULL == array) {
212         array = g_array_new(FALSE, FALSE, sizeof(chewing_and_freq_item));
213         g_array_append_val(array, value_item);
214         g_tree_insert(g_chewing_tree, item, array);
215         return;
216     }
217 
218     bool found = false;
219     for (size_t i = 0; i < array->len; ++i) {
220         chewing_and_freq_item * cur_item =
221             &g_array_index(array, chewing_and_freq_item, i);
222         int result = pinyin_exact_compare2
223             ((ChewingKey *) value_item.keys->data,
224              (ChewingKey *) cur_item->keys->data,
225              value_item.keys->len);
226 
227         if (0 == result) {
228             fprintf(stderr, "Duplicate item: phrase:%s\tpinyin:%s\tfreq:%u\n",
229                     phrase, pinyin, freq);
230             cur_item->freq += freq;
231             found = true;
232         }
233     }
234 
235     if (!found) {
236         g_array_append_val(array, value_item);
237         g_tree_insert(g_chewing_tree, item, array);
238     } else {
239         /* clean up */
240         g_array_free(keys, TRUE);
241         g_array_free(key_rests, TRUE);
242     }
243 
244     delete item;
245 }
246 
247 
store_one_item(gpointer key,gpointer value,gpointer data)248 gboolean store_one_item(gpointer key, gpointer value, gpointer data) {
249     phrase_and_array_item item;
250     item.phrase = *((phrase_item *) key);
251     item.chewing_and_freq_array = (GArray *) value;
252     int len = item.phrase.length;
253     g_array_append_val(g_item_array[len], item);
254     return FALSE;
255 }
256 
257 
phrase_array_compare(gconstpointer lhs,gconstpointer rhs,gpointer userdata)258 int phrase_array_compare(gconstpointer lhs, gconstpointer rhs,
259                          gpointer userdata) {
260     int phrase_length = *((int *) userdata);
261     phrase_and_array_item * item_lhs = (phrase_and_array_item *) lhs;
262     phrase_and_array_item * item_rhs = (phrase_and_array_item *) rhs;
263 
264     ChewingKeyVector keys_lhs = g_array_index
265         (item_lhs->chewing_and_freq_array, chewing_and_freq_item, 0).keys;
266     ChewingKeyVector keys_rhs = g_array_index
267         (item_rhs->chewing_and_freq_array, chewing_and_freq_item, 0).keys;
268     return pinyin_exact_compare2((ChewingKey *)keys_lhs->data,
269                                  (ChewingKey *)keys_rhs->data, phrase_length);
270 }
271 
272 
gen_phrase_file(const char * outputfile,int phrase_index)273 void gen_phrase_file(const char * outputfile, int phrase_index){
274     FILE * outfile = fopen(outputfile, "w");
275     if (NULL == outfile ) {
276         fprintf(stderr, "Can't write file %s.\n", outputfile);
277         exit(ENOENT);
278     }
279 
280     phrase_token_t token = 1;
281 
282     /* phrase length index */
283     for (size_t i = 1; i < MAX_PHRASE_LENGTH + 1; ++i) {
284         GArray * item_array = g_item_array[i];
285 
286         /* item array index */
287         for (size_t m = 0; m < item_array->len; ++m) {
288             phrase_and_array_item * item = &g_array_index
289                 (item_array, phrase_and_array_item, m);
290             phrase_item phrase = item->phrase;
291             GArray * chewing_and_freqs = item->chewing_and_freq_array;
292 
293             gchar * phrase_str = g_ucs4_to_utf8
294                 (phrase.uniphrase, phrase.length, NULL, NULL, NULL);
295 
296             /* iterate each pinyin */
297             for (size_t n = 0; n < chewing_and_freqs->len; ++n) {
298                 chewing_and_freq_item * chewing_and_freq =
299                     &g_array_index
300                     (chewing_and_freqs, chewing_and_freq_item, n);
301 
302                 ChewingKeyVector keys = chewing_and_freq->keys;
303                 ChewingKeyRestVector key_rests = chewing_and_freq->key_rests;
304 
305                 GArray * pinyins = g_array_new(TRUE, FALSE, sizeof(gchar *));
306                 gchar * pinyin = NULL;
307 
308                 size_t k;
309                 for (k = 0; k < keys->len; ++k) {
310                     ChewingKey key = g_array_index(keys, ChewingKey, k);
311                     ChewingKeyRest key_rest = g_array_index
312                         (key_rests, ChewingKeyRest, k);
313 
314                     assert (CHEWING_ZERO_TONE != key.m_tone);
315                     pinyin = key.get_zhuyin_string();
316                     g_array_append_val(pinyins, pinyin);
317                 }
318                 gchar * pinyin_str = g_strjoinv("'", (gchar **)pinyins->data);
319 
320                 for (k = 0; k < pinyins->len; ++k) {
321                     g_free(g_array_index(pinyins, gchar *, k));
322                 }
323                 g_array_free(pinyins, TRUE);
324 
325                 guint32 freq = chewing_and_freq->freq;
326 
327                 /* avoid zero freq */
328                 if (freq < 3) freq = 3;
329 
330 		fprintf(outfile, "%s\t%s\t%d\t%d\n",
331                         pinyin_str, phrase_str,
332                         PHRASE_INDEX_MAKE_TOKEN(phrase_index, token), freq);
333 
334                 g_free(pinyin_str);
335             }
336             g_free(phrase_str);
337             token++;
338         }
339     }
340 
341     fclose(outfile);
342 }
343