1 /*
2  *  libpinyin
3  *  Library to deal with pinyin.
4  *
5  *  Copyright (C) 2006-2007, 2011 Peng Wu
6  *
7  *  This program is free software: you can redistribute it and/or modify
8  *  it under the terms of the GNU General Public License as published by
9  *  the Free Software Foundation, either version 3 of the License, or
10  *  (at your option) any later version.
11  *
12  *  This program is distributed in the hope that it will be useful,
13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  *  GNU General Public License for more details.
16  *
17  *  You should have received a copy of the GNU General Public License
18  *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
19  */
20 
21 #ifdef HAVE_CONFIG_H
22 #include "config.h"
23 #endif
24 
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include <locale.h>
29 #include <glib.h>
30 #include "pinyin_internal.h"
31 #include "utils_helper.h"
32 
33 static gboolean train_pi_gram = TRUE;
34 static const gchar * bigram_filename = DELETED_BIGRAM;
35 
36 static GOptionEntry entries[] =
37 {
38     {"skip-pi-gram-training", 0, G_OPTION_FLAG_REVERSE, G_OPTION_ARG_NONE, &train_pi_gram, "skip pi-gram training", NULL},
39     {"deleted-bigram-file", 0, 0, G_OPTION_ARG_FILENAME, &bigram_filename, "deleted bi-gram file", NULL},
40     {NULL}
41 };
42 
43 
main(int argc,char * argv[])44 int main(int argc, char * argv[]){
45     setlocale(LC_ALL, "");
46 
47     GError * error = NULL;
48     GOptionContext * context;
49 
50     context = g_option_context_new("- generate deleted n-gram");
51     g_option_context_add_main_entries(context, entries, NULL);
52     if (!g_option_context_parse(context, &argc, &argv, &error)) {
53         g_print("option parsing failed:%s\n", error->message);
54         exit(EINVAL);
55     }
56 
57     SystemTableInfo2 system_table_info;
58 
59     bool retval = system_table_info.load(SYSTEM_TABLE_INFO);
60     if (!retval) {
61         fprintf(stderr, "load table.conf failed.\n");
62         exit(ENOENT);
63     }
64 
65     FacadePhraseIndex phrase_index;
66 
67     const pinyin_table_info_t * phrase_files =
68         system_table_info.get_default_tables();
69 
70     if (!load_phrase_index(phrase_files, &phrase_index))
71         exit(ENODATA);
72 
73     Bigram bigram;
74     bigram.attach(bigram_filename, ATTACH_CREATE|ATTACH_READWRITE);
75 
76     char* linebuf = NULL; size_t size = 0;
77     phrase_token_t last_token, cur_token = last_token = 0;
78     while( getline(&linebuf, &size, stdin) ){
79 	if ( feof(stdin) )
80 	    break;
81 
82         if ( '\n' == linebuf[strlen(linebuf) - 1] ) {
83             linebuf[strlen(linebuf) - 1] = '\0';
84         }
85 
86         TAGLIB_PARSE_SEGMENTED_LINE(&phrase_index, token, linebuf);
87 
88 	last_token = cur_token;
89 	cur_token = token;
90 
91         /* skip null_token in second word. */
92         if ( null_token == cur_token )
93             continue;
94 
95         /* skip pi-gram training. */
96         if ( null_token == last_token ){
97             if ( !train_pi_gram )
98                 continue;
99             last_token = sentence_start;
100         }
101 
102         /* train bi-gram */
103         SingleGram * single_gram = NULL;
104         bigram.load(last_token, single_gram);
105 
106         if ( NULL == single_gram ){
107             single_gram = new SingleGram;
108         }
109         guint32 freq, total_freq;
110         //increase freq
111         if (single_gram->get_freq(cur_token, freq))
112             assert(single_gram->set_freq(cur_token, freq + 1));
113         else
114             assert(single_gram->insert_freq(cur_token, 1));
115         //increase total freq
116         single_gram->get_total_freq(total_freq);
117         single_gram->set_total_freq(total_freq + 1);
118 
119         bigram.store(last_token, single_gram);
120         delete single_gram;
121     }
122 
123     free(linebuf);
124     return 0;
125 }
126