1 /*
2 * Smart Common Input Method
3 *
4 * Copyright (c) 2002-2005 James Su <suzhe@tsinghua.org.cn>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 *
20 * $Id: scim_make_table.cpp,v 1.2 2005/10/26 07:53:53 suzhe Exp $
21 *
22 */
23
24 #define Uses_STL_FUNCTIONAL
25 #define Uses_STL_VECTOR
26 #define Uses_STL_IOSTREAM
27 #define Uses_STL_FSTREAM
28 #define Uses_STL_ALGORITHM
29 #define Uses_STL_MAP
30 #define Uses_STL_UTILITY
31 #define Uses_STL_IOMANIP
32 #define Uses_C_STDIO
33 #define Uses_C_WCTYPE
34 #define Uses_C_LOCALE
35 #define Uses_SCIM_UTILITY
36
37 #include <scim.h>
38 #include "scim_generic_table.h"
39 #include "scim_table_private.h"
40
41 using namespace scim;
42
self_learn(GenericTableLibrary & lib,const char * file)43 void self_learn (GenericTableLibrary &lib, const char *file)
44 {
45 std::vector <ucs4_t> buffer;
46 std::ifstream ifs(file);
47
48 if (!ifs) return;
49
50 uint32 byte = 0;
51 uint32 kb = 0;
52
53 WideString str;
54
55 std::vector <uint32> phrases;
56 std::vector <uint32>::const_iterator pit;
57
58 uint32 maxlen = lib.get_max_phrase_length ();
59
60 ucs4_t wc;
61 bool skip;
62 char wheel [] = {'-', '\\', '|', '/', 0};
63 int wheel_state;
64
65 buffer.reserve (1048576*32);
66
67 skip = false;
68
69 wheel_state = 0;
70
71 while (!ifs.eof()) {
72 buffer.clear ();
73 // Read a line
74 while (!ifs.eof ()) {
75 if ((wc = utf8_read_wchar (ifs)) == 0) break;
76 if (wc == L'\n') break;
77 else if (iswpunct (wc) || iswspace (wc) || iswdigit (wc) ) {
78 if (!skip) {
79 buffer.push_back (0);
80 skip = true;
81 }
82 } else {
83 buffer.push_back (wc);
84 skip = false;
85 }
86 }
87
88 buffer.push_back (0);
89 for (int i=0; i<buffer.size (); i++) {
90 str = WideString ();
91 for (int j=0; j<maxlen; j++) {
92 if (buffer [j+i] == 0)
93 break;
94 str.push_back (buffer [j+i]);
95
96 phrases.clear ();
97 if (lib.find_phrase (phrases, str)) {
98 for (pit = phrases.begin (); pit != phrases.end (); ++ pit)
99 lib.set_phrase_frequency (*pit, lib.get_phrase_frequency (*pit) + 1);
100 }
101 }
102 byte ++;
103 if (byte == 1024) {
104 byte = 0;
105 kb ++;
106 std::cout << "\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b";
107 std::cout << kb << "\tK ("
108 << wheel [wheel_state/2] << ") " << std::flush;
109 wheel_state = (wheel_state+1) % 8;
110 }
111 }
112 }
113
114 std::cout << std::endl;
115 }
116
main(int argc,char * argv[])117 int main (int argc, char * argv [])
118 {
119 bool binary = false;
120
121 bool no_save = false;
122
123 char *output = NULL;
124
125 char *corpus = NULL;
126
127 char *output_freq = NULL;
128
129 char *input_freq = NULL;
130
131 GenericTableLibrary phrase_lib;
132
133 bindtextdomain (GETTEXT_PACKAGE, SCIM_TABLE_LOCALEDIR);
134 bind_textdomain_codeset (GETTEXT_PACKAGE,
135 scim_get_locale_encoding (scim_get_current_locale ()).c_str ());
136
137 if (argc < 2) {
138 std::cerr << _("Too few arguments!\n"
139 "Usage:\n"
140 " scim-make-table <table_file> [options]\n\n"
141 " table_file\tthe table file for table module\n"
142 " -b\t\tconvert to binary format, otherwise to text format\n"
143 " -o output\tsave new table to file output\n"
144 " -no\t\tdo not save new phrase table\n"
145 " -if ifreq\tload phrase frequencies from this file\n"
146 " -of ofreq\tsave phrase frequencies to this file\n"
147 " -s file\tspecifiy the source file to count phrase ages.\n");
148 return -1;
149 }
150
151 int i = 1;
152 while (i<argc) {
153 if (++i >= argc) break;
154
155 if (String ("-b") == argv [i]) {
156 binary = true;
157 continue;
158 }
159
160 if (String ("-no") == argv [i]) {
161 if (output != NULL) {
162 std::cerr << _("option -no cannot be used with -o\n");
163 return -1;
164 }
165 no_save = true;
166 continue;
167 }
168
169 if (String ("-o") == argv [i]) {
170 if (no_save) {
171 std::cerr << _("option -o cannot be used with -no\n");
172 return -1;
173 }
174 if (++i >= argc) {
175 std::cerr << _("No argument for option ") << argv [i-1] << std::endl;
176 return -1;
177 }
178 output = argv [i];
179 continue;
180 }
181
182 if (String ("-if") == argv [i]) {
183 if (++i >= argc) {
184 std::cerr << _("No argument for option ") << argv [i-1] << std::endl;
185 return -1;
186 }
187 input_freq = argv [i];
188 continue;
189 }
190
191 if (String ("-of") == argv [i]) {
192 if (++i >= argc) {
193 std::cerr << _("No argument for option ") << argv [i-1] << std::endl;
194 return -1;
195 }
196 output_freq = argv [i];
197 continue;
198 }
199
200 if (String ("-s") == argv [i]) {
201 if (++i >= argc) {
202 std::cerr << "No argument for option " << argv [i-1] << std::endl;
203 return -1;
204 }
205 corpus = argv [i];
206 continue;
207 }
208
209 std::cerr << _("Invalid option: ") << argv [i] << std::endl;
210 return -1;
211 };
212
213 if (output == NULL) output = argv [1];
214
215 std::cout << _("Loading table file ") << argv [1] << _(" ...\n");
216
217 if (!phrase_lib.init (argv [1], "", (input_freq?input_freq:""))) {
218 std::cerr << _("table file load failed!") << std::endl;
219 return -1;
220 }
221
222 if (corpus != NULL) {
223 std::cout << "Counting phrase frequency...\n";
224 self_learn (phrase_lib, corpus);
225 }
226
227 if (output_freq != NULL) {
228 std::cout << _("Saving frequency table file ") << output_freq << _(" ...\n");
229 if (!phrase_lib.save ("", "", output_freq, binary))
230 std::cerr << _("frequency table file load failed!") << std::endl;
231 }
232
233 if (!no_save && output) {
234 std::cout << _("Saving table file ") << output << _(" ...\n");
235
236 if (!phrase_lib.save (output, "", "", binary)) {
237 std::cerr << _("Table file save failed!") << std::endl;
238 }
239 }
240
241 return 0;
242 }
243 /*
244 vi:ts=4:nowrap:ai:expandtab
245 */
246