1 /*
2  * Smart Common Input Method
3  *
4  * Copyright (c) 2002-2005 James Su <suzhe@tsinghua.org.cn>
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
19  *
20  * $Id: scim_make_table.cpp,v 1.2 2005/10/26 07:53:53 suzhe Exp $
21  *
22  */
23 
24 #define Uses_STL_FUNCTIONAL
25 #define Uses_STL_VECTOR
26 #define Uses_STL_IOSTREAM
27 #define Uses_STL_FSTREAM
28 #define Uses_STL_ALGORITHM
29 #define Uses_STL_MAP
30 #define Uses_STL_UTILITY
31 #define Uses_STL_IOMANIP
32 #define Uses_C_STDIO
33 #define Uses_C_WCTYPE
34 #define Uses_C_LOCALE
35 #define Uses_SCIM_UTILITY
36 
37 #include <scim.h>
38 #include "scim_generic_table.h"
39 #include "scim_table_private.h"
40 
41 using namespace scim;
42 
self_learn(GenericTableLibrary & lib,const char * file)43 void self_learn (GenericTableLibrary &lib, const char *file)
44 {
45     std::vector <ucs4_t> buffer;
46     std::ifstream ifs(file);
47 
48     if (!ifs) return;
49 
50     uint32 byte = 0;
51     uint32 kb = 0;
52 
53     WideString str;
54 
55     std::vector <uint32> phrases;
56     std::vector <uint32>::const_iterator pit;
57 
58     uint32 maxlen = lib.get_max_phrase_length ();
59 
60     ucs4_t wc;
61     bool skip;
62     char wheel [] = {'-', '\\', '|', '/', 0};
63     int wheel_state;
64 
65     buffer.reserve (1048576*32);
66 
67     skip = false;
68 
69     wheel_state = 0;
70 
71     while (!ifs.eof()) {
72         buffer.clear ();
73         // Read a line
74         while (!ifs.eof ()) {
75             if ((wc = utf8_read_wchar (ifs)) == 0) break;
76             if (wc == L'\n') break;
77             else if (iswpunct (wc) || iswspace (wc) || iswdigit (wc) ) {
78                 if (!skip) {
79                     buffer.push_back (0);
80                     skip = true;
81                 }
82             } else {
83                 buffer.push_back (wc);
84                 skip = false;
85             }
86         }
87 
88         buffer.push_back (0);
89         for (int i=0; i<buffer.size (); i++) {
90             str = WideString ();
91             for (int j=0; j<maxlen; j++) {
92                 if (buffer [j+i] == 0)
93                     break;
94                 str.push_back (buffer [j+i]);
95 
96                 phrases.clear ();
97                 if (lib.find_phrase (phrases, str)) {
98                     for (pit = phrases.begin (); pit != phrases.end (); ++ pit)
99                         lib.set_phrase_frequency (*pit, lib.get_phrase_frequency (*pit) + 1);
100                 }
101             }
102             byte ++;
103             if (byte == 1024) {
104                 byte = 0;
105                 kb ++;
106                 std::cout << "\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b";
107                 std::cout << kb << "\tK ("
108                      << wheel [wheel_state/2] << ") " << std::flush;
109                 wheel_state = (wheel_state+1) % 8;
110             }
111         }
112     }
113 
114     std::cout << std::endl;
115 }
116 
main(int argc,char * argv[])117 int main (int argc, char * argv [])
118 {
119     bool binary = false;
120 
121     bool no_save = false;
122 
123     char *output = NULL;
124 
125     char *corpus = NULL;
126 
127     char *output_freq = NULL;
128 
129     char *input_freq = NULL;
130 
131     GenericTableLibrary phrase_lib;
132 
133     bindtextdomain (GETTEXT_PACKAGE, SCIM_TABLE_LOCALEDIR);
134     bind_textdomain_codeset (GETTEXT_PACKAGE,
135         scim_get_locale_encoding (scim_get_current_locale ()).c_str ());
136 
137     if (argc < 2) {
138         std::cerr    << _("Too few arguments!\n"
139                 "Usage:\n"
140                 "  scim-make-table <table_file> [options]\n\n"
141                 "  table_file\tthe table file for table module\n"
142                 "  -b\t\tconvert to binary format, otherwise to text format\n"
143                 "  -o output\tsave new table to file output\n"
144                 "  -no\t\tdo not save new phrase table\n"
145                 "  -if ifreq\tload phrase frequencies from this file\n"
146                 "  -of ofreq\tsave phrase frequencies to this file\n"
147                 "  -s file\tspecifiy the source file to count phrase ages.\n");
148         return -1;
149     }
150 
151     int i = 1;
152     while (i<argc) {
153         if (++i >= argc) break;
154 
155         if (String ("-b") == argv [i]) {
156             binary = true;
157             continue;
158         }
159 
160         if (String ("-no") == argv [i]) {
161             if (output != NULL) {
162                 std::cerr << _("option -no cannot be used with -o\n");
163                 return -1;
164             }
165             no_save = true;
166             continue;
167         }
168 
169         if (String ("-o") == argv [i]) {
170             if (no_save) {
171                 std::cerr << _("option -o cannot be used with -no\n");
172                 return -1;
173             }
174             if (++i >= argc) {
175                 std::cerr << _("No argument for option ") << argv [i-1] << std::endl;
176                 return -1;
177             }
178             output = argv [i];
179             continue;
180         }
181 
182         if (String ("-if") == argv [i]) {
183             if (++i >= argc) {
184                 std::cerr << _("No argument for option ") << argv [i-1] << std::endl;
185                 return -1;
186             }
187             input_freq = argv [i];
188             continue;
189         }
190 
191         if (String ("-of") == argv [i]) {
192             if (++i >= argc) {
193                 std::cerr << _("No argument for option ") << argv [i-1] << std::endl;
194                 return -1;
195             }
196             output_freq = argv [i];
197             continue;
198         }
199 
200         if (String ("-s") == argv [i]) {
201             if (++i >= argc) {
202                 std::cerr << "No argument for option " << argv [i-1] << std::endl;
203                 return -1;
204             }
205             corpus = argv [i];
206             continue;
207         }
208 
209         std::cerr << _("Invalid option: ") << argv [i] << std::endl;
210         return -1;
211     };
212 
213     if (output == NULL) output = argv [1];
214 
215     std::cout << _("Loading table file ") << argv [1] << _(" ...\n");
216 
217     if (!phrase_lib.init (argv [1], "", (input_freq?input_freq:""))) {
218         std::cerr << _("table file load failed!") << std::endl;
219         return -1;
220     }
221 
222     if (corpus != NULL) {
223         std::cout << "Counting phrase frequency...\n";
224         self_learn (phrase_lib, corpus);
225     }
226 
227     if (output_freq != NULL) {
228         std::cout << _("Saving frequency table file ") << output_freq << _(" ...\n");
229         if (!phrase_lib.save ("", "", output_freq, binary))
230             std::cerr << _("frequency table file load failed!") << std::endl;
231     }
232 
233     if (!no_save && output) {
234         std::cout << _("Saving table file ") << output << _(" ...\n");
235 
236         if (!phrase_lib.save (output, "", "", binary)) {
237             std::cerr << _("Table file save failed!") << std::endl;
238         }
239     }
240 
241     return 0;
242 }
243 /*
244 vi:ts=4:nowrap:ai:expandtab
245 */
246