1 /*
2 * guess.c
3 *
4 * All rights reserved. Copyright (C) 1996 by NARITA Tomio.
5 * $Id: guess.c,v 1.8 2003/11/13 03:08:19 nrt Exp $
6 */
7 /*
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 */
22
23 #include <stdio.h>
24
25 #include <import.h>
26 #include <decode.h>
27 #include <big5.h>
28 #include <utf.h>
29 #ifndef MSDOS /* IF NOT DEFINED */
30 #include <unimap.h>
31 #include <unirev.h>
32 #endif /* MSDOS */
33 #ifdef HAVE_SETLOCALE
34 #include <locale.h>
35 #include <guesslocale.h>
36 #endif
37 #include <begin.h>
38 #include <guess.h>
39
40 #ifndef MSDOS
/*
 * isUTF8
 *
 * Scan the first `length' bytes of `str' and report whether they can be
 * read as UTF-8.
 *
 * Lead bytes announcing one to five continuation bytes are accepted.
 * Every continuation byte must look like 10xxxxxx, and a completed
 * sequence is rejected when its value is smaller than the minimum for
 * its length (overlong form).  A sequence that is cut off by the end of
 * the buffer is not rejected.
 *
 * Returns 1 when nothing contradicts UTF-8 (including a buffer without
 * any byte that has the high bit set), 0 otherwise.
 */
static int isUTF8( byte *str, int length )
{
  /* smallest value allowed for a sequence with N continuation bytes */
  static const unsigned long minimum[ 6 ] = {
    0x0000000UL, 0x0000080UL, 0x0000800UL,
    0x0010000UL, 0x0200000UL, 0x4000000UL
  };
  unsigned long value = 0;
  int pending = 0;      /* continuation bytes still expected */
  int trailCount = 0;   /* continuation bytes of the current sequence */
  int pos;
  byte octet;

  for( pos = 0 ; pos < length ; pos++ ){
    octet = str[ pos ];

    if( 0 != pending ){
      /* inside a multi-byte sequence */
      if( 0x80 != ( octet & 0xc0 ) )
        return 0;
      value = ( value << 6 ) + ( octet & 0x3f );
      pending--;
      if( 0 == pending && value < minimum[ trailCount ] )
        return 0;
      continue;
    }

    if( 0 == ( octet & 0x80 ) )
      continue;         /* plain 7bit byte */

    /* classify the lead byte */
    if( 0xc0 == ( octet & 0xe0 ) ){
      trailCount = 1; value = octet & 0x1f;
    } else if( 0xe0 == ( octet & 0xf0 ) ){
      trailCount = 2; value = octet & 0x0f;
    } else if( 0xf0 == ( octet & 0xf8 ) ){
      trailCount = 3; value = octet & 0x07;
    } else if( 0xf8 == ( octet & 0xfc ) ){
      trailCount = 4; value = octet & 0x03;
    } else if( 0xfc == ( octet & 0xfe ) ){
      trailCount = 5; value = octet & 0x01;
    } else {
      return 0;         /* stray continuation byte, 0xfe or 0xff */
    }
    pending = trailCount;
  }

  return 1;
}
72 #endif /* MSDOS */
73
GuessCodingSystem_EastAsia(byte * str,int length,byte defaultEuc,char * language)74 private byte GuessCodingSystem_EastAsia( byte *str, int length,
75 byte defaultEuc, char *language )
76 {
77 int i;
78 byte ch;
79 int big5Penalty = 0;
80 int sjisPenalty = 0;
81 int eucjapanPenalty = 0;
82 int eucPenalty = 0;
83
84 for( i = 0 ; i < length ; i++ ){
85 /*
86 * check for simple euc
87 */
88 ch = str[ i ];
89 if( 0x80 & ch ){
90 if( SS2 == ch || SS3 == ch ){
91 eucPenalty++;
92 break;
93 }
94 if( !IsEucByte( ch ) ) {
95 eucPenalty++;
96 break;
97 }
98 if( ++i >= length )
99 break;
100 ch = str[ i ];
101 if( !IsEucByte( ch ) ) {
102 eucPenalty++;
103 break;
104 }
105 }
106 }
107 if( 0 == eucPenalty ) {
108 if (language)
109 return DetermineEUC(language, defaultEuc);
110 else if ( AUTOSELECT != defaultEuc )
111 return defaultEuc;
112 }
113
114 for( i = 0 ; i < length ; i++ ){
115 /*
116 * check for euc-japan or euc-taiwan
117 */
118 ch = str[ i ];
119 if( 0x80 & ch ){
120 if( SS2 == ch ){
121 if( ++i >= length )
122 break;
123 ch = str[ i ];
124 if( !IsKatakanaByte( ch ) ) {
125 eucjapanPenalty++;
126 break;
127 }
128 continue;
129 }
130 if( SS3 == ch ){
131 if( ++i >= length )
132 break;
133 ch = str[ i ];
134 }
135 if( !IsEucByte( ch ) ) {
136 eucjapanPenalty++;
137 break;
138 }
139 if( ++i >= length )
140 break;
141 ch = str[ i ];
142 if( !IsEucByte( ch ) ) {
143 eucjapanPenalty++;
144 break;
145 }
146 }
147 }
148 if( 0 == eucjapanPenalty ) {
149 if (language) {
150 if( !strncmp(language, "ja", 2) )
151 return EUC_JAPAN;
152 else
153 return EUC_TAIWAN;
154 } else {
155 return EUC_JAPAN; /* XXX */
156 }
157 }
158
159 for( i = 0 ; i < length ; i++ ){
160 /*
161 * check for big5
162 */
163 ch = str[ i ];
164 if( 0x80 & ch ){
165 if( !IsBig5Byte1( ch ) ) {
166 big5Penalty++;
167 break;
168 }
169 ch = str[ ++i ];
170 if( i >= length )
171 break;
172 if( !IsBig5Byte2( ch ) ) {
173 big5Penalty++;
174 break;
175 }
176 }
177 }
178 if( 0 == big5Penalty )
179 return BIG_FIVE;
180
181 for( i = 0 ; i < length ; i++ ){
182 /*
183 * check for shift-jis
184 */
185 ch = str[ i ];
186 if( 0x80 & ch ){
187 if( IsKatakanaByte( ch ) )
188 continue;
189 if( !IsShiftJisByte1( ch ) ) {
190 sjisPenalty++;
191 break;
192 }
193 ch = str[ ++i ];
194 if( i >= length )
195 break;
196 if( !IsShiftJisByte2( ch ) ) {
197 sjisPenalty++;
198 break;
199 }
200 }
201 }
202 if( 0 == sjisPenalty )
203 return SHIFT_JIS;
204
205 return ISO_8859_1;
206 }
207
GuessCodingSystem(byte * str,int length,byte defaultEuc)208 public byte GuessCodingSystem( byte *str, int length, byte defaultEuc )
209 {
210 int i;
211 byte ch;
212 #ifdef HAVE_SETLOCALE
213 byte c;
214 char *nl, language[6];
215 int use_locale = 1;
216
217 nl = setlocale( LC_CTYPE, "" );
218 if (NULL == nl || !strcmp("C", nl))
219 use_locale = 0;
220
221 if (use_locale) {
222 #ifndef MSDOS /* IF NOT DEFINED */
223 /*
224 * Since UTF-8 is a strict coding system, it is unlikely that
225 * a non-UTF-8 file is accidently recognized as a UTF-8 file.
226 * Thus, UTF-8 test is performed first regardless of locale.
227 */
228 if( 1 == isUTF8( str, length ) )
229 return UTF_8;
230 #endif /* MSDOS */
231 /*
232 * Now, the file is not UTF-8. In this case, separate algorithms
233 * are used for east Asian locales (where multibyte coding systems
234 * are expected to be used and further guessing may be possible) and
235 * other locales (where 8bit coding systems are used and further
236 * guessing is almost impossible).
237 */
238 c = LocaleCodingSystem(language);
239
240 if( !strncmp(language, "ja", 2) || !strncmp(language, "ko", 2) ||
241 !strncmp(language, "zh", 2) ) {
242 /*
243 * In case of east Asian locales. If the user has a certain
244 * preference on EUC coding system, the preference is used.
245 * If the file is EUC-compliant, coding system is determined
246 * by locale.
247 */
248 return GuessCodingSystem_EastAsia(str, length, defaultEuc, language);
249 }
250
251 /*
252 * In case of other than east Asian locales. (not multibyte)
253 */
254 if( UTF_8 != defaultEuc && AUTOSELECT != defaultEuc )
255 /*
256 * If the user has a certain preference on coding system,
257 * the will is repected. However, it is already known that
258 * the file is not UTF-8.
259 */
260 return defaultEuc;
261
262 if( UTF_8 == c ) {
263 /*
264 * When the environment is UTF-8 locale while the file is
265 * not UTF-8, coding system is guessed from language/country
266 * part of the current locale.
267 */
268 return Determine8bit( language );
269 }
270
271 /*
272 * When the environment is not UTF-8 locale, coding system is
273 * determined by the current locale.
274 */
275 return c;
276 } else
277 #endif /* HAVE_SETLOCALE */
278 {
279 #ifndef MSDOS /* IF NOT DEFINED */
280 /*
281 * Since UTF-8 is a strict coding system, it is unlikely that
282 * a non-UTF-8 file is accidently recognized as a UTF-8 file.
283 * Thus, UTF-8 test is performed first regardless of locale.
284 */
285 if( 1 == isUTF8( str, length ) )
286 return UTF_8;
287 #endif /* MSDOS */
288
289 return GuessCodingSystem_EastAsia(str, length, defaultEuc, NULL);
290 }
291 }
292
GuessHz(byte * str,int length)293 public byte GuessHz( byte *str, int length )
294 {
295 boolean_t gb = FALSE;
296 int i, hzPenalty = 0;
297 byte ch;
298
299 for( i = 0 ; i < length ; i++ ){
300 /*
301 * check for HZ
302 */
303 ch = str[ i ];
304 if( '~' == ch ){
305 if( ++i >= length )
306 break;
307 ch = str[ i ];
308 if( FALSE == gb ){
309 if( '{' == ch ){
310 gb = TRUE;
311 } else if( '~' == ch || '}' == ch ){
312 /* do nothing */
313 } else
314 hzPenalty++;
315 } else {
316 if( '}' == ch )
317 gb = FALSE;
318 }
319 }
320 }
321
322 return 0 == hzPenalty;
323 }
324
325
AdjustPatternCharset(byte inputCodingSystem,byte keyboardCodingSystem,byte defaultCodingSystem,i_str_t * istr)326 public void AdjustPatternCharset( byte inputCodingSystem,
327 byte keyboardCodingSystem,
328 byte defaultCodingSystem,
329 i_str_t *istr )
330 {
331 if( FALSE == adjust_charset )
332 return;
333
334 #ifndef MSDOS /* IF NOT DEFINED */
335 if( IsUtfEncoding( inputCodingSystem ) || IsUtfEncoding( keyboardCodingSystem ) ){
336 if( IsUtfEncoding( inputCodingSystem ) && !IsUtfEncoding( keyboardCodingSystem ) )
337 ConvertToUNI( istr );
338 else if( !IsUtfEncoding( inputCodingSystem ) && IsUtfEncoding( keyboardCodingSystem ) ){
339 if( AUTOSELECT == inputCodingSystem )
340 inputCodingSystem = defaultCodingSystem;
341 ConvertFromUNI( istr, inputCodingSystem );
342 }
343 } else
344 #endif /* MSDOS */
345 if( BIG_FIVE == inputCodingSystem && BIG_FIVE != keyboardCodingSystem ){
346 ConvertCNStoBIG5( istr );
347 } else if( BIG_FIVE != inputCodingSystem && BIG_FIVE == keyboardCodingSystem ){
348 ConvertBIG5toCNS( istr );
349 }
350 }
351