1 /*
2  * guess.c
3  *
4  * All rights reserved. Copyright (C) 1996 by NARITA Tomio.
5  * $Id: guess.c,v 1.8 2003/11/13 03:08:19 nrt Exp $
6  */
7 /*
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License as published by
10  * the Free Software Foundation; either version 2 of the License, or
11  * (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program; if not, write to the Free Software
20  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
21  */
22 
23 #include <stdio.h>
24 
25 #include <import.h>
26 #include <decode.h>
27 #include <big5.h>
28 #include <utf.h>
29 #ifndef MSDOS /* IF NOT DEFINED */
30 #include <unimap.h>
31 #include <unirev.h>
32 #endif /* MSDOS */
33 #ifdef HAVE_SETLOCALE
34 #include <locale.h>
35 #include <guesslocale.h>
36 #endif
37 #include <begin.h>
38 #include <guess.h>
39 
40 #ifndef MSDOS
/*
 * Test whether str[0..length) is well-formed UTF-8.
 *
 * Returns 1 when every byte with the high bit set belongs to a multibyte
 * sequence made of a valid lead byte followed by the right number of
 * 10xxxxxx continuation bytes, and no sequence is an overlong encoding.
 * Returns 0 otherwise.
 *
 * Notes:
 *  - lead bytes announcing up to five continuation bytes (0xfc/0xfd)
 *    are accepted;
 *  - a sequence cut off by the end of the buffer is NOT an error;
 *  - an empty or pure 7-bit buffer yields 1.
 */
static int isUTF8( byte *str, int length )
{
  /*
   * floor[ n ] is the smallest value that genuinely needs n
   * continuation bytes; anything below it is an overlong form.
   */
  static const unsigned long floor[ 6 ] = {
    0x0000000UL, 0x0000080UL, 0x0000800UL,
    0x0010000UL, 0x0200000UL, 0x4000000UL
  };
  int pos = 0;

  while( pos < length ){
    byte lead = str[ pos++ ];
    unsigned long value;
    int tail, k;

    if( 0 == ( lead & 0x80 ) )
      continue;                         /* plain 7-bit byte */

    /* classify the lead byte and keep its payload bits */
    if( 0xc0 == ( lead & 0xe0 ) ){
      tail = 1; value = lead & 0x1f;
    } else if( 0xe0 == ( lead & 0xf0 ) ){
      tail = 2; value = lead & 0x0f;
    } else if( 0xf0 == ( lead & 0xf8 ) ){
      tail = 3; value = lead & 0x07;
    } else if( 0xf8 == ( lead & 0xfc ) ){
      tail = 4; value = lead & 0x03;
    } else if( 0xfc == ( lead & 0xfe ) ){
      tail = 5; value = lead & 0x01;
    } else {
      return 0;                         /* stray continuation, 0xfe or 0xff */
    }

    for( k = 0 ; k < tail ; k++ ){
      byte next;

      if( pos >= length )
        return 1;                       /* truncated at buffer end: tolerated */
      next = str[ pos++ ];
      if( 0x80 != ( next & 0xc0 ) )
        return 0;
      value = ( value << 6 ) + ( next & 0x3f );
    }

    if( value < floor[ tail ] )
      return 0;                         /* overlong encoding */
  }

  return 1;
}
72 #endif /* MSDOS */
73 
GuessCodingSystem_EastAsia(byte * str,int length,byte defaultEuc,char * language)74 private byte GuessCodingSystem_EastAsia( byte *str, int length,
75 					 byte defaultEuc, char *language )
76 {
77   int i;
78   byte ch;
79   int big5Penalty = 0;
80   int sjisPenalty = 0;
81   int eucjapanPenalty = 0;
82   int eucPenalty = 0;
83 
84   for( i = 0 ; i < length ; i++ ){
85     /*
86      * check for simple euc
87      */
88     ch = str[ i ];
89     if( 0x80 & ch ){
90       if( SS2 == ch || SS3 == ch ){
91 	eucPenalty++;
92 	break;
93       }
94       if( !IsEucByte( ch ) ) {
95 	eucPenalty++;
96 	break;
97       }
98       if( ++i >= length )
99 	break;
100       ch = str[ i ];
101       if( !IsEucByte( ch ) ) {
102 	eucPenalty++;
103 	break;
104       }
105     }
106   }
107   if( 0 == eucPenalty ) {
108     if (language)
109       return DetermineEUC(language, defaultEuc);
110     else if ( AUTOSELECT != defaultEuc )
111       return defaultEuc;
112   }
113 
114   for( i = 0 ; i < length ; i++ ){
115     /*
116      * check for euc-japan or euc-taiwan
117      */
118     ch = str[ i ];
119     if( 0x80 & ch ){
120       if( SS2 == ch ){
121 	if( ++i >= length )
122 	  break;
123 	ch = str[ i ];
124 	if( !IsKatakanaByte( ch ) ) {
125 	  eucjapanPenalty++;
126 	  break;
127 	}
128 	continue;
129       }
130       if( SS3 == ch ){
131 	if( ++i >= length )
132 	  break;
133 	ch = str[ i ];
134       }
135       if( !IsEucByte( ch ) ) {
136 	eucjapanPenalty++;
137 	break;
138       }
139       if( ++i >= length )
140 	break;
141       ch = str[ i ];
142       if( !IsEucByte( ch ) ) {
143 	eucjapanPenalty++;
144 	break;
145       }
146     }
147   }
148   if( 0 == eucjapanPenalty ) {
149     if (language) {
150       if( !strncmp(language, "ja", 2) )
151 	return EUC_JAPAN;
152       else
153 	return EUC_TAIWAN;
154     } else {
155       return EUC_JAPAN; /* XXX */
156     }
157   }
158 
159   for( i = 0 ; i < length ; i++ ){
160     /*
161      * check for big5
162      */
163     ch = str[ i ];
164     if( 0x80 & ch ){
165       if( !IsBig5Byte1( ch ) ) {
166 	big5Penalty++;
167 	break;
168       }
169       ch = str[ ++i ];
170       if( i >= length )
171 	break;
172       if( !IsBig5Byte2( ch ) ) {
173 	big5Penalty++;
174 	break;
175       }
176     }
177   }
178   if( 0 == big5Penalty )
179     return BIG_FIVE;
180 
181   for( i = 0 ; i < length ; i++ ){
182     /*
183      * check for shift-jis
184      */
185     ch = str[ i ];
186     if( 0x80 & ch ){
187       if( IsKatakanaByte( ch ) )
188 	continue;
189       if( !IsShiftJisByte1( ch ) ) {
190 	sjisPenalty++;
191 	break;
192       }
193       ch = str[ ++i ];
194       if( i >= length )
195 	break;
196       if( !IsShiftJisByte2( ch ) ) {
197 	sjisPenalty++;
198 	break;
199       }
200     }
201   }
202   if( 0 == sjisPenalty )
203     return SHIFT_JIS;
204 
205   return ISO_8859_1;
206 }
207 
GuessCodingSystem(byte * str,int length,byte defaultEuc)208 public byte GuessCodingSystem( byte *str, int length, byte defaultEuc )
209 {
210   int i;
211   byte ch;
212 #ifdef HAVE_SETLOCALE
213   byte c;
214   char *nl, language[6];
215   int use_locale = 1;
216 
217   nl = setlocale( LC_CTYPE, "" );
218   if (NULL == nl || !strcmp("C", nl))
219     use_locale = 0;
220 
221   if (use_locale) {
222 #ifndef MSDOS /* IF NOT DEFINED */
223     /*
224      * Since UTF-8 is a strict coding system, it is unlikely that
225      * a non-UTF-8 file is accidently recognized as a UTF-8 file.
226      * Thus, UTF-8 test is performed first regardless of locale.
227      */
228     if( 1 == isUTF8( str, length ) )
229       return UTF_8;
230 #endif /* MSDOS */
231     /*
232      * Now, the file is not UTF-8.  In this case, separate algorithms
233      * are used for east Asian locales (where multibyte coding systems
234      * are expected to be used and further guessing may be possible) and
235      * other locales (where 8bit coding systems are used and further
236      * guessing is almost impossible).
237      */
238     c = LocaleCodingSystem(language);
239 
240     if( !strncmp(language, "ja", 2) || !strncmp(language, "ko", 2) ||
241 	!strncmp(language, "zh", 2) ) {
242       /*
243        * In case of east Asian locales.  If the user has a certain
244        * preference on EUC coding system, the preference is used.
245        * If the file is EUC-compliant, coding system is determined
246        * by locale.
247        */
248       return GuessCodingSystem_EastAsia(str, length, defaultEuc, language);
249     }
250 
251     /*
252      * In case of other than east Asian locales. (not multibyte)
253      */
254     if( UTF_8 != defaultEuc && AUTOSELECT != defaultEuc )
255       /*
256        * If the user has a certain preference on coding system,
257        * the will is repected.  However, it is already known that
258        * the file is not UTF-8.
259        */
260       return defaultEuc;
261 
262     if( UTF_8 == c ) {
263       /*
264        * When the environment is UTF-8 locale while the file is
265        * not UTF-8, coding system is guessed from language/country
266        * part of the current locale.
267        */
268       return Determine8bit( language );
269     }
270 
271     /*
272      * When the environment is not UTF-8 locale, coding system is
273      * determined by the current locale.
274      */
275     return c;
276   } else
277 #endif /* HAVE_SETLOCALE */
278   {
279 #ifndef MSDOS /* IF NOT DEFINED */
280     /*
281      * Since UTF-8 is a strict coding system, it is unlikely that
282      * a non-UTF-8 file is accidently recognized as a UTF-8 file.
283      * Thus, UTF-8 test is performed first regardless of locale.
284      */
285     if( 1 == isUTF8( str, length ) )
286       return UTF_8;
287 #endif /* MSDOS */
288 
289     return GuessCodingSystem_EastAsia(str, length, defaultEuc, NULL);
290   }
291 }
292 
GuessHz(byte * str,int length)293 public byte GuessHz( byte *str, int length )
294 {
295   boolean_t gb = FALSE;
296   int i, hzPenalty = 0;
297   byte ch;
298 
299   for( i = 0 ; i < length ; i++ ){
300     /*
301      * check for HZ
302      */
303     ch = str[ i ];
304     if( '~' == ch ){
305       if( ++i >= length )
306 	break;
307       ch = str[ i ];
308       if( FALSE == gb ){
309 	if( '{' == ch ){
310 	  gb = TRUE;
311 	} else if( '~' == ch || '}' == ch ){
312 	  /* do nothing */
313 	} else
314 	  hzPenalty++;
315       } else {
316 	if( '}' == ch )
317 	  gb = FALSE;
318       }
319     }
320   }
321 
322   return 0 == hzPenalty;
323 }
324 
325 
AdjustPatternCharset(byte inputCodingSystem,byte keyboardCodingSystem,byte defaultCodingSystem,i_str_t * istr)326 public void AdjustPatternCharset( byte inputCodingSystem,
327 				 byte keyboardCodingSystem,
328 				 byte defaultCodingSystem,
329 				 i_str_t *istr )
330 {
331   if( FALSE == adjust_charset )
332     return;
333 
334 #ifndef MSDOS /* IF NOT DEFINED */
335   if( IsUtfEncoding( inputCodingSystem ) || IsUtfEncoding( keyboardCodingSystem ) ){
336     if( IsUtfEncoding( inputCodingSystem ) && !IsUtfEncoding( keyboardCodingSystem ) )
337       ConvertToUNI( istr );
338     else if( !IsUtfEncoding( inputCodingSystem ) && IsUtfEncoding( keyboardCodingSystem ) ){
339       if( AUTOSELECT == inputCodingSystem )
340 	inputCodingSystem = defaultCodingSystem;
341       ConvertFromUNI( istr, inputCodingSystem );
342     }
343   } else
344 #endif /* MSDOS */
345     if( BIG_FIVE == inputCodingSystem && BIG_FIVE != keyboardCodingSystem ){
346       ConvertCNStoBIG5( istr );
347     } else if( BIG_FIVE != inputCodingSystem && BIG_FIVE == keyboardCodingSystem ){
348       ConvertBIG5toCNS( istr );
349     }
350 }
351