1 // Copyright 2009 The Archiveopteryx Developers <info@aox.org>
2 
3 #include "iso2022kr.h"
4 
5 #include "ustring.h"
6 
7 
8 static const uint toU[94][94] = {
9 #include "ksc5601.inc"
10 };
11 
12 static const uint toE[65536] = {
13 #include "ksc5601-rev.inc"
14 };
15 
16 
17 /*! \class Iso2022KrCodec iso2022kr.h
18 
19     This class implements a translator between Unicode and the KS C 5601
20     1992 character set using the ISO-2022-KR encoding, as described in
21     RFC 1557.
22 
23     Unlike ISO-2022-JP, this encoding uses a single escape sequence
24     to identify the KS C 5601-1992 charset, and then SI/SO to switch
25     between that and ASCII. The encoding uses an escape code only to
26     identify "lines" that contain SO (i.e. KS C 5601 characters), but
27     some documents may include this only once at the beginning.
28 
29     Apparently, iso-2022-kr is not used in message headers, where
30     EUC-KR is preferred instead.
31 */
32 
33 /*! Creates a new Iso2022KrCodec object. */
34 
Iso2022KrCodec()35 Iso2022KrCodec::Iso2022KrCodec()
36     : Codec( "ISO-2022-KR" )
37 {
38 }
39 
40 
41 /*! Returns the ISO-2022-KR-encoded representation of the UString
42     \a u.
43 */
44 
fromUnicode(const UString & u)45 EString Iso2022KrCodec::fromUnicode( const UString &u )
46 {
47     EString s;
48 
49     enum { ASCII, KSC } mode = ASCII;
50 
51     // RFC 1557 says that ESC$)C must appear once at the beginning of a
52     // line before any SO occurs, so we always emit one at the start.
53 
54     s.append( 0x1B );
55     s.append( "$)C" );
56 
57     uint i = 0;
58     while ( i < u.length() ) {
59         uint n = u[i];
60 
61         if ( n < 128 ) {
62             if ( mode == KSC ) {
63                 s.append( 0x0F );
64                 mode = ASCII;
65             }
66             if ( n == 0x1B || n == 0x0E || n == 0x0F ) {
67                 recordError( i );
68                 break;
69             }
70             s.append( (char)n );
71         }
72         else if ( n < 65536 && toE[n] != 0 ) {
73             if ( mode == ASCII ) {
74                 s.append( 0x0E );
75                 mode = KSC;
76             }
77             n = toE[n];
78             s.append( ( n >> 8 ) );
79             s.append( ( n & 0xff ) );
80         }
81         else {
82             recordError( i );
83         }
84         i++;
85     }
86 
87     return s;
88 }
89 
90 
91 /*! Returns the Unicode representation of the EString \a s. */
92 
toUnicode(const EString & s)93 UString Iso2022KrCodec::toUnicode( const EString &s )
94 {
95     UString u;
96 
97     enum { ASCII, KSC } mode = ASCII;
98 
99     uint n = 0;
100     while ( n < s.length() ) {
101         char c = s[n];
102 
103         if ( c == 0x1b ) {
104             if ( s[n+1] == '$' && s[n+2] == ')' && s[n+3] == 'C' ) {
105                 // We don't do anything with this valid escape.
106             }
107             else {
108                 // We ignore any unknown escape sequences.
109                 recordError( n, s );
110             }
111             n += 2;
112         }
113         else if ( mode == ASCII ) {
114             if ( c == 0x0E ) {
115                 mode = KSC;
116             }
117             else if ( c == 0x0F ) {
118                 recordError( n, s );
119                 u.append( 0xFFFD );
120             }
121             else {
122                 u.append( c );
123             }
124         }
125         else if ( mode == KSC ) {
126             int ku = c;
127             int ten = s[n+1];
128 
129             if ( c == 0x0E ) {
130                 mode = ASCII;
131             }
132             else if ( ten == 0x1B ) {
133                 // Single byte
134                 recordError( n, s );
135                 u.append( 0xFFFD );
136             }
137             else {
138                 // Double byte, of whatever legality
139                 uint cp = 0xFFFD;
140                 ku -= 33;
141                 ten -= 33;
142                 if ( ku > 93 || ten > 93 )
143                     recordError( n, s );
144                 else if ( toU[ku][ten] == 0xFFFD )
145                     recordError( n, ku * 94 + ten );
146                 else
147                     cp = toU[ku][ten];
148                 u.append( cp );
149                 n++;
150             }
151         }
152 
153         n++;
154     }
155 
156     return u;
157 }
158 
159 //Nothing for charset.pl (yet).
160 //(codec ISO-2022-KR Iso2022KrCodec)
161