1 // Copyright 2009 The Archiveopteryx Developers <info@aox.org>
2
3 #include "iso2022kr.h"
4
5 #include "ustring.h"
6
7
8 static const uint toU[94][94] = {
9 #include "ksc5601.inc"
10 };
11
12 static const uint toE[65536] = {
13 #include "ksc5601-rev.inc"
14 };
15
16
17 /*! \class Iso2022KrCodec iso2022kr.h
18
    This class implements a translator between Unicode and the
    KS C 5601-1992 character set using the ISO-2022-KR encoding, as
    described in RFC 1557.
22
23 Unlike ISO-2022-JP, this encoding uses a single escape sequence
24 to identify the KS C 5601-1992 charset, and then SI/SO to switch
25 between that and ASCII. The encoding uses an escape code only to
26 identify "lines" that contain SO (i.e. KS C 5601 characters), but
27 some documents may include this only once at the beginning.
28
29 Apparently, iso-2022-kr is not used in message headers, where
30 EUC-KR is preferred instead.
31 */
32
33 /*! Creates a new Iso2022KrCodec object. */
34
Iso2022KrCodec()35 Iso2022KrCodec::Iso2022KrCodec()
36 : Codec( "ISO-2022-KR" )
37 {
38 }
39
40
41 /*! Returns the ISO-2022-KR-encoded representation of the UString
42 \a u.
43 */
44
fromUnicode(const UString & u)45 EString Iso2022KrCodec::fromUnicode( const UString &u )
46 {
47 EString s;
48
49 enum { ASCII, KSC } mode = ASCII;
50
51 // RFC 1557 says that ESC$)C must appear once at the beginning of a
52 // line before any SO occurs, so we always emit one at the start.
53
54 s.append( 0x1B );
55 s.append( "$)C" );
56
57 uint i = 0;
58 while ( i < u.length() ) {
59 uint n = u[i];
60
61 if ( n < 128 ) {
62 if ( mode == KSC ) {
63 s.append( 0x0F );
64 mode = ASCII;
65 }
66 if ( n == 0x1B || n == 0x0E || n == 0x0F ) {
67 recordError( i );
68 break;
69 }
70 s.append( (char)n );
71 }
72 else if ( n < 65536 && toE[n] != 0 ) {
73 if ( mode == ASCII ) {
74 s.append( 0x0E );
75 mode = KSC;
76 }
77 n = toE[n];
78 s.append( ( n >> 8 ) );
79 s.append( ( n & 0xff ) );
80 }
81 else {
82 recordError( i );
83 }
84 i++;
85 }
86
87 return s;
88 }
89
90
91 /*! Returns the Unicode representation of the EString \a s. */
92
toUnicode(const EString & s)93 UString Iso2022KrCodec::toUnicode( const EString &s )
94 {
95 UString u;
96
97 enum { ASCII, KSC } mode = ASCII;
98
99 uint n = 0;
100 while ( n < s.length() ) {
101 char c = s[n];
102
103 if ( c == 0x1b ) {
104 if ( s[n+1] == '$' && s[n+2] == ')' && s[n+3] == 'C' ) {
105 // We don't do anything with this valid escape.
106 }
107 else {
108 // We ignore any unknown escape sequences.
109 recordError( n, s );
110 }
111 n += 2;
112 }
113 else if ( mode == ASCII ) {
114 if ( c == 0x0E ) {
115 mode = KSC;
116 }
117 else if ( c == 0x0F ) {
118 recordError( n, s );
119 u.append( 0xFFFD );
120 }
121 else {
122 u.append( c );
123 }
124 }
125 else if ( mode == KSC ) {
126 int ku = c;
127 int ten = s[n+1];
128
129 if ( c == 0x0E ) {
130 mode = ASCII;
131 }
132 else if ( ten == 0x1B ) {
133 // Single byte
134 recordError( n, s );
135 u.append( 0xFFFD );
136 }
137 else {
138 // Double byte, of whatever legality
139 uint cp = 0xFFFD;
140 ku -= 33;
141 ten -= 33;
142 if ( ku > 93 || ten > 93 )
143 recordError( n, s );
144 else if ( toU[ku][ten] == 0xFFFD )
145 recordError( n, ku * 94 + ten );
146 else
147 cp = toU[ku][ten];
148 u.append( cp );
149 n++;
150 }
151 }
152
153 n++;
154 }
155
156 return u;
157 }
158
159 //Nothing for charset.pl (yet).
160 //(codec ISO-2022-KR Iso2022KrCodec)
161