1 /**
2 * Yudit Unicode Editor Source File
3 *
4 * GNU Copyright (C) 1997-2006 Gaspar Sinai <gaspar@yudit.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License, version 2,
 * dated June 1991. See file COPYING for details.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19
20 #include "stoolkit/sencoder/SB_X11_KSC.h"
21 #include "stoolkit/SString.h"
22 #include "stoolkit/SStringVector.h"
23
24 #define SS_ESC 27
25
26 /**
27 * This is a sample (base) implementation of the core encoding class
28 * @author: Gaspar Sinai <gaspar@yudit.org>
29 * @version: 2000-05-12
 * E2 80 A8 and E2 80 A9 are the line and paragraph separators in UTF-8 (U+2028, U+2029)
31 */
SB_X11_KSC()32 SB_X11_KSC::SB_X11_KSC() : SBEncoder ("\n,\r\n,\r"), ksc_5601_r ("ksc-5601-r")
33 {
34 ok = ksc_5601_r.isOK();
35 }
36
~SB_X11_KSC()37 SB_X11_KSC::~SB_X11_KSC ()
38 {
39 }
40
41 /**
42 * return false if this generic encoder does not exist.
43 */
44 bool
isOK() const45 SB_X11_KSC::isOK() const
46 {
47 return ok;
48 }
49
50 /**
51 * This is encoding a unicode string into a bytestring
52 * @param input is a unicode string.
53 */
54 const SString&
encode(const SV_UCS4 & input)55 SB_X11_KSC::encode (const SV_UCS4& input)
56 {
57 const SS_UCS4* in = input.array();
58 sstring.clear();
59 sstring.ensure(input.size()*2);
60 SS_UCS2 got;
61 SUniMap* current=0;
62
63 for (unsigned int i=0; i<input.size(); i++)
64 {
65 if (in[i] < 0x80)
66 {
67 if (current != 0)
68 {
69 sstring.append ((char) SS_ESC);
70 sstring.append ((char) '(');
71 sstring.append ((char) 'B');
72 }
73 current=0;
74 sstring.append ((char) in[i]);
75 continue;
76 }
77
78 if (ksc_5601_r.isOK() && (got=ksc_5601_r.encode (in[i])) != 0)
79 {
80 if ((got&0xff00) > 0xa000 && (got&0xff00) < 0xff00
81 && (got&0xff) > 0xa0 && (got&0xff) < 0xff)
82 {
83 if (current==0)
84 {
85 sstring.append ((char) SS_ESC);
86 sstring.append ((char) '$');
87 sstring.append ((char) '(');
88 sstring.append ((char) 'C');
89 }
90 current=&ksc_5601_r;
91 sstring.append ((char) ((got&0x7f00)>>8));
92 sstring.append ((char) (got&0x7f));
93 continue;
94 }
95 }
96 quoteString (in[i]);
97 }
98 if (current != 0)
99 {
100 sstring.append ((char) SS_ESC);
101 sstring.append ((char) '(');
102 sstring.append ((char) 'B');
103 }
104 return sstring;
105 }
106
107 /**
108 * Decode an input string into a unicode string.
109 * @param input is a string.
110 * he output can be null, in this case a line is not
111 * read fully. If input size is zero output will be flushed.
112 */
113 const SV_UCS4&
decode(const SString & input)114 SB_X11_KSC::decode (const SString& input)
115 {
116 const unsigned char* in = (unsigned char*) input.array();
117 ucs4string.clear();
118 ucs4string.ensure(input.size());
119 SS_UCS4 got;
120 SUniMap* current=0;
121
122 for (unsigned i=0; i<input.size(); i++)
123 {
124 if (input.size() > i+3 && in[i] == SS_ESC
125 && in[i+1] == '$'
126 && in[i+2] == '('
127 && in[i+3] == 'C')
128 {
129 current=&ksc_5601_r; // Korean
130 i++; i++; i++;
131 continue;
132 }
133 if (input.size() > i+2 && in[i] == SS_ESC && in[i+1] == '('
134 && in[i+2] == 'B')
135 {
136 current=0; // ASCII
137 i++; i++;
138 continue;
139 }
140 if (input.size() > i+3 && in[i] == SS_ESC && in[i+1] == '('
141 && in[i+2] == 'A' && in[i+3] > 0x20 && in[i+3] <0x7f)
142 {
143 current=0; // 1 byte ASCII
144 i++; i++; i++;
145 ucs4string.append ((SS_UCS4) in[i]);
146 continue;
147 }
148
149 // It should not happen but it does.
150 if (in[i] < ' ') current=0;
151
152 if (current!=0)
153 {
154 if ( input.size() > i+1 && in[i]>0x20 && in[i] < 0x7F
155 && in[i+1] > 0x20 && in[i+1] < 0x7F)
156 {
157 got = current->decode (((((SS_UCS2)in[i]<< 8) | in[i+1]) | 0x8080 )) ;
158 }
159 else
160 {
161 got = 0;
162 }
163 if (got != 0)
164 {
165 ucs4string.append (got);
166 }
167 else
168 {
169 quoteUCS4 ((unsigned char) in[i]);
170 quoteUCS4 ((unsigned char) in[i+1]);
171 }
172 i++;
173 continue;
174 }
175 ucs4string.append ((SS_UCS4) in[i]);
176 continue;
177 }
178 return ucs4string;
179 }
180
181
182 /**
183 * These methods guess the line delimiters for the input
184 * The one without arguments is giving the 'first approximation'
185 * It returns an inclusive list of all possibilities.
186 */
187 const SStringVector&
delimiters()188 SB_X11_KSC::delimiters ()
189 {
190 return realDelimiters;
191 }
192
193 /**
194 * These methods guess the line delimiters for the input
195 * The one without arguments is giving the 'first approximation'
196 * It returns an exact list
197 */
198 const SStringVector&
delimiters(const SString & sample)199 SB_X11_KSC::delimiters (const SString& sample)
200 {
201 return sampleDelimiters;
202 }
203