1 /**
2 * Yudit Unicode Editor Source File
3 *
4 * GNU Copyright (C) 1997-2006 Gaspar Sinai <gaspar@yudit.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License, version 2,
8 * dated June 1991. See file COPYYING for details.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19
20 #include "stoolkit/SString.h"
21 #include "stoolkit/SBinHashtable.h"
22 #include "stoolkit/SEncoder.h"
23 #include "stoolkit/sencoder/SBEncoder.h"
24 #include "stoolkit/sencoder/SB_UTF8.h"
25 #include "stoolkit/sencoder/SB_Java.h"
26 #include "stoolkit/sencoder/SB_NCR.h"
27 #include "stoolkit/sencoder/SB_UTF7.h"
28 #include "stoolkit/sencoder/SB_Generic.h"
29 #include "stoolkit/sencoder/SB_EUC_JP.h"
30 #include "stoolkit/sencoder/SB_S_JIS.h"
31 #include "stoolkit/sencoder/SB_X11_JP.h"
32 #include "stoolkit/sencoder/SB_ISO2022_JP.h"
33 #include "stoolkit/sencoder/SB_X11_HZ.h"
34 #include "stoolkit/sencoder/SB_GB2312_8.h"
35 #include "stoolkit/sencoder/SB_GB18030.h"
36 #include "stoolkit/sencoder/SB_HZ.h"
37 #include "stoolkit/sencoder/SB_X11_KSC.h"
38 #include "stoolkit/sencoder/SB_EUC_KR.h"
39 #include "stoolkit/sencoder/SB_UHC.h"
40 #include "stoolkit/sencoder/SB_Johab.h"
41 #include "stoolkit/sencoder/SB_BIG5.h"
42 #include "stoolkit/sencoder/SB_UCS2.h"
43 #include "stoolkit/sencoder/SB_UInput.h"
44 #include "stoolkit/sencoder/SB_DeShape.h"
45 #include "stoolkit/sencoder/SB_BiDi.h"
46 #include "stoolkit/sencoder/SB_S_JIS0213.h"
47 #include "stoolkit/sencoder/SB_EUC_JP0213.h"
48 #include "stoolkit/sencoder/SB_ISO2022_JP3.h"
49 #include "stoolkit/SExcept.h"
50 #include "stoolkit/SUniMap.h"
51
52
53 static SStringVector _built_in(
54 "utf-8,utf-8-s,utf-7,java,java-s,ncr,ucs-2,ucs-2-le,ucs-2-be,utf-16,utf-16-le,utf-16-be,euc-jp,euc-jp-3,euc-kr,big-5,hz,iso-2022-x11,ksc-5601-x11,gb-18030,gb-2312-x11,gb-2312,iso-2022-jp,iso-2022-jp-3,shift-jis,shift-jis-3,uhc,johab,unicode,bidi"
55 );
56 /**
57 * Vector all the build-in encodings.
58 */
59 const SStringVector&
builtin()60 SEncoder::builtin()
61 {
62 return _built_in;
63 }
64
65 /**
66 * return all the external maps available
67 */
68 SStringVector
external()69 SEncoder::external()
70 {
71 SBinHashtable<int> mentioned;
72 for (unsigned int i=0; i<_built_in.size(); i++)
73 {
74 mentioned.put (_built_in[i], 1);
75 }
76 SStringVector ext = SUniMap::list();
77 SStringVector ret;
78 for (unsigned int j=0; j<ext.size(); j++)
79 {
80 if (mentioned.get(ext[j])!=0) continue;
81 mentioned.put (ext[j], 1);
82 ret.append (ext[j]);
83 }
84 return SStringVector(ret);
85 }
86
87 /**
88 * Try to find the converter. Default is utf-8
89 * New SBEncoder sould be added here.
90 */
91 void
load()92 SEncoder::load()
93 {
94 ok = true;
95 if (name == "utf-8")
96 {
97 delegate = new SB_UTF8(false);
98 }
99 else if (name == "utf-8-s")
100 {
101 delegate = new SB_UTF8(true); /* surrogate will be treated as normal char */
102 }
103 else if (name == "java")
104 {
105 delegate = new SB_Java(false);
106 }
107 else if (name == "java-s") /* surrogate will be treated as normal char */
108 {
109 delegate = new SB_Java(true);
110 }
111 else if (name == "ncr")
112 {
113 delegate = new SB_NCR();
114 }
115 else if (name == "utf-7")
116 {
117 delegate = new SB_UTF7();
118 }
119 else if (name == "gb-18030")
120 {
121 SB_GB18030* gb18030 = new SB_GB18030();
122 ok = gb18030->isOK();
123 delegate = gb18030;
124 }
125 else if (name == "big-5")
126 {
127 SB_BIG5* big_5 = new SB_BIG5();
128 ok = big_5->isOK();
129 delegate = big_5;
130 }
131 else if (name == "euc-jp")
132 {
133 SB_EUC_JP* euc_jp = new SB_EUC_JP();
134 ok = euc_jp->isOK();
135 delegate = euc_jp;
136 }
137 else if (name == "euc-jp-3")
138 {
139 SB_EUC_JP0213* euc_jp0213 = new SB_EUC_JP0213();
140 ok = euc_jp0213->isOK();
141 delegate = euc_jp0213;
142 }
143 else if (name == "euc-kr")
144 {
145 SB_EUC_KR* euc_kr = new SB_EUC_KR();
146 ok = euc_kr->isOK();
147 delegate = euc_kr;
148 }
149 else if (name == "uhc")
150 {
151 SB_UHC* uhc = new SB_UHC();
152 ok = uhc->isOK();
153 delegate = uhc;
154 }
155 else if (name == "ucs-2")
156 {
157 delegate = new SB_UCS2(SB_UCS2::AUTO_END, false);
158 }
159 /* I don't know why, it is all mixed up. workaround - mix them up */
160 else if (name == "ucs-2-be")
161 {
162 delegate = new SB_UCS2(SB_UCS2::LITTLE_END, false);
163 }
164 else if (name == "ucs-2-le")
165 {
166 delegate = new SB_UCS2(SB_UCS2::BIG_END, false);
167 }
168 else if (name == "utf-16")
169 {
170 delegate = new SB_UCS2(SB_UCS2::AUTO_END, true);
171 }
172 /* I don't know why, it is all mixed up. workaround - mix them up */
173 else if (name == "utf-16-be")
174 {
175 delegate = new SB_UCS2(SB_UCS2::LITTLE_END, true);
176 }
177 else if (name == "utf-16-le")
178 {
179 delegate = new SB_UCS2(SB_UCS2::BIG_END, true);
180 }
181 else if (name == "johab")
182 {
183 SB_Johab* johab = new SB_Johab();
184 ok = johab->isOK();
185 delegate = johab;
186 }
187 else if (name == "iso-2022-jp")
188 {
189 SB_ISO2022_JP* iso2022_jp = new SB_ISO2022_JP();
190 ok = iso2022_jp->isOK();
191 delegate = iso2022_jp;
192 }
193 else if (name == "iso-2022-jp-3")
194 {
195 SB_ISO2022_JP3* iso2022_jp3 = new SB_ISO2022_JP3();
196 ok = iso2022_jp3->isOK();
197 delegate = iso2022_jp3;
198 }
199 else if (name == "iso-2022-x11")
200 {
201 SB_X11_JP* x11_jp = new SB_X11_JP();
202 ok = x11_jp->isOK();
203 delegate = x11_jp;
204 }
205 else if (name == "shift-jis")
206 {
207 SB_S_JIS* s_jis = new SB_S_JIS();
208 ok = s_jis->isOK();
209 delegate = s_jis;
210 }
211 else if (name == "shift-jis-3")
212 {
213 SB_S_JIS0213* s_jis0213 = new SB_S_JIS0213();
214 ok = s_jis0213->isOK();
215 delegate = s_jis0213;
216 }
217 else if (name == "shift-jis-0213") /* alias to hide shift-jis-0213.my */
218 {
219 SB_S_JIS0213* s_jis0213 = new SB_S_JIS0213();
220 ok = s_jis0213->isOK();
221 delegate = s_jis0213;
222 }
223 else if (name == "gb-2312-x11")
224 {
225 SB_X11_HZ* x11_hz = new SB_X11_HZ();
226 ok = x11_hz->isOK();
227 delegate = x11_hz;
228 }
229 else if (name == "gb-2312")
230 {
231 SB_GB2312_8* gb_2312_8 = new SB_GB2312_8();
232 ok = gb_2312_8->isOK();
233 delegate = gb_2312_8;
234 }
235 else if (name == "ksc-5601-x11")
236 {
237 SB_X11_KSC* gb_x11_ksc = new SB_X11_KSC();
238 ok = gb_x11_ksc->isOK();
239 delegate = gb_x11_ksc;
240 }
241 else if (name == "hz")
242 {
243 SB_HZ* hz = new SB_HZ();
244 ok = hz->isOK();
245 delegate = hz;
246 }
247 else if (name == "unicode")
248 {
249 SB_UInput* uni = new SB_UInput();
250 ok = true;
251 delegate = uni;
252 }
253 else if (name == "deshape")
254 {
255 SB_DeShape* deshape = new SB_DeShape();
256 ok = deshape->isOK();
257 delegate = deshape;
258 }
259 else if (name == "bidi")
260 {
261 SB_BiDi* bidi = new SB_BiDi();
262 ok = bidi->isOK();
263 delegate = bidi;
264 }
265 else
266 {
267 SB_Generic* g = new SB_Generic(name);
268 ok = g->isOK();
269 if (ok)
270 {
271 delegate = g;
272 }
273 else
274 {
275 delete g;
276 delegate = new SB_UTF8(false);
277 }
278 }
279 }
280
281 /**
282 * Create a utf-8 converter
283 */
SEncoder(void)284 SEncoder::SEncoder (void)
285 {
286 name = "utf-8";
287 ok = true;
288 load();
289 }
290
291 /**
292 * return false if something is wrong with the map:
293 * The map not found or similar
294 */
295 bool
isOK() const296 SEncoder::isOK () const
297 {
298 return ok;
299 }
300
301 /**
302 * Create a converter with a name
303 * @param name is either a valid name
304 * or a map
305 */
SEncoder(const SString & _name)306 SEncoder::SEncoder (const SString& _name)
307 {
308 name = _name;
309 ok = true;
310 load ();
311 }
312
/**
 * Copy constructor. Only the name is taken from the original; a fresh
 * delegate is created for it by load(), which also sets 'ok'.
 * @param c is the encoder whose name is used.
 */
SEncoder::SEncoder (const SEncoder& c)
{
  const SString& wanted = c.getName();
  name = wanted;
  load ();
}
318
319 SEncoder&
operator =(const SEncoder & c)320 SEncoder::operator = (const SEncoder& c)
321 {
322 if (this != &c)
323 {
324 delete ((SBEncoder*) delegate);
325 name = c.getName();
326 load ();
327 clear();
328 }
329 return *this;
330 }
331
~SEncoder()332 SEncoder::~SEncoder ()
333 {
334 delete ((SBEncoder*) delegate);
335 }
336
337 const SString&
getName() const338 SEncoder::getName() const
339 {
340 return name;
341 }
342
343 /**
344 * This is encoding a unicode string into a bytestring
345 * @param input is a unicode string.
346 */
347 const SString&
encode(const SV_UCS4 & input)348 SEncoder::encode (const SV_UCS4& input)
349 {
350 return ((SBEncoder*) delegate)->encode (input);
351 }
352
353 void
clear()354 SEncoder::clear()
355 {
356 buffer.clear();
357 delim.clear();
358 remaining.clear();
359 ((SBEncoder*) delegate)->clear();
360 }
361 /**
362 * Decode an input string into a unicode string.
363 * @param input is a string.
364 * he output can be null, in this case a line is not
365 * read fully. If input size is zero output will be flushed.
366 */
367 const SV_UCS4&
decode(const SString & input,bool more)368 SEncoder::decode (const SString& input, bool more)
369 {
370 if (delim.size() == 0 && input.size()!=0)
371 {
372 ((SBEncoder*) delegate)->delimiters(input);
373 }
374 buffer.append (input);
375 /**
376 * We need more input for the delimiter?
377 */
378 if (delim.size() != 0 && more)
379 {
380 unsigned int i;
381 /* there is a potential bug here - r n should be specified
382 in front of r or n */
383 for (i=0; i<delim.size(); i++)
384 {
385 if (buffer.find (delim[i]) >= 0) break;
386 }
387 retUCS4.clear();
388 if (i==delim.size()) return retUCS4;
389 }
390 retUCS4 = ((SBEncoder*) delegate)->decode (buffer);
391 SV_UCS4 additional;
392 if (!more)
393 {
394 additional = ((SBEncoder*) delegate)->decode("");
395 retUCS4.append (additional);
396
397 }
398 buffer.clear();
399 return retUCS4;
400 }
401
402 /**
403 * return key value map to see what decodes to what
404 * @param key will contain the keys
405 * @param value will contain the values
406 * @param _size is the maximum size of returned arrays
407 * @return the real size of the arrays.
408 */
409 unsigned int
getDecoderMap(SStringVector * key,SStringVector * value,unsigned int _size)410 SEncoder::getDecoderMap (SStringVector* key, SStringVector* value,
411 unsigned int _size)
412 {
413 return ((SBEncoder*) delegate)->getDecoderMap (key, value, _size);
414 }
415
416 /* for non-clustering it is remainder */
417 SString
preEditBuffer() const418 SEncoder::preEditBuffer() const
419 {
420 SString rm = ((SBEncoder*) delegate)->preEditBuffer();
421 rm.append (buffer);
422 return SString(rm);
423 }
424 /* for clustering */
425 SV_UCS4
postEditBuffer() const426 SEncoder::postEditBuffer () const
427 {
428 return ((SBEncoder*) delegate)->postEditBuffer();
429 }
430
431 /**
432 * These methods guess the line delimiters for the input
433 * The one without arguments is giving the 'first approximation'
434 * It returns an inclusive list of all possibilities.
435 */
436 const SStringVector&
delimiters()437 SEncoder::delimiters ()
438 {
439 return ((SBEncoder*) delegate)->delimiters();
440 }
441
442 SObject*
clone() const443 SEncoder::clone() const
444 {
445 SEncoder* n = new SEncoder(name);
446 CHECK_NEW (n);
447 return n;
448 }
449