1 /**
2  *  Yudit Unicode Editor Source File
3  *
4  *  GNU Copyright (C) 1997-2006  Gaspar Sinai <gaspar@yudit.org>
5  *
6  *  This program is free software; you can redistribute it and/or modify
7  *  it under the terms of the GNU General Public License, version 2,
8  *  dated June 1991. See file COPYYING for details.
9  *
10  *  This program is distributed in the hope that it will be useful,
11  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  *  GNU General Public License for more details.
14  *
15  *  You should have received a copy of the GNU General Public License
16  *  along with this program; if not, write to the Free Software
17  *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18  */
19 
20 #include "stoolkit/SString.h"
21 #include "stoolkit/SBinHashtable.h"
22 #include "stoolkit/SEncoder.h"
23 #include "stoolkit/sencoder/SBEncoder.h"
24 #include "stoolkit/sencoder/SB_UTF8.h"
25 #include "stoolkit/sencoder/SB_Java.h"
26 #include "stoolkit/sencoder/SB_NCR.h"
27 #include "stoolkit/sencoder/SB_UTF7.h"
28 #include "stoolkit/sencoder/SB_Generic.h"
29 #include "stoolkit/sencoder/SB_EUC_JP.h"
30 #include "stoolkit/sencoder/SB_S_JIS.h"
31 #include "stoolkit/sencoder/SB_X11_JP.h"
32 #include "stoolkit/sencoder/SB_ISO2022_JP.h"
33 #include "stoolkit/sencoder/SB_X11_HZ.h"
34 #include "stoolkit/sencoder/SB_GB2312_8.h"
35 #include "stoolkit/sencoder/SB_GB18030.h"
36 #include "stoolkit/sencoder/SB_HZ.h"
37 #include "stoolkit/sencoder/SB_X11_KSC.h"
38 #include "stoolkit/sencoder/SB_EUC_KR.h"
39 #include "stoolkit/sencoder/SB_UHC.h"
40 #include "stoolkit/sencoder/SB_Johab.h"
41 #include "stoolkit/sencoder/SB_BIG5.h"
42 #include "stoolkit/sencoder/SB_UCS2.h"
43 #include "stoolkit/sencoder/SB_UInput.h"
44 #include "stoolkit/sencoder/SB_DeShape.h"
45 #include "stoolkit/sencoder/SB_BiDi.h"
46 #include "stoolkit/sencoder/SB_S_JIS0213.h"
47 #include "stoolkit/sencoder/SB_EUC_JP0213.h"
48 #include "stoolkit/sencoder/SB_ISO2022_JP3.h"
49 #include "stoolkit/SExcept.h"
50 #include "stoolkit/SUniMap.h"
51 
52 
53 static SStringVector _built_in(
54 "utf-8,utf-8-s,utf-7,java,java-s,ncr,ucs-2,ucs-2-le,ucs-2-be,utf-16,utf-16-le,utf-16-be,euc-jp,euc-jp-3,euc-kr,big-5,hz,iso-2022-x11,ksc-5601-x11,gb-18030,gb-2312-x11,gb-2312,iso-2022-jp,iso-2022-jp-3,shift-jis,shift-jis-3,uhc,johab,unicode,bidi"
55 );
56 /**
57  * Vector all the build-in encodings.
58  */
59 const SStringVector&
builtin()60 SEncoder::builtin()
61 {
62   return _built_in;
63 }
64 
65 /**
66  * return all the external maps available
67  */
68 SStringVector
external()69 SEncoder::external()
70 {
71   SBinHashtable<int> mentioned;
72   for (unsigned int i=0; i<_built_in.size(); i++)
73   {
74     mentioned.put (_built_in[i], 1);
75   }
76   SStringVector ext = SUniMap::list();
77   SStringVector ret;
78   for (unsigned int j=0; j<ext.size(); j++)
79   {
80     if (mentioned.get(ext[j])!=0) continue;
81     mentioned.put (ext[j], 1);
82     ret.append (ext[j]);
83   }
84   return SStringVector(ret);
85 }
86 
87 /**
88  * Try to find the converter. Default is utf-8
89  * New SBEncoder sould  be added here.
90  */
91 void
load()92 SEncoder::load()
93 {
94   ok = true;
95   if (name == "utf-8")
96   {
97     delegate = new SB_UTF8(false);
98   }
99   else if (name == "utf-8-s")
100   {
101     delegate = new SB_UTF8(true); /* surrogate will be treated as normal char */
102   }
103   else if (name == "java")
104   {
105     delegate = new SB_Java(false);
106   }
107   else if (name == "java-s") /* surrogate will be treated as normal char */
108   {
109     delegate = new SB_Java(true);
110   }
111   else if (name == "ncr")
112   {
113     delegate = new SB_NCR();
114   }
115   else if (name == "utf-7")
116   {
117     delegate = new SB_UTF7();
118   }
119   else if (name == "gb-18030")
120   {
121     SB_GB18030* gb18030 = new SB_GB18030();
122     ok = gb18030->isOK();
123     delegate = gb18030;
124   }
125   else if (name == "big-5")
126   {
127     SB_BIG5* big_5 = new SB_BIG5();
128     ok = big_5->isOK();
129     delegate = big_5;
130   }
131   else if (name == "euc-jp")
132   {
133     SB_EUC_JP* euc_jp = new SB_EUC_JP();
134     ok = euc_jp->isOK();
135     delegate = euc_jp;
136   }
137   else if (name == "euc-jp-3")
138   {
139     SB_EUC_JP0213* euc_jp0213 = new SB_EUC_JP0213();
140     ok = euc_jp0213->isOK();
141     delegate = euc_jp0213;
142   }
143   else if (name == "euc-kr")
144   {
145     SB_EUC_KR* euc_kr = new SB_EUC_KR();
146     ok = euc_kr->isOK();
147     delegate = euc_kr;
148   }
149   else if (name == "uhc")
150   {
151     SB_UHC* uhc = new SB_UHC();
152     ok = uhc->isOK();
153     delegate = uhc;
154   }
155   else if (name == "ucs-2")
156   {
157     delegate = new SB_UCS2(SB_UCS2::AUTO_END, false);
158   }
159   /* I don't know why, it is all mixed up. workaround - mix them up */
160   else if (name == "ucs-2-be")
161   {
162     delegate = new SB_UCS2(SB_UCS2::LITTLE_END, false);
163   }
164   else if (name == "ucs-2-le")
165   {
166     delegate = new SB_UCS2(SB_UCS2::BIG_END, false);
167   }
168   else if (name == "utf-16")
169   {
170     delegate = new SB_UCS2(SB_UCS2::AUTO_END, true);
171   }
172   /* I don't know why, it is all mixed up. workaround - mix them up */
173   else if (name == "utf-16-be")
174   {
175     delegate = new SB_UCS2(SB_UCS2::LITTLE_END, true);
176   }
177   else if (name == "utf-16-le")
178   {
179     delegate = new SB_UCS2(SB_UCS2::BIG_END, true);
180   }
181   else if (name == "johab")
182   {
183     SB_Johab* johab = new SB_Johab();
184     ok = johab->isOK();
185     delegate = johab;
186   }
187   else if (name == "iso-2022-jp")
188   {
189     SB_ISO2022_JP* iso2022_jp = new SB_ISO2022_JP();
190     ok = iso2022_jp->isOK();
191     delegate = iso2022_jp;
192   }
193   else if (name == "iso-2022-jp-3")
194   {
195     SB_ISO2022_JP3* iso2022_jp3 = new SB_ISO2022_JP3();
196     ok = iso2022_jp3->isOK();
197     delegate = iso2022_jp3;
198   }
199   else if (name == "iso-2022-x11")
200   {
201     SB_X11_JP* x11_jp = new SB_X11_JP();
202     ok = x11_jp->isOK();
203     delegate = x11_jp;
204   }
205   else if (name == "shift-jis")
206   {
207     SB_S_JIS* s_jis = new SB_S_JIS();
208     ok = s_jis->isOK();
209     delegate = s_jis;
210   }
211   else if (name == "shift-jis-3")
212   {
213     SB_S_JIS0213* s_jis0213 = new SB_S_JIS0213();
214     ok = s_jis0213->isOK();
215     delegate = s_jis0213;
216   }
217   else if (name == "shift-jis-0213") /* alias to hide shift-jis-0213.my */
218   {
219     SB_S_JIS0213* s_jis0213 = new SB_S_JIS0213();
220     ok = s_jis0213->isOK();
221     delegate = s_jis0213;
222   }
223   else if (name == "gb-2312-x11")
224   {
225     SB_X11_HZ* x11_hz = new SB_X11_HZ();
226     ok = x11_hz->isOK();
227     delegate = x11_hz;
228   }
229   else if (name == "gb-2312")
230   {
231     SB_GB2312_8* gb_2312_8 = new SB_GB2312_8();
232     ok = gb_2312_8->isOK();
233     delegate = gb_2312_8;
234   }
235   else if (name == "ksc-5601-x11")
236   {
237     SB_X11_KSC* gb_x11_ksc = new SB_X11_KSC();
238     ok = gb_x11_ksc->isOK();
239     delegate = gb_x11_ksc;
240   }
241   else if (name == "hz")
242   {
243     SB_HZ* hz = new SB_HZ();
244     ok = hz->isOK();
245     delegate = hz;
246   }
247   else if (name == "unicode")
248   {
249     SB_UInput* uni = new SB_UInput();
250     ok = true;
251     delegate = uni;
252   }
253   else if (name == "deshape")
254   {
255     SB_DeShape* deshape = new SB_DeShape();
256     ok = deshape->isOK();
257     delegate = deshape;
258   }
259   else if (name == "bidi")
260   {
261     SB_BiDi* bidi = new SB_BiDi();
262     ok = bidi->isOK();
263     delegate = bidi;
264   }
265   else
266   {
267     SB_Generic* g = new SB_Generic(name);
268     ok = g->isOK();
269     if (ok)
270     {
271       delegate = g;
272     }
273     else
274     {
275       delete g;
276       delegate = new SB_UTF8(false);
277     }
278   }
279 }
280 
281 /**
282  * Create a utf-8 converter
283  */
SEncoder(void)284 SEncoder::SEncoder (void)
285 {
286   name = "utf-8";
287   ok = true;
288   load();
289 }
290 
291 /**
292  * return false if something is wrong with the map:
293  *  The map not found or similar
294  */
295 bool
isOK() const296 SEncoder::isOK () const
297 {
298   return ok;
299 }
300 
301 /**
302  * Create a converter with a name
303  * @param name is either a valid name
304  * or a map
305  */
SEncoder(const SString & _name)306 SEncoder::SEncoder (const SString& _name)
307 {
308   name = _name;
309   ok = true;
310   load ();
311 }
312 
/**
 * Create a converter that has the same name as another one.
 * Only the name is taken from c; a new delegate is loaded for it.
 * @param c is the converter to take the name from.
 */
SEncoder::SEncoder (const SEncoder& c)
{
  name = c.name;
  load ();
}
318 
319 SEncoder&
operator =(const SEncoder & c)320 SEncoder::operator = (const SEncoder& c)
321 {
322   if (this != &c)
323   {
324     delete ((SBEncoder*) delegate);
325     name = c.getName();
326     load ();
327     clear();
328   }
329   return *this;
330 }
331 
~SEncoder()332 SEncoder::~SEncoder ()
333 {
334   delete ((SBEncoder*) delegate);
335 }
336 
337 const SString&
getName() const338 SEncoder::getName() const
339 {
340   return name;
341 }
342 
343 /**
344  * This is encoding a unicode string into a bytestring
345  * @param input is a unicode string.
346  */
347 const SString&
encode(const SV_UCS4 & input)348 SEncoder::encode (const SV_UCS4& input)
349 {
350   return ((SBEncoder*) delegate)->encode (input);
351 }
352 
353 void
clear()354 SEncoder::clear()
355 {
356   buffer.clear();
357   delim.clear();
358   remaining.clear();
359   ((SBEncoder*) delegate)->clear();
360 }
361 /**
362  * Decode an input string into a unicode string.
363  * @param input is a string.
364  *   he output can be null, in this case a line is not
365  *   read fully. If input size is zero output will be flushed.
366  */
367 const SV_UCS4&
decode(const SString & input,bool more)368 SEncoder::decode (const SString& input, bool more)
369 {
370   if (delim.size() == 0 && input.size()!=0)
371   {
372     ((SBEncoder*) delegate)->delimiters(input);
373   }
374   buffer.append (input);
375   /**
376    * We need more input for the delimiter?
377    */
378   if (delim.size() != 0 && more)
379   {
380     unsigned int i;
381     /* there is a potential bug here - r n should be specified
382        in front of r or n  */
383     for (i=0; i<delim.size(); i++)
384     {
385       if (buffer.find (delim[i]) >= 0) break;
386     }
387     retUCS4.clear();
388     if (i==delim.size()) return retUCS4;
389   }
390   retUCS4 =  ((SBEncoder*) delegate)->decode (buffer);
391   SV_UCS4 additional;
392   if (!more)
393   {
394     additional = ((SBEncoder*) delegate)->decode("");
395     retUCS4.append (additional);
396 
397   }
398   buffer.clear();
399   return retUCS4;
400 }
401 
402 /**
403  * return key value map to see what decodes to what
404  * @param key will contain the keys
405  * @param value will contain the values
406  * @param _size is the maximum size of returned arrays
407  * @return the real size of the arrays.
408  */
409 unsigned int
getDecoderMap(SStringVector * key,SStringVector * value,unsigned int _size)410 SEncoder::getDecoderMap (SStringVector* key, SStringVector* value,
411         unsigned int _size)
412 {
413    return ((SBEncoder*) delegate)->getDecoderMap (key, value, _size);
414 }
415 
416 /* for non-clustering it is remainder */
417 SString
preEditBuffer() const418 SEncoder::preEditBuffer() const
419 {
420   SString rm = ((SBEncoder*) delegate)->preEditBuffer();
421   rm.append (buffer);
422   return SString(rm);
423 }
424  /* for clustering */
425 SV_UCS4
postEditBuffer() const426 SEncoder::postEditBuffer () const
427 {
428    return ((SBEncoder*) delegate)->postEditBuffer();
429 }
430 
431 /**
432  * These methods guess the line delimiters for the input
433  * The one without arguments is giving the 'first approximation'
434  * It returns an inclusive list of all possibilities.
435  */
436 const SStringVector&
delimiters()437 SEncoder::delimiters ()
438 {
439   return ((SBEncoder*) delegate)->delimiters();
440 }
441 
442 SObject*
clone() const443 SEncoder::clone() const
444 {
445   SEncoder* n = new SEncoder(name);
446   CHECK_NEW (n);
447   return n;
448 }
449